Monday, March 31, 2014

Perl Program to find Open Reading Frames (ORFs) in DNA sequence and then converting into protein


Exported from Notepad++
#!/usr/bin/perl use strict; # # set to 1 for lots of debugging output # use constant DEBUG => 0; # # minimum size of ORF to output # use constant MINSIZE => 20; # # hash to make complement sequence # my %compnuc = ( 'A' => 'T', 'T' => 'A', 'C' => 'G', 'G' => 'C', '-' => '-', ' ' => ' ', '*' => '*', '.' => '.'); # # hash for translating DNA # my %DNAtoAA = ( 'GCT' => 'A', 'GCC' => 'A', 'GCA' => 'A', 'GCG' => 'A', 'TGT' => 'C', 'TGC' => 'C', 'GAT' => 'D', 'GAC' => 'D', 'GAA' => 'E', 'GAG' => 'E', 'TTT' => 'F', 'TTC' => 'F', 'GGT' => 'G', 'GGC' => 'G', 'GGA' => 'G', 'GGG' => 'G', 'CAT' => 'H', 'CAC' => 'H', 'ATT' => 'I', 'ATC' => 'I', 'ATA' => 'I', 'AAA' => 'K', 'AAG' => 'K', 'TTG' => 'L', 'TTA' => 'L', 'CTT' => 'L', 'CTC' => 'L', 'CTA' => 'L', 'CTG' => 'L', 'ATG' => 'M', 'AAT' => 'N', 'AAC' => 'N', 'CCT' => 'P', 'CCC' => 'P', 'CCA' => 'P', 'CCG' => 'P', 'CAA' => 'Q', 'CAG' => 'Q', 'CGT' => 'R', 'CGC' => 'R', 'CGA' => 'R', 'CGG' => 'R', 'AGA' => 'R', 'AGG' => 'R', 'TCT' => 'S', 'TCC' => 'S', 'TCA' => 'S', 'TCG' => 'S', 'AGT' => 'S', 'AGC' => 'S', 'ACT' => 'T', 'ACC' => 'T', 'ACA' => 'T', 'ACG' => 'T', 'GTT' => 'V', 'GTC' => 'V', 'GTA' => 'V', 'GTG' => 'V', 'TGG' => 'W', 'TAT' => 'Y', 'TAC' => 'Y', 'TAA' => ':', 'TAG' => ':', 'TGA' => ':'); # # program will open FASTA files end with *.fa # print "Enter your file name with .fa extension\n"; my $file=<STDIN>; chomp($file); my $seq = ""; my $chr = $file; $chr =~ s/\.fa//; $chr =~ s/\.\///; open (FILE, $file) || die "cannot open for reading $file: $!\n"; # # reading FASTA formated files, read each line and append # it to the previous. # # ignore the def line # # this assumes that each chromosome is in a file by itself # while (<FILE>) { chop; if ($_ !~ /^>/) { $seq .= $_; } } print STDERR "length of DNA is " . length($seq) . "\n"; close (FILE); # # Make reverse complement # my $revseq = ""; my $revtmp = reverse $seq; for (my $j = 0; $j < length($revtmp); $j++) { $revseq .= $compnuc{ substr($revtmp, $j, 1) }; } # # Make translations # my $pept = ""; my $start = 0; my $stop = 0; open (FILE, ">$chr.peptides.fa") || die "cannot open file $!\n"; # # $i loop designates which strand, 0 = Watson and 1 = Crick # for (my $i = 0; $i < 2; $i++) { if ($i == 1) { $seq = $revseq; } # # $j is for reading frame, 1, 2 or 3 # for (my $j = 0; $j < 3; $j++) { print STDERR "FRAME " . (($j + 1) + ($i * 3)) . " \n"; if (DEBUG) { print STDERR "\n*"; } # # translation loop # $x loop is for walking throught the framing frame looking for a peptide # for (my $x = $j; $x < length($seq); $x += 3) { if (DEBUG) { print STDERR "= $x "; } # # Beginning of translation loop # Only find peptides that start with ATG (methonine) # if ( $DNAtoAA{ substr($seq, $x, 3) } eq "M" ) { $start = $x; $stop = $x + 3; # # Translate until a STOP is found (':') or the end of the sequence is # reached $y loop is used after a start has been found, now we # translate. The loop starts with a stop is found ':' or $y is larger # than the length of the sequence minus 3. This is so we always have # a full codon. # for (my $y = $x; ($DNAtoAA{ substr($seq, $y, 3) } ne ":") && ($y < (length($seq) - 3)); $y += 3) { $pept .= $DNAtoAA{ substr($seq, $y, 3) }; $stop = $y + 3; if (DEBUG) { print STDERR "\n+ " . (length($seq) - $start) . " -> " . (length($seq) - ($stop + 2)) . " = $pept"; } } $x = $stop; # # if the peptide is 20 amino acids or longer write it to a FASTA file with 80 char lines # if (length($pept) > (MINSIZE - 1)) { my $peptcoord = $chr; $peptcoord =~ s/chr//; if ($i == 0) { $peptcoord .= ":" . ($start + 1) . ":" . ($stop + 3); } else { $peptcoord .= ":" . (length($seq) - $start) . ":" . (length($seq) - ($stop + 2)); } print FILE ">ORF|" . $peptcoord . "| frame " . (($j + 1) + ($i * 3)). "\n"; # # output sequence in 80 char lines, while processes all sequences larger than 80 characters # in length # while (length($pept) > 80) { print FILE substr($pept, 0, 80) . "\n"; $pept = substr($pept, 80, length($pept)); if (DEBUG) { print STDERR "-"; } } # # print last bit of $pept # print FILE $pept . "\n"; if (DEBUG) { print STDERR "."; } } $pept = ""; } } } } # # Close chromosome sequence file # close (FILE); print STDERR "\n\n"; exit;

No comments:

Post a Comment