Bioinformatics tutorials: Perl Program to find Open Reading Frames (ORFs) in DNA sequence and then converting into protein

Exported from Notepad++
#!/usr/bin/perl

use strict;

#
# set to 1 for lots of debugging output
#
use constant DEBUG => 0;

#
#  minimum size of ORF to output
#
use constant MINSIZE => 20;

#
# hash to make complement sequence
#
my %compnuc = ( 'A' => 'T', 'T' => 'A', 'C' => 'G', 'G' => 'C',
        '-' => '-', ' ' => ' ', '*' => '*', '.' => '.');

#
# hash for translating DNA
#
my %DNAtoAA = (
        'GCT' => 'A', 'GCC' => 'A', 'GCA' => 'A', 'GCG' => 'A',
        'TGT' => 'C', 'TGC' => 'C', 'GAT' => 'D', 'GAC' => 'D',
        'GAA' => 'E', 'GAG' => 'E', 'TTT' => 'F', 'TTC' => 'F',
        'GGT' => 'G', 'GGC' => 'G', 'GGA' => 'G', 'GGG' => 'G',
        'CAT' => 'H', 'CAC' => 'H', 'ATT' => 'I', 'ATC' => 'I',
        'ATA' => 'I', 'AAA' => 'K', 'AAG' => 'K', 'TTG' => 'L',
        'TTA' => 'L', 'CTT' => 'L', 'CTC' => 'L', 'CTA' => 'L',
        'CTG' => 'L', 'ATG' => 'M', 'AAT' => 'N', 'AAC' => 'N',
        'CCT' => 'P', 'CCC' => 'P', 'CCA' => 'P', 'CCG' => 'P',
        'CAA' => 'Q', 'CAG' => 'Q', 'CGT' => 'R', 'CGC' => 'R',
        'CGA' => 'R', 'CGG' => 'R', 'AGA' => 'R', 'AGG' => 'R',
        'TCT' => 'S', 'TCC' => 'S', 'TCA' => 'S', 'TCG' => 'S',
        'AGT' => 'S', 'AGC' => 'S', 'ACT' => 'T', 'ACC' => 'T',
        'ACA' => 'T', 'ACG' => 'T', 'GTT' => 'V', 'GTC' => 'V',
        'GTA' => 'V', 'GTG' => 'V', 'TGG' => 'W', 'TAT' => 'Y',
        'TAC' => 'Y', 'TAA' => ':', 'TAG' => ':', 'TGA' => ':');

#
# program will open FASTA files end with *.fa
# 

print "Enter your file name with .fa extension\n";
my $file=<STDIN>;
chomp($file);

my $seq = "";
my $chr = $file;
$chr =~ s/\.fa//;
$chr =~ s/\.\///;

open (FILE, $file) || die "cannot open for reading $file: $!\n";

#
# reading FASTA formated files, read each line and append 
# it to the previous.
# 
# ignore the def line
#
# this assumes that each chromosome is in a file by itself
#   
    while (<FILE>) {
    chop;
    if ($_ !~ /^>/) {
        $seq .= $_;
    }
    }
    print STDERR "length of DNA is " . length($seq) . "\n";
    close (FILE);

#
# Make reverse complement
#

    my $revseq = "";
    my $revtmp = reverse $seq;

    for (my $j = 0; $j < length($revtmp); $j++) {
    $revseq .= $compnuc{ substr($revtmp, $j, 1) };
    }

#
# Make translations
#

    my $pept = "";
    my $start = 0;
    my $stop = 0;

    open (FILE, ">$chr.peptides.fa") || die "cannot open file $!\n";

#
# $i loop designates which strand, 0 = Watson and 1 = Crick
#
    for (my $i = 0; $i < 2; $i++) {
    if ($i == 1) {
        $seq = $revseq;
    }
#
# $j is for reading frame, 1, 2 or 3
#
    for (my $j = 0; $j < 3; $j++) {
        print STDERR "FRAME " . (($j + 1) + ($i * 3)) . " \n";
        if (DEBUG) { print STDERR "\n*"; }
#
# translation loop
# $x loop is for walking throught the framing frame looking for a peptide
#
        for (my $x = $j; $x < length($seq); $x += 3) {
        if (DEBUG) { print STDERR "= $x "; }
#
# Beginning of translation loop
# Only find peptides that start with ATG (methonine)
#
        if ( $DNAtoAA{ substr($seq, $x, 3) } eq "M" ) {
            $start = $x;
            $stop = $x + 3;
#
# Translate until a STOP is found (':') or the end of the sequence is
# reached $y loop is used after a start has been found, now we
# translate.  The loop starts with a stop is found ':' or $y is larger
# than the length of the sequence minus 3.  This is so we always have
# a full codon.
#

            for (my $y = $x; ($DNAtoAA{ substr($seq, $y, 3) } ne ":") 
                                && ($y < (length($seq) - 3)); $y += 3) {

            $pept .= $DNAtoAA{ substr($seq, $y, 3) };
            $stop = $y + 3;
            if (DEBUG) { print STDERR "\n+ " . (length($seq) - $start) . 
                        " -> " . (length($seq) - ($stop + 2)) . " = $pept"; }
            }
            $x = $stop;
#
# if the peptide is 20 amino acids or longer write it to a FASTA file with 80 char lines
#
            if (length($pept) > (MINSIZE - 1)) {
            my $peptcoord = $chr;
            $peptcoord =~ s/chr//;
            if ($i == 0) {
                $peptcoord .= ":" . ($start + 1) . ":" . ($stop + 3);
            } else {
                $peptcoord .= ":" . (length($seq) - $start) . ":" . (length($seq) - ($stop + 2));
            }
print FILE ">ORF|" . $peptcoord . "| frame " . (($j + 1) + ($i * 3)). "\n";
#
# output sequence in 80 char lines, while processes all sequences larger than 80 characters
#  in length
#
            while (length($pept) > 80) {
                print FILE substr($pept, 0, 80) . "\n";
                $pept = substr($pept, 80, length($pept));
                if (DEBUG) { print STDERR "-"; }
            }
#
# print last bit of $pept
#
            print FILE $pept . "\n";
            if (DEBUG) { print STDERR "."; }
            }
            $pept = "";
        }
        }
    }
    }

#
# Close chromosome sequence file
#
    close (FILE); 

print STDERR "\n\n";

exit;
Bioinformatics tutorials

Monday, March 31, 2014

Perl Program to find Open Reading Frames (ORFs) in DNA sequence and then converting into protein

No comments:

Post a Comment

About Me