Bioinformatics tutorials: 2014

Exported from Notepad++
#!/usr/bin/perl

use strict;

#
# set to 1 for lots of debugging output
#
use constant DEBUG => 0;

#
#  minimum size of ORF to output
#
use constant MINSIZE => 20;

#
# hash to make complement sequence
#
my %compnuc = ( 'A' => 'T', 'T' => 'A', 'C' => 'G', 'G' => 'C',
        '-' => '-', ' ' => ' ', '*' => '*', '.' => '.');

#
# hash for translating DNA
#
my %DNAtoAA = (
        'GCT' => 'A', 'GCC' => 'A', 'GCA' => 'A', 'GCG' => 'A',
        'TGT' => 'C', 'TGC' => 'C', 'GAT' => 'D', 'GAC' => 'D',
        'GAA' => 'E', 'GAG' => 'E', 'TTT' => 'F', 'TTC' => 'F',
        'GGT' => 'G', 'GGC' => 'G', 'GGA' => 'G', 'GGG' => 'G',
        'CAT' => 'H', 'CAC' => 'H', 'ATT' => 'I', 'ATC' => 'I',
        'ATA' => 'I', 'AAA' => 'K', 'AAG' => 'K', 'TTG' => 'L',
        'TTA' => 'L', 'CTT' => 'L', 'CTC' => 'L', 'CTA' => 'L',
        'CTG' => 'L', 'ATG' => 'M', 'AAT' => 'N', 'AAC' => 'N',
        'CCT' => 'P', 'CCC' => 'P', 'CCA' => 'P', 'CCG' => 'P',
        'CAA' => 'Q', 'CAG' => 'Q', 'CGT' => 'R', 'CGC' => 'R',
        'CGA' => 'R', 'CGG' => 'R', 'AGA' => 'R', 'AGG' => 'R',
        'TCT' => 'S', 'TCC' => 'S', 'TCA' => 'S', 'TCG' => 'S',
        'AGT' => 'S', 'AGC' => 'S', 'ACT' => 'T', 'ACC' => 'T',
        'ACA' => 'T', 'ACG' => 'T', 'GTT' => 'V', 'GTC' => 'V',
        'GTA' => 'V', 'GTG' => 'V', 'TGG' => 'W', 'TAT' => 'Y',
        'TAC' => 'Y', 'TAA' => ':', 'TAG' => ':', 'TGA' => ':');

#
# program will open FASTA files end with *.fa
# 

print "Enter your file name with .fa extension\n";
my $file=<STDIN>;
chomp($file);

my $seq = "";
my $chr = $file;
$chr =~ s/\.fa//;
$chr =~ s/\.\///;

open (FILE, $file) || die "cannot open for reading $file: $!\n";

#
# reading FASTA formated files, read each line and append 
# it to the previous.
# 
# ignore the def line
#
# this assumes that each chromosome is in a file by itself
#   
    while (<FILE>) {
    chop;
    if ($_ !~ /^>/) {
        $seq .= $_;
    }
    }
    print STDERR "length of DNA is " . length($seq) . "\n";
    close (FILE);

#
# Make reverse complement
#

    my $revseq = "";
    my $revtmp = reverse $seq;

    for (my $j = 0; $j < length($revtmp); $j++) {
    $revseq .= $compnuc{ substr($revtmp, $j, 1) };
    }

#
# Make translations
#

    my $pept = "";
    my $start = 0;
    my $stop = 0;

    open (FILE, ">$chr.peptides.fa") || die "cannot open file $!\n";

#
# $i loop designates which strand, 0 = Watson and 1 = Crick
#
    for (my $i = 0; $i < 2; $i++) {
    if ($i == 1) {
        $seq = $revseq;
    }
#
# $j is for reading frame, 1, 2 or 3
#
    for (my $j = 0; $j < 3; $j++) {
        print STDERR "FRAME " . (($j + 1) + ($i * 3)) . " \n";
        if (DEBUG) { print STDERR "\n*"; }
#
# translation loop
# $x loop is for walking throught the framing frame looking for a peptide
#
        for (my $x = $j; $x < length($seq); $x += 3) {
        if (DEBUG) { print STDERR "= $x "; }
#
# Beginning of translation loop
# Only find peptides that start with ATG (methonine)
#
        if ( $DNAtoAA{ substr($seq, $x, 3) } eq "M" ) {
            $start = $x;
            $stop = $x + 3;
#
# Translate until a STOP is found (':') or the end of the sequence is
# reached $y loop is used after a start has been found, now we
# translate.  The loop starts with a stop is found ':' or $y is larger
# than the length of the sequence minus 3.  This is so we always have
# a full codon.
#

            for (my $y = $x; ($DNAtoAA{ substr($seq, $y, 3) } ne ":") 
                                && ($y < (length($seq) - 3)); $y += 3) {

            $pept .= $DNAtoAA{ substr($seq, $y, 3) };
            $stop = $y + 3;
            if (DEBUG) { print STDERR "\n+ " . (length($seq) - $start) . 
                        " -> " . (length($seq) - ($stop + 2)) . " = $pept"; }
            }
            $x = $stop;
#
# if the peptide is 20 amino acids or longer write it to a FASTA file with 80 char lines
#
            if (length($pept) > (MINSIZE - 1)) {
            my $peptcoord = $chr;
            $peptcoord =~ s/chr//;
            if ($i == 0) {
                $peptcoord .= ":" . ($start + 1) . ":" . ($stop + 3);
            } else {
                $peptcoord .= ":" . (length($seq) - $start) . ":" . (length($seq) - ($stop + 2));
            }
print FILE ">ORF|" . $peptcoord . "| frame " . (($j + 1) + ($i * 3)). "\n";
#
# output sequence in 80 char lines, while processes all sequences larger than 80 characters
#  in length
#
            while (length($pept) > 80) {
                print FILE substr($pept, 0, 80) . "\n";
                $pept = substr($pept, 80, length($pept));
                if (DEBUG) { print STDERR "-"; }
            }
#
# print last bit of $pept
#
            print FILE $pept . "\n";
            if (DEBUG) { print STDERR "."; }
            }
            $pept = "";
        }
        }
    }
    }

#
# Close chromosome sequence file
#
    close (FILE); 

print STDERR "\n\n";

exit;
Bioinformatics tutorials

Thursday, November 6, 2014

What is principal component analysis?

Tuesday, July 22, 2014

You’re not allowed bioinformatics anymore

Monday, March 31, 2014

Perl Program to find Open Reading Frames (ORFs) in DNA sequence and then converting into protein

Thursday, February 27, 2014

Confused About Distribution....

Tuesday, January 28, 2014

pbc wrap command to put protein in centre of the box in VMD

About Me