Thursday, November 28, 2013

awk command examples

For a bioinformatician pattern searching is everyday routine... And I feel one should use the power of shell oneline commands which are easy and make life easier... So here I am giving some basic examples of awk's commands which are simple and can be used very frequently...

Print only next line after the pattern matched...

awk '/pattern/{getline; print}' filename

This command first match the pattern and then print next line... this command can also be extended to print specific line after pattern match... for example if one want to print third line after pattern match just write...

awk -v lines=3 '/pattern/ {for(i=lines;i;--i)getline; print}' filename

If one one want to extract only one column from that line then simply add column index after print for example to print second column of next line after pattern match... just write the command...

awk '/pattern/{getline; print $2}' filename

$0 to print complete line
$1 to print first column
$2 to print second column and so on

If one want to calculate sum or average of a particular column then one command can do all for you...
for example one want to calculate sum or average of second column then write...

awk '{sum+=$2} END { print "Sum = ",sum}' filename
awk '{sum+=$2} END { print "Average = ",sum/NR}' filename


Friday, June 28, 2013

Trajectory file conversion using wordom


wordom -conv -imol mymolecule.pdb -omol mymolecule.crd

wordom -conv -imol mymolecule.pdb -sele "/A/@(20-100)/*" -omol mymolecule.crd

wordom -conv -itrj mytrj.dcd -imol mymolecule.pdb -otrj mytrj.xtc

wordom -conv -itrj mytrj.xtc -otrj mytrj.dcd

Wednesday, June 12, 2013

MD simulation of LJ Particles (Ar) in C

Exported from Notepad++
#include <stdio.h> #include <stdlib.h> #include <math.h> // input parameters int np = 500; // number of Ar particles double density = 0.75; // density of system double temp = 1.0; // desired temperature double dt = 0.001; // time step double numstep = 1000; // number of steps for MD // constants and variables double r[500][3]; // positions double v[500][3]; // velocities double f[500][3]; // forces double L, t, temperature; // box length double ke, pe, total; // kinetic energy, potential energy, total energy int nc; // number of cells in each direction (cubic box) // functions prototype void initialize(); void calcforce(); void update(); int main () { int p, i; FILE *fh; fh = fopen("sim-box.xyz", "w"); initialize(); // writing initial configuration to file fprintf(fh, "%d\n\n", np); for (i = 0; i < np; i++) { fprintf(fh, "%s\t%f\t%f\t%f\n","Ar", r[i][0], r[i][1], r[i][2]); } calcforce(); t = 0; for (t =0; t < numstep; t++) { update(); // writing configuration at each step to file for (i = 0; i < np; i++) { fprintf(fh, "%s\t%f\t%f\t%f\n","Ar", r[i][0], r[i][1], r[i][2]); } ke = 0; for (p = 0; p < np; p++) { for (i = 0; i < 3; i++) { ke += 0.5 * v[p][i] * v[p][i]; } } total = ke + pe; printf("pe = %f ke = %f Total Energy = %f\n",pe, ke, total); } fclose(fh); } void initialize () { int i, j, k, m, n, p, nc, IX, IY, IZ, IREF, M; double L, CELL, CELL2, sumv2, sumv3, df, fs; double Sum[3] = {0.0, 0.0, 0.0}; // compute length of box L = powf( np / density, 1.0 / 3.0 ); nc = powf( np / 4, 1.0 / 3.0 ); /* CALCULATE THE SIDE OF THE UNIT CELL */ CELL = L / nc; CELL2 = 0.5 * CELL; /* BUILD THE UNIT CELL */ /* SUBLATTICE A */ r[0][0] = 0.0; r[0][1] = 0.0; r[0][2] = 0.0; /* SUBLATTICE B */ r[1][0] = 0.0; r[1][1] = CELL2; r[1][2] = CELL2; /* SUBLATTICE C */ r[2][0] = CELL2; r[2][1] = 0.0; r[2][2] = CELL2; /* SUBLATTICE D */ r[3][0] = CELL2; r[3][1] = CELL2; r[3][2] = 0.0; /* CONSTRUCT THE LATTICE FROM THE UNIT CELL */ M = 0; for( IZ=0; IZ<5; IZ++ ) for( IY=0; IY<5; IY++ ) for( IX=0; IX<5; IX++ ){ for( IREF=0; IREF<4; IREF++ ){ r[IREF+M][0] = r[IREF][0] + CELL * IX ; r[IREF+M][1] = r[IREF][1] + CELL * IY ; r[IREF+M][2] = r[IREF][2] + CELL * IZ ; } M = M + 4; } // random distributed initial velocities for (p = 0; p < np; p++) for (i = 0; i < 3; i++) { v[p][i] = (double)rand()/RAND_MAX - 0.5; Sum[i] += v[p][i]; } // with zero total momentum for (p = 0; p < np; p++) for (i = 0; i < 3; i++) v[p][i] -= Sum[i] / np; // rescaled to desired temperature for (p = 0; p < np; p++) for (i = 0; i < 3; i++) sumv2 += v[p][i] * v[p][i]; df = 3 * np - 3; fs = sqrt(temp * df / sumv2); printf("temperature before scaling= %f\n", sumv2/df); sumv3 = 0.0; for (p = 0; p < np; p++) for (i = 0; i < 3; i++) { v[p][i] *= fs; sumv3 += v[p][i] * v[p][i]; } printf("temperature after scaling= %f\n", sumv3/df); } void calcforce () { int i, j, k, sign; double r2, rij[3], mag, L; double r2i, r6i, r12i, ff, fij[3]; for (i = 0; i < np; i++) for (k = 0; k < 3; k++) f[i][k] = 0.0; pe = 0.0; L = powf(np / density, 1.0 / 3.0 ); // loop over all pairs of particles for (i = 0; i < np - 1; i++) { for (j = i + 1; j < np; j++) { r2 = 0.0; for (k = 0; k < 3; k++) { rij[k] = r[i][k] - r[j][k]; sign = rij[k] > 0.0 ? 1 : -1; mag = rij[k] * sign; if (mag > L / 2) // use nearest image rij[k] -= L * sign; r2 += rij[k] * rij[k]; } r2i = 1 / r2; r6i = r2i * r2i * r2i; r12i = r6i * r6i; ff = r2i * (48 * r12i - 24 * r6i); for (k = 0; k < 3; k++) { fij[k] = rij[k] * ff; f[i][k] += fij[k]; f[j][k] -= fij[k]; } pe += 4 * (r12i - r6i); } } } void update () { int p, k; // compute length of box L = powf( np / density, 1.0 / 3.0 ); for (p = 0; p < np; p++) for (k = 0; k < 3; k++) { r[p][k] += v[p][k] * dt + 0.5 * f[p][k] * dt * dt; // impose periodic boundary conditions if (r[p][k] < 0.0) r[p][k] += L; if (r[p][k] >= L) r[p][k] -= L; v[p][k] += 0.5 * f[p][k] * dt; } calcforce(); for (p = 0; p < np; p++) for (k = 0; k < 3; k++) v[p][k] += 0.5 * f[p][k] * dt; }

Sunday, May 12, 2013

Some important linux commands, a bioinformatician must know

Exported from Notepad++
### Some important linux commands, a bioinformatician must know ### head # to see first few lines of a file tail # to see last few lines of a file grep # to search file(s) for lines that match a pattern cut # to get few character columns of a file awk # I use this to extract columns of a tabular file sort # sort files with number or text uniq # uniquify files wc # print byte, word, and line count tr # translate, sqeeze, and/or delete character fold # to make all lines with fixed width like fatsa format ### for more info use man command ###

Sunday, April 7, 2013

Saturday, April 6, 2013

Python code to download list of pdb files

Exported from Notepad++
#!/usr/bin/python """ This simple script can be used to download pdb files directly from internet usage: python getpdb.py <pdbids> example: python getpdb.py 3ZBQ 4HXP 4HXM 4B8F """ import urllib url = "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=pdb&compression=NO&structureId=" from sys import argv for i in argv[1:]: pdbid = url+str(i) open( i+".pdb", "w" ).write( urllib.urlopen(pdbid).read() ) print i+".pdb"

Python class for basic operations dealing with DNA sequence

Exported from Notepad++
class DNA: """Class representing DNA as a string sequence.""" basecomplement = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C'} codon2aa = {"TTT":"F","TTC":"F","TTA":"L","TTG":"L","TCT":"S","TCC":"S", "TCA":"S","TCG":"S","TAT":"Y","TAC":"Y","TAA":"*","TAG":"*", "TGT":"C","TGC":"C","TGA":"*","TGG":"W","CTT":"L","CTC":"L", "CTA":"L","CTG":"L","CCT":"P","CCC":"P","CCA":"P","CCG":"P", "CAT":"H","CAC":"H","CAA":"Q","CAG":"Q","CGT":"R","CGC":"R", "CGA":"R","CGG":"R","ATT":"I","ATC":"I","ATA":"I","ATG":"M", "ACT":"T","ACC":"T","ACA":"T","ACG":"T","AAT":"N","AAC":"N", "AAA":"K","AAG":"K","AGT":"S","AGC":"S","AGA":"R","AGG":"R", "GTT":"V","GTC":"V","GTA":"V","GTG":"V","GCT":"A","GCC":"A", "GCA":"A","GCG":"A","GAT":"D","GAC":"D","GAA":"E","GAG":"E", "GGT":"G","GGC":"G","GGA":"G","GGG":"G"} def __init__(self, s): """Create DNA instance initialized to string s.""" self.seq = s def transcribe(self): """Return as rna string.""" return self.seq.replace('T', 'U') def reverse(self): """Return dna string in reverse order.""" letters = list(self.seq) letters.reverse() return ''.join(letters) def complement(self): """Return the complementary dna string.""" letters = list(self.seq) letters = [self.basecomplement[base] for base in letters] return ''.join(letters) def reversecomplement(self): """Return the reverse of complement of the dna string.""" letters = list(self.seq) letters.reverse() letters = [self.basecomplement[base] for base in letters] return ''.join(letters) def gc(self): """Return the % of dna composed of G+C.""" s = self.seq gc = s.count('G') + s.count('C') return gc * 100.0 / len(s) def codons(self): """Return list of codons for the dna string,""" s = self.seq end = len(s) - (len(s) % 3) - 1 codons = [s[i:i+3] for i in range(0, end, 3)] return codons def translate(self): """Return amino acid sequence translating dna seq.""" s = self.seq codons = self.codons() aa = [self.codon2aa[aa] for aa in codons] return ''.join(aa) """ How to use this code... save this code in a file name as bio.py then open python terminal and use this as a module for example... >>> from bio import * >>> S = DNA('ATGTGCGTGCTC') >>> S <bio.DNA instance at 0x02A09B48> >>> S.translate() 'MCVL' >>> S.transcribe() 'AUGUGCGUGCUC' >>> S.reverse() 'CTCGTGCGTGTA' >>> S.complement() 'TACACGCACGAG' >>> S.codons() ['ATG', 'TGC', 'GTG', 'CTC'] >>> """

RNA to protein conversion (translation) using perl

Exported from Notepad++
# This is my first post starting with very basic code... print “Enter the RNA sequence: ; $rna = <>; chomp($rna); $rna =~s/[^acgu]//ig; my $rna = uc($rna); my(%genetic_code) = ( ‘GGA’ => ‘G’, ‘GGC’ => ‘G’, ‘GGG’ => ‘G’, ‘GGU’ => ‘G’, # Glycine ‘GCA’ => ‘A’, ‘GCC’ => ‘A’, ‘GCG’ => ‘A’, ‘GCU’ => ‘A’, # Alanine ‘GUA’ => ‘V’, ‘GUC’ => ‘V’, ‘GUG’ => ‘V’, ‘GUU’ => ‘V’, # Valine ‘CCA’ => ‘P’, ‘CCC’ => ‘P’, ‘CCG’ => ‘P’, ‘CCU’ => ‘P’, # Proline ‘UCA’ => ‘S’, ‘UCC’ => ‘S’, ‘UCG’ => ‘S’, ‘UCU’ => ‘S’, # Serine ‘CUA’ => ‘L’, ‘CUC’ => ‘L’, ‘CUG’ => ‘L’, ‘CUU’ => ‘L’, # Leucine ‘CGA’ => ‘R’, ‘CGC’ => ‘R’, ‘CGG’ => ‘R’, ‘CGU’ => ‘R’, # Arginine ‘ACA’ => ‘T’, ‘ACC’ => ‘T’, ‘ACG’ => ‘T’, ‘ACU’ => ‘T’, # Threonine ‘AUA’ => ‘I’, ‘AUC’ => ‘I’, ‘AUU’ => ‘I’, # Isoleucine ‘UAA’ => ‘_’, ‘UAG’ => ‘_’, ‘UGA’ => ‘_’, # Stop ‘AAA’ => ‘K’, ‘AAG’ => ‘K’, # Lysine ‘AGC’ => ‘S’, ‘AGU’ => ‘S’, # Serine ‘UUA’ => ‘L’, ‘UUG’ => ‘L’, # Leucine ‘AGA’ => ‘R’, ‘AGG’ => ‘R’, # Arginine ‘UAC’ => ‘Y’, ‘UAU’ => ‘Y’, # Tyrosine ‘UGC’ => ‘C’, ‘UGU’ => ‘C’, # Cysteine ‘CAA’ => ‘Q’, ‘CAG’ => ‘Q’, # Glutamine ‘AAC’ => ‘N’, ‘AAU’ => ‘N’, # Asparagine ‘UUC’ => ‘F’, ‘UUU’ => ‘F’, # Phenylalanine ‘GAC’ => ‘D’, ‘GAU’ => ‘D’, # Aspartic Acid ‘GAA’ => ‘E’, ‘GAG’ => ‘E’, # Glutamic Acid ‘UGG’ => ‘W’, # Tryptophan ‘CAC’ => ‘H’, # Histidine ‘AUG’ => ‘M’, # Methionine ‘CAU’ => ‘H’, # Histidine ); my ($protein) = “”; for(my $i=0;$i<length($rna)-2;$i+=3) { $codon = substr($rna,$i,3); $protein .= $genetic_code{$codon}; } print “Translated protein sequence is $protein”; #This program can also used for six reading frame, by changing the three #character shift in forward and reverse of the RNA sequence.