Question: Perl Parser To Extract Data From Pdb Files
0
gravatar for johnprakash063
4.8 years ago by
johnprakash0630 wrote:
# coordExtract.pl
#     # parser that allows editing a pdb
# file to extract the coordinates

my @pdbFiles = glob("*.pdb");
foreach my $pdbFile (@pdbFiles){

#count for all amino acids existing in the protein
$count_of_alanine=0;
$count_of_arginine=0;  
$count_of_asparagine=0;
$count_of_aspartic_acid=0;
$count_of_cysteine=0;
$count_of_glutamic_acid=0;
$count_of_glutamine=0;
$count_of_glycine=0;
$count_of_histidine=0;
$count_of_isoleucine=0;
$count_of_leucine=0;
$count_of_lysine=0;
$count_of_methionine=0; 
$count_of_phenylalanine=0;
$count_of_proline=0;
$count_of_serine=0;
$count_of_threonine=0;
$count_of_tryptophan=0;
$count_of_tyrosine=0;
$count_of_valine=0;
#count for all amino acids which hav z coordinate between -15 and 15
$count_of_alanine2=0;
$count_of_arginine2=0;
$count_of_asparagine2=0;
$count_of_aspartic_acid2=0;
$count_of_cysteine2=0;
$count_of_glutamic_acid2=0;
$count_of_glutamine2=0;
$count_of_glycine2=0;
$count_of_histidine2=0;
$count_of_isoleucine2=0;
$count_of_leucine2=0;
$count_of_lysine2=0;
$count_of_methionine2=0;
$count_of_phenylalanine2=0;
$count_of_proline2=0;
$count_of_serine2=0;
$count_of_threonine2=0;
$count_of_tryptophan2=0;
$count_of_tyrosine2=0;
$count_of_valine2=0;
#count for groups of amino acids
$count_of_charged=0;
$count_of_polar=0;
$count_of_aromatic=0;
$count_of_hydrophobic=0;
$count_of_charged2=0;
$count_of_polar2=0;
$count_of_aromatic2=0;
$count_of_hydrophobic2=0;


unless (open(INPUTFILE, $pdbFile)) {
    print "Cannot read from '$pdbFile'.\nProgram closing.\n";
    <STDIN>;
    #exit;
}

# load the file into an array
chomp(@dataArray = <INPUTFILE>);

# close the file
close(INPUTFILE);


###############
# LET'S WORK! #
###############

# parse the input file saving only backbone atoms coordinates
# format: [string "ATOM"] [number] [atom] [aa] whateva [3 decimal numbers] whateva with two dots in between
for ($line = 0; $line < scalar @dataArray; $line++) {
if ($dataArray[$line]=~/^HEADER\s+(.*?)$/) {
$header = $1;    
     }
if ($dataArray[$line]=~/^TITLE\s+(.*?)$/) {
$parsing{$line} = $1;
     }
if ($dataArray[$line] =~ m/ATOM\s+(\d+)\s+(\w+)\s+(\w{3})\s+.+\s+(\S+\.\S+)\s+(\S+\.\S+)\s+(\S+\.\S+)\s+.+\..+\..+/ig) {


    if (($2 eq "N" || $2 eq "CA" || $2 eq "C") && ($6 >= -15 && $6 <= 15) && (($3 eq "ARG") || ($3 eq "ASP") || ($3 eq "GLU") || ($3 eq "LYS"))) {
        $parsedData7{$line} = $1."\t\t".$3."\t\t".$4."\t\t".$5."\t\t".$6;
}

    if ($2 eq "N" || $2 eq "CA" || $2 eq "C") {
        $parsedData{$line} = $1."\t\t".$3."\t\t".$4."\t\t".$5."\t\t".$6;

if($3 eq "ALA"){
$count_of_alanine++;
}
if($3 eq "ARG"){
$count_of_arginine++;
}
if($3 eq "ASN"){
$count_of_asparagine++;
}
if($3 eq "ASP"){
$count_of_aspartic_acid++;
}
if($3 eq "CYS"){
$count_of_cysteine++;
}
if($3 eq "GLU"){
$count_of_glutamic_acid++;
}
if($3 eq "GLN"){
$count_of_glutamine++;
}
if($3 eq "GLY"){
$count_of_glycine++;
}
if($3 eq "HIS"){
$count_of_histidine++;
}
if($3 eq "ILE"){
$count_of_isoleucine++;
}
if($3 eq "LEU"){
$count_of_leucine++;
}
if($3 eq "LYS"){
$count_of_lysine++;
}
if($3 eq "MET"){
$count_of_methionine++;
}
if($3 eq "PHE"){
$count_of_phenylalanine++;
}
if($3 eq "PRO"){
$count_of_proline++;
}
if($3 eq "SER"){
$count_of_serine++;
}
if($3 eq "THR"){
$count_of_threonine++;
}
if($3 eq "TRP"){
$count_of_tryptophan++;
}
if($3 eq "TYR"){
$count_of_tyrosine++;
}
if($3 eq "VAL"){
$count_of_valine++;
}
if($3 eq "ARG"||$3 eq "ASP"||$3 eq "GLU"||$3 eq "LYS"){
$count_of_charged++;
}
if($3 eq "ASN"||$3 eq "GLN"||$3 eq "GLY"||$3 eq "MET"||$3 eq "PRO"){
$count_of_polar++;
}
if($3 eq "PHE"||$3 eq "TRP"||$3 eq "TYR"||$3 eq "HIS"){
$count_of_aromatic++;
}
if($3 eq "ALA"||$3 eq "ILE"||$3 eq "LEU"||$3 eq "VAL"){
$count_of_hydrophobic++;
}
}
if (($2 eq "N" || $2 eq "CA" || $2 eq "C") && ($6 >= -15 && $6 <= 15)) {
        $parsedData2{$line} = $1."\t\t".$3."\t\t".$4."\t\t".$5."\t\t".$6;

if($3 eq "ALA"){
$count_of_alanine2++;
}
if($3 eq "ARG"){
$count_of_arginine2++;
}
if($3 eq "ASN"){
$count_of_asparagine2++;
}
if($3 eq "ASP"){
$count_of_aspartic_acid2++;
}
if($3 eq "CYS"){
$count_of_cysteine2++;
}
if($3 eq "GLU"){
$count_of_glutamic_acid2++;
}
if($3 eq "GLN"){
$count_of_glutamine2++;
}
if($3 eq "GLY"){
$count_of_glycine2++;
}
if($3 eq "HIS"){
$count_of_histidine2++;
}
if($3 eq "ILE"){
$count_of_isoleucine2++;
}
if($3 eq "LEU"){
$count_of_leucine2++;
}
if($3 eq "LYS"){
$count_of_lysine2++;
}
if($3 eq "MET"){
$count_of_methionine2++;
}
if($3 eq "PHE"){
$count_of_phenylalanine2++;
}
if($3 eq "PRO"){
$count_of_proline2++;
}
if($3 eq "SER"){
$count_of_serine2++;
}
if($3 eq "THR"){
$count_of_threonine2++;
}
if($3 eq "TRP"){
$count_of_tryptophan2++;
}
if($3 eq "TYR"){
$count_of_tyrosine2++;
}
if($3 eq "VAL"){
$count_of_valine2++;
}
if($3 eq "ARG"||$3 eq "ASP"||$3 eq "GLU"||$3 eq "LYS"){
$count_of_charged2++;
}
if($3 eq "ASN"||$3 eq "GLN"||$3 eq "GLY"||$3 eq "MET"||$3 eq "PRO"){
$count_of_polar2++;
}
if($3 eq "PHE"||$3 eq "TRP"||$3 eq "TYR"||$3 eq "HIS"){
$count_of_aromatic2++;
}
if($3 eq "ALA"||$3 eq "ILE"||$3 eq "LEU"||$3 eq "VAL"){
$count_of_hydrophobic2++;
}
}
}
}


# create the output file name
$outputFile = "coordinates_".$pdbFile;

# open the output file
open (OUTFILE, ">$outputFile");


# print the data lines
print OUTFILE $header, "\n";
foreach $line (sort {$a <=> $b} keys %parsing) {
print OUTFILE $parsing{$line}."\n";
}
print OUTFILE $title, "\n";
print OUTFILE "-----------------------------------------------------------------------\n";
print OUTFILE "------------ALL BACKBONE AMINO ACIDS IN THE MEMBRANE PROTEIN-----------\n";
print OUTFILE "-----------------------------------------------------------------------\n";
print OUTFILE "\n";        
print OUTFILE "Atom Number\tAmino acid\tX coordinate\tY Coordinate\tZ Coordinate\n";
foreach $line (sort {$a <=> $b} keys %parsedData) {
print OUTFILE $parsedData{$line}."\n";
}
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "-----------------------------------------------------------------------\n";
print OUTFILE "------------ALL AMINO ACIDS WITH Z COORDINATE > -15 && < 15------------\n";
print OUTFILE "-----------------------------------------------------------------------\n";
print OUTFILE "\n";    
print OUTFILE "Atom Number\tAmino acid\tX coordinate\tY Coordinate\tZ Coordinate\n";
foreach $line (sort {$a <=> $b} keys %parsedData2) {
print OUTFILE $parsedData2{$line}."\n";
}   
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "-----------------------------------------------------------------------\n";
print OUTFILE "------------ALL Charged AMINO ACIDS WITH Z COORDINATE > -15 && < 15------------\n";
print OUTFILE "-----------------------------------------------------------------------\n";
print OUTFILE "\n";    
print OUTFILE "Atom Number\tAmino acid\tX coordinate\tY Coordinate\tZ Coordinate\n";
foreach $line (sort {$a <=> $b} keys %parsedData7) {
print OUTFILE $parsedData7{$line}."\n";
}   
print OUTFILE "\n";
print OUTFILE "\n";
 print OUTFILE "------------------------------------------------------------------------\n";
print OUTFILE "-----AMINO ACID NUMBERS AND PERCENTAGE FOR THE BACKBONE AMINO ACIDS-----\n";
print OUTFILE "------------------------------------------------------------------------\n";
print OUTFILE "\n";    
print OUTFILE "Amino acid\tTotal Number(N)\n";
print OUTFILE "--------------------------------------------------\n";
print OUTFILE "Alanine\t\t", $count_of_alanine, "\t\t\n";
print OUTFILE "Arginine\t", $count_of_arginine, "\t\t\n";
print OUTFILE "Asparagine\t", $count_of_asparagine, "\t\t\n";
print OUTFILE "Aspartic Acid\t", $count_of_aspartic_acid, "\t\t\n";
print OUTFILE "Cysteine\t", $count_of_cysteine, "\t\t\n";
print OUTFILE "Glutamic Acid\t",$count_of_glutamic_acid, "\t\t\n";
print OUTFILE "Glutamine\t",$count_of_glutamine, "\t\t\n";
print OUTFILE "Glycine\t\t",$count_of_glycine, "\t\t\n";
print OUTFILE "Histidine\t",$count_of_histidine, "\t\t\n";
print OUTFILE "Isoleucine\t",$count_of_isoleucine, "\t\t\n";
print OUTFILE "Leucine\t\t",$count_of_leucine, "\t\t\n";
print OUTFILE "Lysine\t\t",$count_of_lysine, "\t\t\n";
print OUTFILE "Methionine\t",$count_of_methionine, "\t\t\n";
print OUTFILE "Phenylalanine\t",$count_of_phenylalanine, "\t\t\n";
print OUTFILE "Proline\t\t",$count_of_proline, "\t\t\n";
print OUTFILE "Serine\t\t",$count_of_serine, "\t\t\n";
print OUTFILE "Threonine\t",$count_of_threonine, "\t\t\n";
print OUTFILE "Tryptophan\t",$count_of_tryptophan, "\t\t\n";
print OUTFILE "Tyrosine\t",$count_of_tyrosine, "\t\t\n";
print OUTFILE "Valine\t\t",$count_of_valine, "\t\t\n";
print OUTFILE "--------------------------------------------------\n";
print OUTFILE "Total\t\t", scalar(keys %parsedData), "\t\t\n";
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "CHARGED AMINO ACIDS\t\t\t",$count_of_charged,"\n";
print OUTFILE "POLAR AMINO ACIDS\t\t\t",$count_of_polar,"\n";
print OUTFILE "AROMATIC AMINO ACIDS\t\t\t",$count_of_aromatic,"\n";
print OUTFILE "HYDROPHOBIC AMINO ACIDS\t\t\t",$count_of_hydrophobic,"\n";
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "------------------------------------------------------------------------\n";
print OUTFILE "----AMINO ACID NUMBERS AND PERCENTAGE FOR Z COORDINATE > -15 && < 15----\n";
print OUTFILE "------------------------------------------------------------------------\n";
print OUTFILE "\n";    
print OUTFILE "Amino acid\tTotal Number(N)\n";
print OUTFILE "--------------------------------------------------\n";
print OUTFILE "Alanine\t\t", $count_of_alanine2, "\t\t\n";
print OUTFILE "Arginine\t", $count_of_arginine2, "\t\t\n";
print OUTFILE "Asparagine\t", $count_of_asparagine2, "\t\t\n";
print OUTFILE "Aspartic Acid\t", $count_of_aspartic_acid2, "\t\t\n";
print OUTFILE "Cysteine\t", $count_of_cysteine2, "\t\t\n";
print OUTFILE "Glutamic Acid\t",$count_of_glutamic_acid2, "\t\t\n";
print OUTFILE "Glutamine\t",$count_of_glutamine2, "\t\t\n";
print OUTFILE "Glycine\t\t",$count_of_glycine2, "\t\t\n";
print OUTFILE "Histidine\t",$count_of_histidine2, "\t\t\n";
print OUTFILE "Isoleucine\t",$count_of_isoleucine2, "\t\t\n";
print OUTFILE "Leucine\t\t",$count_of_leucine2, "\t\t\n";
print OUTFILE "Lysine\t\t",$count_of_lysine2, "\t\t\n";
print OUTFILE "Methionine\t",$count_of_methionine2, "\t\t\n";
print OUTFILE "Phenylalanine\t",$count_of_phenylalanine2, "\t\t\n";
print OUTFILE "Proline\t\t",$count_of_proline2, "\t\t\n";
print OUTFILE "Serine\t\t",$count_of_serine2, "\t\t\n";
print OUTFILE "Threonine\t",$count_of_threonine2, "\t\t\n";
print OUTFILE "Tryptophan\t",$count_of_tryptophan2, "\t\t\n";
print OUTFILE "Tyrosine\t",$count_of_tyrosine2, "\t\t\n";
print OUTFILE "Valine\t\t",$count_of_valine2, "\t\t\n";
print OUTFILE "--------------------------------------------------\n";
print OUTFILE "Total\t\t", scalar(keys %parsedData2), "\t\t\n";
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "CHARGED AMINO ACIDS\t\t\t",$count_of_charged2,"\n";
print OUTFILE "POLAR AMINO ACIDS\t\t\t",$count_of_polar2,"\n";
print OUTFILE "AROMATIC AMINO ACIDS\t\t\t",$count_of_aromatic2,"\n";
print OUTFILE "HYDROPHOBIC AMINO ACIDS\t\t\t",$count_of_hydrophobic2,"\n";
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "------------------------------------------------------------------\n";
print OUTFILE "------------------------------------------------------------------\n";



# close the output file
close (OUTFILE);

# end message
print "The coordinates of '$pdbFile' were saved into '$outputFile'.\n";
}
# end the program
exit;
bioinformatics pdb perl • 2.3k views
ADD COMMENTlink modified 4.0 years ago by Istvan Albert ♦♦ 77k • written 4.8 years ago by johnprakash0630

i need this loop to run a directory with many .pdb files... so far it works... but the problem is the values of $parsedData{$line} keep stacking up from d first file to the last, meaning the size of the files keep increasing, and the values are false... d rest of the values(count++) are all running correctly...how can i solve dis problem please... any ideas... tq

ADD REPLYlink written 4.8 years ago by johnprakash0630
0
gravatar for Matt Shirley
4.8 years ago by
Matt Shirley8.6k
Cambridge, MA
Matt Shirley8.6k wrote:

I would start by simplifying your script to make it operate on only one file at a time, produce one output file for each input file, and then use a bash loop to run it on the directory of files. Unless this is a homework assignment where the instructor has given you an assignment to match patterns using if/else while reading all files form a directory in a loop. If you can use it, I also suggest the Bio::Perl pdb parser to do the IO.

ADD COMMENTlink modified 4.8 years ago • written 4.8 years ago by Matt Shirley8.6k

actually the bash works for mac platform, but now m working mainly on windows platform, dats the problem actually... and yes, the file does actually read one by one, dats d original file i did, i juz added the loop recently...

ADD REPLYlink written 4.8 years ago by johnprakash0630
Please log in to add an answer.

Help
Access

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
Powered by Biostar version 2.3.0
Traffic: 1432 users visited in the last hour