I need this loop to run a directory with many .pdb files. So far it works but the problem is the values of $parsedData{$line}
keep stacking up from the first file to the last, meaning the size of the files keep increasing, and the values are false. The rest of the values(count++
) are all running correctly. How can I solve this problem please? Any ideas?
Thank you!
# coordExtract.pl
# # parser that allows editing a pdb
# file to extract the coordinates
my @pdbFiles = glob("*.pdb");
foreach my $pdbFile (@pdbFiles){
#count for all amino acids existing in the protein
$count_of_alanine=0;
$count_of_arginine=0;
$count_of_asparagine=0;
$count_of_aspartic_acid=0;
$count_of_cysteine=0;
$count_of_glutamic_acid=0;
$count_of_glutamine=0;
$count_of_glycine=0;
$count_of_histidine=0;
$count_of_isoleucine=0;
$count_of_leucine=0;
$count_of_lysine=0;
$count_of_methionine=0;
$count_of_phenylalanine=0;
$count_of_proline=0;
$count_of_serine=0;
$count_of_threonine=0;
$count_of_tryptophan=0;
$count_of_tyrosine=0;
$count_of_valine=0;
#count for all amino acids which hav z coordinate between -15 and 15
$count_of_alanine2=0;
$count_of_arginine2=0;
$count_of_asparagine2=0;
$count_of_aspartic_acid2=0;
$count_of_cysteine2=0;
$count_of_glutamic_acid2=0;
$count_of_glutamine2=0;
$count_of_glycine2=0;
$count_of_histidine2=0;
$count_of_isoleucine2=0;
$count_of_leucine2=0;
$count_of_lysine2=0;
$count_of_methionine2=0;
$count_of_phenylalanine2=0;
$count_of_proline2=0;
$count_of_serine2=0;
$count_of_threonine2=0;
$count_of_tryptophan2=0;
$count_of_tyrosine2=0;
$count_of_valine2=0;
#count for groups of amino acids
$count_of_charged=0;
$count_of_polar=0;
$count_of_aromatic=0;
$count_of_hydrophobic=0;
$count_of_charged2=0;
$count_of_polar2=0;
$count_of_aromatic2=0;
$count_of_hydrophobic2=0;
unless (open(INPUTFILE, $pdbFile)) {
print "Cannot read from '$pdbFile'.\nProgram closing.\n";
<STDIN>;
#exit;
}
# load the file into an array
chomp(@dataArray = <INPUTFILE>);
# close the file
close(INPUTFILE);
###############
# LET'S WORK! #
###############
# parse the input file saving only backbone atoms coordinates
# format: [string "ATOM"] [number] [atom] [aa] whateva [3 decimal numbers] whateva with two dots in between
for ($line = 0; $line < scalar @dataArray; $line++) {
if ($dataArray[$line]=~/^HEADER\s+(.*?)$/) {
$header = $1;
}
if ($dataArray[$line]=~/^TITLE\s+(.*?)$/) {
$parsing{$line} = $1;
}
if ($dataArray[$line] =~ m/ATOM\s+(\d+)\s+(\w+)\s+(\w{3})\s+.+\s+(\S+\.\S+)\s+(\S+\.\S+)\s+(\S+\.\S+)\s+.+\..+\..+/ig) {
if (($2 eq "N" || $2 eq "CA" || $2 eq "C") && ($6 >= -15 && $6 <= 15) && (($3 eq "ARG") || ($3 eq "ASP") || ($3 eq "GLU") || ($3 eq "LYS"))) {
$parsedData7{$line} = $1."\t\t".$3."\t\t".$4."\t\t".$5."\t\t".$6;
}
if ($2 eq "N" || $2 eq "CA" || $2 eq "C") {
$parsedData{$line} = $1."\t\t".$3."\t\t".$4."\t\t".$5."\t\t".$6;
if($3 eq "ALA"){
$count_of_alanine++;
}
if($3 eq "ARG"){
$count_of_arginine++;
}
if($3 eq "ASN"){
$count_of_asparagine++;
}
if($3 eq "ASP"){
$count_of_aspartic_acid++;
}
if($3 eq "CYS"){
$count_of_cysteine++;
}
if($3 eq "GLU"){
$count_of_glutamic_acid++;
}
if($3 eq "GLN"){
$count_of_glutamine++;
}
if($3 eq "GLY"){
$count_of_glycine++;
}
if($3 eq "HIS"){
$count_of_histidine++;
}
if($3 eq "ILE"){
$count_of_isoleucine++;
}
if($3 eq "LEU"){
$count_of_leucine++;
}
if($3 eq "LYS"){
$count_of_lysine++;
}
if($3 eq "MET"){
$count_of_methionine++;
}
if($3 eq "PHE"){
$count_of_phenylalanine++;
}
if($3 eq "PRO"){
$count_of_proline++;
}
if($3 eq "SER"){
$count_of_serine++;
}
if($3 eq "THR"){
$count_of_threonine++;
}
if($3 eq "TRP"){
$count_of_tryptophan++;
}
if($3 eq "TYR"){
$count_of_tyrosine++;
}
if($3 eq "VAL"){
$count_of_valine++;
}
if($3 eq "ARG"||$3 eq "ASP"||$3 eq "GLU"||$3 eq "LYS"){
$count_of_charged++;
}
if($3 eq "ASN"||$3 eq "GLN"||$3 eq "GLY"||$3 eq "MET"||$3 eq "PRO"){
$count_of_polar++;
}
if($3 eq "PHE"||$3 eq "TRP"||$3 eq "TYR"||$3 eq "HIS"){
$count_of_aromatic++;
}
if($3 eq "ALA"||$3 eq "ILE"||$3 eq "LEU"||$3 eq "VAL"){
$count_of_hydrophobic++;
}
}
if (($2 eq "N" || $2 eq "CA" || $2 eq "C") && ($6 >= -15 && $6 <= 15)) {
$parsedData2{$line} = $1."\t\t".$3."\t\t".$4."\t\t".$5."\t\t".$6;
if($3 eq "ALA"){
$count_of_alanine2++;
}
if($3 eq "ARG"){
$count_of_arginine2++;
}
if($3 eq "ASN"){
$count_of_asparagine2++;
}
if($3 eq "ASP"){
$count_of_aspartic_acid2++;
}
if($3 eq "CYS"){
$count_of_cysteine2++;
}
if($3 eq "GLU"){
$count_of_glutamic_acid2++;
}
if($3 eq "GLN"){
$count_of_glutamine2++;
}
if($3 eq "GLY"){
$count_of_glycine2++;
}
if($3 eq "HIS"){
$count_of_histidine2++;
}
if($3 eq "ILE"){
$count_of_isoleucine2++;
}
if($3 eq "LEU"){
$count_of_leucine2++;
}
if($3 eq "LYS"){
$count_of_lysine2++;
}
if($3 eq "MET"){
$count_of_methionine2++;
}
if($3 eq "PHE"){
$count_of_phenylalanine2++;
}
if($3 eq "PRO"){
$count_of_proline2++;
}
if($3 eq "SER"){
$count_of_serine2++;
}
if($3 eq "THR"){
$count_of_threonine2++;
}
if($3 eq "TRP"){
$count_of_tryptophan2++;
}
if($3 eq "TYR"){
$count_of_tyrosine2++;
}
if($3 eq "VAL"){
$count_of_valine2++;
}
if($3 eq "ARG"||$3 eq "ASP"||$3 eq "GLU"||$3 eq "LYS"){
$count_of_charged2++;
}
if($3 eq "ASN"||$3 eq "GLN"||$3 eq "GLY"||$3 eq "MET"||$3 eq "PRO"){
$count_of_polar2++;
}
if($3 eq "PHE"||$3 eq "TRP"||$3 eq "TYR"||$3 eq "HIS"){
$count_of_aromatic2++;
}
if($3 eq "ALA"||$3 eq "ILE"||$3 eq "LEU"||$3 eq "VAL"){
$count_of_hydrophobic2++;
}
}
}
}
# create the output file name
$outputFile = "coordinates_".$pdbFile;
# open the output file
open (OUTFILE, ">$outputFile");
# print the data lines
print OUTFILE $header, "\n";
foreach $line (sort {$a <=> $b} keys %parsing) {
print OUTFILE $parsing{$line}."\n";
}
print OUTFILE $title, "\n";
print OUTFILE "-----------------------------------------------------------------------\n";
print OUTFILE "------------ALL BACKBONE AMINO ACIDS IN THE MEMBRANE PROTEIN-----------\n";
print OUTFILE "-----------------------------------------------------------------------\n";
print OUTFILE "\n";
print OUTFILE "Atom Number\tAmino acid\tX coordinate\tY Coordinate\tZ Coordinate\n";
foreach $line (sort {$a <=> $b} keys %parsedData) {
print OUTFILE $parsedData{$line}."\n";
}
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "-----------------------------------------------------------------------\n";
print OUTFILE "------------ALL AMINO ACIDS WITH Z COORDINATE > -15 && < 15------------\n";
print OUTFILE "-----------------------------------------------------------------------\n";
print OUTFILE "\n";
print OUTFILE "Atom Number\tAmino acid\tX coordinate\tY Coordinate\tZ Coordinate\n";
foreach $line (sort {$a <=> $b} keys %parsedData2) {
print OUTFILE $parsedData2{$line}."\n";
}
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "-----------------------------------------------------------------------\n";
print OUTFILE "------------ALL Charged AMINO ACIDS WITH Z COORDINATE > -15 && < 15------------\n";
print OUTFILE "-----------------------------------------------------------------------\n";
print OUTFILE "\n";
print OUTFILE "Atom Number\tAmino acid\tX coordinate\tY Coordinate\tZ Coordinate\n";
foreach $line (sort {$a <=> $b} keys %parsedData7) {
print OUTFILE $parsedData7{$line}."\n";
}
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "------------------------------------------------------------------------\n";
print OUTFILE "-----AMINO ACID NUMBERS AND PERCENTAGE FOR THE BACKBONE AMINO ACIDS-----\n";
print OUTFILE "------------------------------------------------------------------------\n";
print OUTFILE "\n";
print OUTFILE "Amino acid\tTotal Number(N)\n";
print OUTFILE "--------------------------------------------------\n";
print OUTFILE "Alanine\t\t", $count_of_alanine, "\t\t\n";
print OUTFILE "Arginine\t", $count_of_arginine, "\t\t\n";
print OUTFILE "Asparagine\t", $count_of_asparagine, "\t\t\n";
print OUTFILE "Aspartic Acid\t", $count_of_aspartic_acid, "\t\t\n";
print OUTFILE "Cysteine\t", $count_of_cysteine, "\t\t\n";
print OUTFILE "Glutamic Acid\t",$count_of_glutamic_acid, "\t\t\n";
print OUTFILE "Glutamine\t",$count_of_glutamine, "\t\t\n";
print OUTFILE "Glycine\t\t",$count_of_glycine, "\t\t\n";
print OUTFILE "Histidine\t",$count_of_histidine, "\t\t\n";
print OUTFILE "Isoleucine\t",$count_of_isoleucine, "\t\t\n";
print OUTFILE "Leucine\t\t",$count_of_leucine, "\t\t\n";
print OUTFILE "Lysine\t\t",$count_of_lysine, "\t\t\n";
print OUTFILE "Methionine\t",$count_of_methionine, "\t\t\n";
print OUTFILE "Phenylalanine\t",$count_of_phenylalanine, "\t\t\n";
print OUTFILE "Proline\t\t",$count_of_proline, "\t\t\n";
print OUTFILE "Serine\t\t",$count_of_serine, "\t\t\n";
print OUTFILE "Threonine\t",$count_of_threonine, "\t\t\n";
print OUTFILE "Tryptophan\t",$count_of_tryptophan, "\t\t\n";
print OUTFILE "Tyrosine\t",$count_of_tyrosine, "\t\t\n";
print OUTFILE "Valine\t\t",$count_of_valine, "\t\t\n";
print OUTFILE "--------------------------------------------------\n";
print OUTFILE "Total\t\t", scalar(keys %parsedData), "\t\t\n";
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "CHARGED AMINO ACIDS\t\t\t",$count_of_charged,"\n";
print OUTFILE "POLAR AMINO ACIDS\t\t\t",$count_of_polar,"\n";
print OUTFILE "AROMATIC AMINO ACIDS\t\t\t",$count_of_aromatic,"\n";
print OUTFILE "HYDROPHOBIC AMINO ACIDS\t\t\t",$count_of_hydrophobic,"\n";
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "------------------------------------------------------------------------\n";
print OUTFILE "----AMINO ACID NUMBERS AND PERCENTAGE FOR Z COORDINATE > -15 && < 15----\n";
print OUTFILE "------------------------------------------------------------------------\n";
print OUTFILE "\n";
print OUTFILE "Amino acid\tTotal Number(N)\n";
print OUTFILE "--------------------------------------------------\n";
print OUTFILE "Alanine\t\t", $count_of_alanine2, "\t\t\n";
print OUTFILE "Arginine\t", $count_of_arginine2, "\t\t\n";
print OUTFILE "Asparagine\t", $count_of_asparagine2, "\t\t\n";
print OUTFILE "Aspartic Acid\t", $count_of_aspartic_acid2, "\t\t\n";
print OUTFILE "Cysteine\t", $count_of_cysteine2, "\t\t\n";
print OUTFILE "Glutamic Acid\t",$count_of_glutamic_acid2, "\t\t\n";
print OUTFILE "Glutamine\t",$count_of_glutamine2, "\t\t\n";
print OUTFILE "Glycine\t\t",$count_of_glycine2, "\t\t\n";
print OUTFILE "Histidine\t",$count_of_histidine2, "\t\t\n";
print OUTFILE "Isoleucine\t",$count_of_isoleucine2, "\t\t\n";
print OUTFILE "Leucine\t\t",$count_of_leucine2, "\t\t\n";
print OUTFILE "Lysine\t\t",$count_of_lysine2, "\t\t\n";
print OUTFILE "Methionine\t",$count_of_methionine2, "\t\t\n";
print OUTFILE "Phenylalanine\t",$count_of_phenylalanine2, "\t\t\n";
print OUTFILE "Proline\t\t",$count_of_proline2, "\t\t\n";
print OUTFILE "Serine\t\t",$count_of_serine2, "\t\t\n";
print OUTFILE "Threonine\t",$count_of_threonine2, "\t\t\n";
print OUTFILE "Tryptophan\t",$count_of_tryptophan2, "\t\t\n";
print OUTFILE "Tyrosine\t",$count_of_tyrosine2, "\t\t\n";
print OUTFILE "Valine\t\t",$count_of_valine2, "\t\t\n";
print OUTFILE "--------------------------------------------------\n";
print OUTFILE "Total\t\t", scalar(keys %parsedData2), "\t\t\n";
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "CHARGED AMINO ACIDS\t\t\t",$count_of_charged2,"\n";
print OUTFILE "POLAR AMINO ACIDS\t\t\t",$count_of_polar2,"\n";
print OUTFILE "AROMATIC AMINO ACIDS\t\t\t",$count_of_aromatic2,"\n";
print OUTFILE "HYDROPHOBIC AMINO ACIDS\t\t\t",$count_of_hydrophobic2,"\n";
print OUTFILE "\n";
print OUTFILE "\n";
print OUTFILE "------------------------------------------------------------------\n";
print OUTFILE "------------------------------------------------------------------\n";
# close the output file
close (OUTFILE);
# end message
print "The coordinates of '$pdbFile' were saved into '$outputFile'.\n";
}
# end the program
exit;
actually the bash works for mac platform, but now m working mainly on windows platform, dats the problem actually... and yes, the file does actually read one by one, dats d original file i did, i juz added the loop recently...