Question: (Closed) Parsing Pdb File Using Perl Script 2.
0
gravatar for hmarajkrishnan
5.2 years ago by
hmarajkrishnan0 wrote:
 # Tallies for every amino acid type over the whole protein; all start at zero.
for my $tally (
    $count_of_alanine,       $count_of_arginine,   $count_of_asparagine,
    $count_of_aspartic_acid, $count_of_cysteine,   $count_of_glutamic_acid,
    $count_of_glutamine,     $count_of_glycine,    $count_of_histidine,
    $count_of_isoleucine,    $count_of_leucine,    $count_of_lysine,
    $count_of_methionine,    $count_of_phenylalanine, $count_of_proline,
    $count_of_serine,        $count_of_threonine,  $count_of_tryptophan,
    $count_of_tyrosine,      $count_of_valine,
) {
    # The loop variable aliases each listed global, so this zeroes the global.
    $tally = 0;
}

# Same tallies, restricted to atoms whose z coordinate lies between -15 and 15.
for my $tally (
    $count_of_alanine2,       $count_of_arginine2,   $count_of_asparagine2,
    $count_of_aspartic_acid2, $count_of_cysteine2,   $count_of_glutamic_acid2,
    $count_of_glutamine2,     $count_of_glycine2,    $count_of_histidine2,
    $count_of_isoleucine2,    $count_of_leucine2,    $count_of_lysine2,
    $count_of_methionine2,    $count_of_phenylalanine2, $count_of_proline2,
    $count_of_serine2,        $count_of_threonine2,  $count_of_tryptophan2,
    $count_of_tyrosine2,      $count_of_valine2,
) {
    $tally = 0;
}

# Tallies for groups of amino acids (charged / polar / aromatic / hydrophobic).
# No suffix and suffix 2 follow the two sets above; what the 3..6 variants are
# meant to cover is not established here -- TODO confirm.
for my $tally (
    $count_of_charged,  $count_of_polar,  $count_of_aromatic,  $count_of_hydrophobic,
    $count_of_charged2, $count_of_polar2, $count_of_aromatic2, $count_of_hydrophobic2,
    $count_of_charged3, $count_of_polar3, $count_of_aromatic3, $count_of_hydrophobic3,
    $count_of_charged4, $count_of_polar4, $count_of_aromatic4, $count_of_hydrophobic4,
    $count_of_charged5, $count_of_polar5, $count_of_aromatic5, $count_of_hydrophobic5,
    $count_of_charged6, $count_of_polar6, $count_of_aromatic6, $count_of_hydrophobic6,
) {
    $tally = 0;
}


 # Ask the user for the PDB file to analyse and load it into @dataArray.
print "\nEnter the input file: ";
$inputFile = <STDIN>;

# <STDIN> yields undef at end of input; treat that as an empty file name so
# the failure path below reports it instead of operating on undef.
$inputFile = '' unless defined $inputFile;
chomp $inputFile;

# Two-arg open ignores whitespace around the file name; trim explicitly so
# names typed with stray blanks keep working with the three-arg form below.
$inputFile =~ s/^\s+//;
$inputFile =~ s/\s+$//;

# Three-arg open with an explicit read mode: characters such as '>' or '|'
# in the typed name are taken literally and can never change the open mode.
my $input_fh;
unless (open($input_fh, '<', $inputFile)) {
    warn "open '$inputFile': $!\n";    # OS-level reason, reported on STDERR
    print "Cannot read from '$inputFile'.\nProgram closing.\n";
    <STDIN>;                           # wait for Enter so the message stays visible
    exit 1;                            # non-zero status signals the failure
}

# Load the whole file, one array element per line, without line endings.
chomp(@dataArray = <$input_fh>);

close($input_fh);


#####################
# PARSE INPUT FILE! #
#####################

# Walks every line of @dataArray and fills:
#   $header       text of the (last) HEADER record
#   %parsing      line index => text of each TITLE record
#   %parsedData   line index => row for every backbone atom (N, CA, C)
#   %parsedData2  same, only atoms with -15 <= z <= 15
#   %parsedData7  same as %parsedData2, only charged residues
# plus the $count_of_* tallies. Tallies are per backbone ATOM record, so one
# residue contributes up to three counts (its N, CA and C atoms).

# Residue name => [ tally over all backbone atoms, tally for -15 <= z <= 15 ].
my %residue_counters = (
    ALA => [ \$count_of_alanine,       \$count_of_alanine2 ],
    ARG => [ \$count_of_arginine,      \$count_of_arginine2 ],
    ASN => [ \$count_of_asparagine,    \$count_of_asparagine2 ],
    ASP => [ \$count_of_aspartic_acid, \$count_of_aspartic_acid2 ],
    CYS => [ \$count_of_cysteine,      \$count_of_cysteine2 ],
    GLU => [ \$count_of_glutamic_acid, \$count_of_glutamic_acid2 ],
    GLN => [ \$count_of_glutamine,     \$count_of_glutamine2 ],
    GLY => [ \$count_of_glycine,       \$count_of_glycine2 ],
    HIS => [ \$count_of_histidine,     \$count_of_histidine2 ],
    ILE => [ \$count_of_isoleucine,    \$count_of_isoleucine2 ],
    LEU => [ \$count_of_leucine,       \$count_of_leucine2 ],
    LYS => [ \$count_of_lysine,        \$count_of_lysine2 ],
    MET => [ \$count_of_methionine,    \$count_of_methionine2 ],
    PHE => [ \$count_of_phenylalanine, \$count_of_phenylalanine2 ],
    PRO => [ \$count_of_proline,       \$count_of_proline2 ],
    SER => [ \$count_of_serine,        \$count_of_serine2 ],
    THR => [ \$count_of_threonine,     \$count_of_threonine2 ],
    TRP => [ \$count_of_tryptophan,    \$count_of_tryptophan2 ],
    TYR => [ \$count_of_tyrosine,      \$count_of_tyrosine2 ],
    VAL => [ \$count_of_valine,        \$count_of_valine2 ],
);

# Residue name => group it is tallied under.
# SER, THR and CYS are not assigned to any group.
my %group_of = (
    ARG => 'charged',     ASP => 'charged',     GLU => 'charged',     LYS => 'charged',
    ASN => 'polar',       GLN => 'polar',       GLY => 'polar',       MET => 'polar',
    PRO => 'polar',
    PHE => 'aromatic',    TRP => 'aromatic',    TYR => 'aromatic',    HIS => 'aromatic',
    ALA => 'hydrophobic', ILE => 'hydrophobic', LEU => 'hydrophobic', VAL => 'hydrophobic',
);

# Group => [ tally over all backbone atoms, tally for -15 <= z <= 15 ].
my %group_counters = (
    charged     => [ \$count_of_charged,     \$count_of_charged2 ],
    polar       => [ \$count_of_polar,       \$count_of_polar2 ],
    aromatic    => [ \$count_of_aromatic,    \$count_of_aromatic2 ],
    hydrophobic => [ \$count_of_hydrophobic, \$count_of_hydrophobic2 ],
);

# Record one line if it is an ATOM record for a backbone atom (N, CA or C).
#   $index  - position of the line in @dataArray, used as the hash key
#   $record - the text of the line
# Returns 1 when the line was recorded, nothing otherwise.
sub _tally_backbone_atom {
    my ($index, $record) = @_;

    # Expected layout: "ATOM" serial atom-name residue-name <anything>
    # x y z <anything containing two more dots>.
    return
        unless $record =~ m/^\s*ATOM\s+(\d+)\s+(\w+)\s+(\w{3})\s+.+\s+(\S+\.\S+)\s+(\S+\.\S+)\s+(\S+\.\S+)\s+.+\..+\..+/i;

    # Copy the captures at once so later code does not depend on $1..$6.
    my ($serial, $atom, $residue, $x, $y, $z) = ($1, $2, $3, $4, $5, $6);

    return unless $atom eq "N" || $atom eq "CA" || $atom eq "C";

    my $row      = join("\t\t", $serial, $residue, $x, $y, $z);
    my $residues = $residue_counters{$residue};    # undef for unlisted residue names
    my $group    = $group_of{$residue};            # undef for ungrouped residues
    my $groups   = defined $group ? $group_counters{$group} : undef;

    # Whole-protein bookkeeping.
    $parsedData{$index} = $row;
    ${ $residues->[0] }++ if $residues;
    ${ $groups->[0] }++   if $groups;

    # Bookkeeping for the -15 <= z <= 15 slab.
    if ($z >= -15 && $z <= 15) {
        $parsedData2{$index} = $row;
        ${ $residues->[1] }++ if $residues;
        ${ $groups->[1] }++   if $groups;
        $parsedData7{$index} = $row if defined $group && $group eq 'charged';
    }

    return 1;
}

for my $index (0 .. $#dataArray) {
    my $record = $dataArray[$index];

    if ($record =~ /^HEADER\s+(.*?)$/) {
        $header = $1;
    }
    if ($record =~ /^TITLE\s+(.*?)$/) {
        $parsing{$index} = $1;
    }

    _tally_backbone_atom($index, $record);
}


     # Write the report: header/title lines, the atom listings, then the tallies.

# The report goes next to the current directory under "coordinates_<input name>".
$outputFile = "coordinates_".$inputFile;

# Three-arg open, checked: without the check a failed open would go unnoticed
# and the script would still announce success at the end.
my $out_fh;
unless (open($out_fh, '>', $outputFile)) {
    warn "open '$outputFile': $!\n";
    print "Cannot write to '$outputFile'.\nProgram closing.\n";
    <STDIN>;    # wait for Enter so the message stays visible
    exit 1;
}

# Horizontal rules used by the listing sections, the summaries and the tables.
my $section_rule = "-----------------------------------------------------------------------\n";
my $summary_rule = "------------------------------------------------------------------------\n";
my $table_rule   = "--------------------------------------------------\n";

# Row labels, including the tab padding that lines up the number column.
my @residue_labels = (
    "Alanine\t\t",     "Arginine\t",      "Asparagine\t", "Aspartic Acid\t",
    "Cysteine\t",      "Glutamic Acid\t", "Glutamine\t",  "Glycine\t\t",
    "Histidine\t",     "Isoleucine\t",    "Leucine\t\t",  "Lysine\t\t",
    "Methionine\t",    "Phenylalanine\t", "Proline\t\t",  "Serine\t\t",
    "Threonine\t",     "Tryptophan\t",    "Tyrosine\t",   "Valine\t\t",
);
my @group_labels = (
    "CHARGED AMINO ACIDS\t\t\t",  "POLAR AMINO ACIDS\t\t\t",
    "AROMATIC AMINO ACIDS\t\t\t", "HYDROPHOBIC AMINO ACIDS\t\t\t",
);

# Print one atom listing: banner, column header, then the rows of %$rows in
# ascending key (input line) order, followed by two blank lines.
sub _print_atom_section {
    my ($fh, $banner, $rows) = @_;
    print {$fh} $section_rule, $banner, $section_rule, "\n";
    print {$fh} "Atom Number\tAmino acid\tX coordinate\tY Coordinate\tZ Coordinate\n";
    for my $key (sort { $a <=> $b } keys %$rows) {
        print {$fh} $rows->{$key}, "\n";
    }
    print {$fh} "\n", "\n";
    return;
}

# Print the four group totals (charged, polar, aromatic, hydrophobic, in that
# order) followed by two blank lines.
sub _print_group_totals {
    my ($fh, @totals) = @_;
    for my $i (0 .. $#group_labels) {
        print {$fh} $group_labels[$i], $totals[$i], "\n";
    }
    print {$fh} "\n", "\n";
    return;
}

# Print one summary: banner, the per-residue table (values in the order of
# @residue_labels), the grand total and the group totals.
sub _print_summary {
    my ($fh, $banner, $total, $residue_totals, $group_totals) = @_;
    print {$fh} $summary_rule, $banner, $summary_rule, "\n";
    print {$fh} "Amino acid\tTotal Number(N)\n";
    print {$fh} $table_rule;
    for my $i (0 .. $#residue_labels) {
        print {$fh} $residue_labels[$i], $residue_totals->[$i], "\t\t\n";
    }
    print {$fh} $table_rule;
    print {$fh} "Total\t\t", $total, "\t\t\n";
    print {$fh} "\n", "\n";
    _print_group_totals($fh, @$group_totals);
    return;
}

# HEADER text, then every TITLE line in input order.
print {$out_fh} (defined $header ? $header : ''), "\n";
for my $key (sort { $a <=> $b } keys %parsing) {
    print {$out_fh} $parsing{$key}, "\n";
}
# One empty line after the titles (the titles themselves come from %parsing).
print {$out_fh} "\n";

_print_atom_section($out_fh,
    "------------ALL BACKBONE AMINO ACIDS IN THE MEMBRANE PROTEIN-----------\n",
    \%parsedData);
_print_atom_section($out_fh,
    "------------ALL AMINO ACIDS WITH Z COORDINATE > -15 && < 15------------\n",
    \%parsedData2);
_print_atom_section($out_fh,
    "------------ALL Charged AMINO ACIDS WITH Z COORDINATE > -15 && < 15------------\n",
    \%parsedData7);
# Each of the next two sections lists rows from its own hash.
# NOTE(review): nothing in this script appears to fill %parsedData5 or
# %parsedData6, so both sections currently come out empty -- confirm intent.
_print_atom_section($out_fh,
    "------------ALL AMINO ACIDS WITH Z COORDINATE > -5 && < 5--------------\n",
    \%parsedData5);
_print_atom_section($out_fh,
    "------ALL AMINO ACIDS WITH Z COORDINATE > -15 && < -5// >5 && <15------\n",
    \%parsedData6);

_print_summary($out_fh,
    "-----AMINO ACID NUMBERS AND PERCENTAGE FOR THE BACKBONE AMINO ACIDS-----\n",
    scalar(keys %parsedData),
    [
        $count_of_alanine,       $count_of_arginine,   $count_of_asparagine,
        $count_of_aspartic_acid, $count_of_cysteine,   $count_of_glutamic_acid,
        $count_of_glutamine,     $count_of_glycine,    $count_of_histidine,
        $count_of_isoleucine,    $count_of_leucine,    $count_of_lysine,
        $count_of_methionine,    $count_of_phenylalanine, $count_of_proline,
        $count_of_serine,        $count_of_threonine,  $count_of_tryptophan,
        $count_of_tyrosine,      $count_of_valine,
    ],
    [ $count_of_charged, $count_of_polar, $count_of_aromatic, $count_of_hydrophobic ],
);

_print_summary($out_fh,
    "----AMINO ACID NUMBERS AND PERCENTAGE FOR Z COORDINATE > -15 && < 15----\n",
    scalar(keys %parsedData2),
    [
        $count_of_alanine2,       $count_of_arginine2,   $count_of_asparagine2,
        $count_of_aspartic_acid2, $count_of_cysteine2,   $count_of_glutamic_acid2,
        $count_of_glutamine2,     $count_of_glycine2,    $count_of_histidine2,
        $count_of_isoleucine2,    $count_of_leucine2,    $count_of_lysine2,
        $count_of_methionine2,    $count_of_phenylalanine2, $count_of_proline2,
        $count_of_serine2,        $count_of_threonine2,  $count_of_tryptophan2,
        $count_of_tyrosine2,      $count_of_valine2,
    ],
    [ $count_of_charged2, $count_of_polar2, $count_of_aromatic2, $count_of_hydrophobic2 ],
);

# Third set of group totals, printed without a banner of its own.
# NOTE(review): the *4 tallies are only initialised in this script, never
# incremented, so these lines currently show their initial values -- confirm.
_print_group_totals($out_fh,
    $count_of_charged4, $count_of_polar4, $count_of_aromatic4, $count_of_hydrophobic4);

# Buffered write errors only surface at close, so check it before claiming success.
unless (close($out_fh)) {
    warn "close '$outputFile': $!\n";
    print "Cannot write to '$outputFile'.\nProgram closing.\n";
    exit 1;
}

# end message
print "The coordinates of '$inputFile' were saved into '$outputFile'.\n";

# end the program
exit;
pdb perl script • 1.9k views
ADD COMMENTlink written 5.2 years ago by hmarajkrishnan0

Everything works perfectly, but when it comes to the output files, the number of proteins reported for a PDB file keeps increasing when I use many .pdb files simultaneously. It is as if the results are stacking on top of each other, or it could be something else; I can't figure out the exact problem with this Perl script. Please help me.

ADD REPLYlink written 5.2 years ago by hmarajkrishnan0

This is a duplicate of your original post. Please edit the original.

ADD REPLYlink written 5.2 years ago by Matt Shirley8.9k
Please log in to add an answer.
The thread is closed. No new answers may be added.

Help
Access

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
Powered by Biostar version 2.3.0
Traffic: 1626 users visited in the last hour