|
#!/usr/bin/perl -w |
|
# |
|
# use this to remove stop codons from an alignment |
|
# typically, this would be done to calculate dN/dS in HYPHY |
|
# Usage: perl ../Scripts/ReplaceStopWithGaps.pl -pep 104D5_pep.fasta -nuc 104D5.fasta -output 104D5_nostop.fasta |
|
# use this to replace stop codons from the nucleotide alignment |
|
# the nucleotide and the peptide alignments are necessary |
|
|
|
|
|
use strict; |
|
use Getopt::Long; |
|
use Bio::SeqIO; |
|
|
|
my ($inpep,$innuc,$output, $i, %stop); |
|
&GetOptions( |
|
'pep:s' => $inpep,# |
|
'nuc:s' => $innuc, |
|
'output:s' => $output,#file without gaps |
|
); |
|
|
|
|
|
my $pep = Bio::SeqIO->new(-file => "$inpep" , '-format' => 'fasta'); |
|
my $nuc = Bio::SeqIO->new(-file => "$innuc" , '-format' => 'fasta'); |
|
my $out = Bio::SeqIO->new(-file => ">$output" , '-format' => 'fasta'); |
|
|
|
while ( my $pepseq = $pep->next_seq() ) { |
|
my $pep_str=uc($pepseq->seq); |
|
if ($pep_str=~/\*/){ |
|
my $pep_id=$pepseq->id(); |
|
my @aa=split(//,uc($pepseq->seq)); |
|
for ($i=0; $i<scalar(@aa); $i++){ |
|
if ($aa[$i]=~/\*/){ |
|
$stop{$pep_id}{$i}++; |
|
print "$pep_id peptide sequence has a stop $aa[$i] at ".($i+1)."\n"; |
|
} |
|
} |
|
} |
|
} |
|
while (my $nucseq = $nuc->next_seq()){ |
|
my $nuc_id=$nucseq->id(); |
|
my $nuc_str=uc($nucseq->seq); |
|
foreach my $pid (keys %stop){ |
|
|
|
if ("$nuc_id" eq "$pid"){ |
|
foreach my $site (keys %{$stop{$pid}}){ |
|
#print "match $nuc_id and $pid\n"; |
|
#print "The sequence for $nuc_id is \n$nuc_str\n"; |
|
my $nucpos=$site*3; |
|
my $codon = substr $nuc_str, $nucpos, 3; |
|
print "$codon "; |
|
if ($codon =~ /(((U|T)A(A|G|R))|((T|U)GA))/i){ |
|
substr($nuc_str, $nucpos, 3) = '---'; |
|
print "=> Match to a stop codon at nucleotide position ".($nucpos+1)."\nNew sequence for $nuc_id\n$nuc_str\n"; |
|
}else{ |
|
print "Doesn't seem to match a stop codon at nucleotide position ".($nucpos+1)." in $nuc_id\n"; |
|
} |
|
} |
|
} |
|
} |
|
my $newseq = Bio::Seq->new(-seq => "$nuc_str", |
|
-display_id => $nuc_id); |
|
$out->write_seq($newseq); |
|
} |
|
|
what aligner would you like to use? Most, if not all, have a command line. Deleting the stop codons afterwards should be "trivial". E.g. biopython has an interface for most aligners and will run PAML as well. However, please keep in mind that dN/dS calculations are (obviously) very dependent on a good alignment. The huge downside of this automated approach will be that you will likely not quality check each alignment before moving on.
Hello, I have a similar problem?In big data,I must delete the stop codons in the sequence.So,can you give me some suggestions?
Thanks!