I guess this is a very simple question for most of the people here. But I really want to know how to solve it. Thanks.
When I got a fasta file from genbank, I want to remove sequences <500 aa and rename each sequence with gb accession number and common name. I wrote two small files to solve the problem. But I don't know how to write a main program to put them together. The input file is rdrp all ref.fasta
1 sequence length filter
from Bio import SeqIO
def long_seq(filename):
long_sequences = []
input_handle=open(filename,'rU')
output_handle = open("long_seqs.fasta", "w")
for record in SeqIO.parse(input_handle, "fasta") :
if len(record.seq) >= 500 :
# Add this record to our list
long_sequences.append(record)
print "Found %i long sequences" % len(long_sequences)
SeqIO.write(long_sequences, output_handle, "fasta")
input_handle.close()
output_handle.close()
if __name__=="__main__":
long_seq('rdrp all ref.fasta')
2 rename each sequence from genbank fasta file with gb number + common name
def rename_seq(filename):
input_handle=open(filename)
output_handle=open("rename_seq.fasta","w")
for line in input_handle:
if '>' in line:
id_old=line.strip()
a=id_old.find('.')
b=id_old.find('[')
c=id_old.find(']')
id_new='>'+id_old[(a-8):a]+' '+id_old[(b+1):c]
print>>output_handle, id_new
else:
seq=line.strip()
print>>output_handle, seq
input_handle.close()
output_handle.close()
if __name__=="__main__":
rename_seq('long_seqs.fasta')