Entering edit mode
8.2 years ago
pbeare
•
0
Hi,
I have a python script that allows me to extract the locus_tag
info from a genbank file but I cant seem to get it to work for also getting the old_locus_tag
info (which has the original gene names used in all current publications.
Here's my script;
#!/usr/bin/env python
import sys
from Bio import SeqIO
if len(sys.argv) is not 4:
sys.exit("usage: genbank_extract_gene.py <type: nucleotide/protein> <in genbank> <out parsed file>")
in_type = sys.argv[1]
if in_type not in ("protein", "nucleotide"):
sys.exit("usage: genbank_extract_gene.py <type: nucleotide/protein> <in genbank> <out parsed file>")
in_genbank = open(sys.argv[2], 'r')
out_fasta = open(sys.argv[3], 'w')
# use parse() method for genbanks with more than one genbank record, will work with just one too
full_record = SeqIO.parse(in_genbank, "genbank")
for seq_record in full_record:
contig = seq_record.id
strain = seq_record.annotations['source'].split(' ')[2]
#print strain
for feature in seq_record.features:
if in_type == "nucleotide":
# for nucleotide
if feature.type == "gene":
locus_tag = feature.qualifiers.get('locus_tag')[0]
#print locus_tag
start = feature.location.start
end = feature.location.end
# reverse and complement if needed
if feature.strand is 1:
nt_seq = seq_record.seq[start:end]
else:
nt_seq = seq_record.seq[start:end].reverse_complement()
if feature.qualifiers.get('gene'):
gene = feature.qualifiers.get('gene')[0]
#print gene
out_fasta.write(">{0}_{1}\n{2}\n".format(locus_tag, gene, nt_seq))
else:
out_fasta.write(">{0}\n{1}\n".format(locus_tag, nt_seq))
elif in_type == "protein":
# for amino acid
if feature.type == "CDS":
locus_tag = feature.qualifiers.get('locus_tag')[0]
translation = feature.qualifiers.get('translation')[0]
out_fasta.write(">{0}\n{1}\n".format(locus_tag, translation))
in_genbank.close()
out_fasta.close()
and example of my genbank file;
FEATURES Location/Qualifiers
source 1..2158758
/organism="Coxiella burnetii Dugway 5J108-111"
/mol_type="genomic DNA"
/strain="Dugway 5J108-111"
/db_xref="taxon:434922"
gene complement(6..119)
/locus_tag="CBUD_RS00005"
CDS complement(6..119)
/locus_tag="CBUD_RS00005"
/inference="EXISTENCE: similar to AA
sequence:RefSeq:WP_005769930.1"
/note="Derived by automated computational analysis using
gene prediction method: Protein Homology."
/codon_start=1
/transl_table=11
/product="hypothetical protein"
/protein_id="WP_005769930.1"
/db_xref="GI:492172326"
/translation="MEFVRHLFYLAVDNLSRVERKEIKVLIVHIFIELALE"
gene 140..1495
/gene="dnaA"
/locus_tag="CBUD_RS00010"
/old_locus_tag="CBUD_0001"
/locus_tag_other="COXBU7E912_0001"
CDS 140..1495
/gene="dnaA"
/locus_tag="CBUD_RS00010"
/old_locus_tag="CBUD_0001"
/locus_tag_other="COXBU7E912_0001"
/inference="EXISTENCE: similar to AA
sequence:SwissProt:Q83FD8.1"
/note="binds to the dnaA-box as an ATP-bound complex at
the origin of replication during the initiation of
chromosomal replication; can also affect transcription of
multiple genes including itself; Derived by automated
computational analysis using gene prediction method:
Protein Homology."
/codon_start=1
/transl_table=11
/product="chromosomal replication initiator protein DnaA"
/protein_id="WP_005769932.1"
Thanks in advance
Cheers
Paul
I added the tag Biopython which would help other people finding your question.