|
|
|
### Code by Jeremy.Cox@cchmc.org, provided as freeware without warranty ### |
|
### A library for processing blast hits which provide NCBI gid / accession numbers to extract meaningful taxonomy information.### |
|
### Warning: I did not write this for public consumption, so watch out for unintended bugs! ### |
|
|
|
### This code written BEFORE the switch to Accession numbers away from gid ### |
|
### Python 2.7 ### |
|
|
|
import re |
|
import sys |
|
import os |
|
from subprocess import Popen, PIPE |
|
|
|
PATH="/data/aplab/microbiome_genome_only/ncbi_gi_taxid/" |
|
# ncbi gi dump files location |
|
# PATH="./" |
|
|
|
LINNAEUS_FILTER = ["species","genus","family","order","class","phylum","kingdom","superkingdom"] |
|
|
|
def buildNames(): |
|
inFile = open(PATH+"names.dmp") |
|
|
|
result = dict() |
|
|
|
for line in inFile: |
|
line = line.replace("\t","") |
|
line = line.replace("|\n","") |
|
splits = line.split("|") |
|
if splits[-1] == 'scientific name': |
|
key = int( line.split("|")[0] ) |
|
name = line.split("|")[1] |
|
result[key] = name |
|
|
|
return result |
|
|
|
print >> sys.stderr, 'taxa_library_cluster initializing :: Building Names database' |
|
g_names = buildNames() |
|
|
|
def buildNodes(): |
|
inFile = open(PATH+"nodes.dmp") |
|
|
|
result = dict() |
|
levels = dict() |
|
|
|
for line in inFile: |
|
#print line |
|
line = line.replace("\t","") |
|
line = line.replace("|\n","") |
|
splits = line.split("|") |
|
#print splits |
|
taxaID = int(splits[0]) |
|
parentID = int(splits[1]) |
|
level = splits[2] |
|
result[taxaID] = (parentID, level) |
|
levels[level]=1 |
|
|
|
return (result,levels) |
|
|
|
print >> sys.stderr, 'taxa_library_cluster initializing :: Building Nodes database' |
|
(g_nodes, g_levels) = buildNodes() |
|
|
|
def buildLineage( x , names, nodes, filter = LINNAEUS_FILTER ): |
|
k = int(x) |
|
result={} |
|
|
|
while k != 1 and k != 131567: |
|
if k in nodes: |
|
(next, level) = nodes[k] |
|
if filter == None or level in filter: |
|
result[level] = (k,names[k]) |
|
k = next |
|
else: |
|
# https://docs.python.org/3/library/exceptions.html#exception-hierarchy |
|
raise AssertionError("Looking up taxa %d :: ancestor %d is not in taxonomy" % (x,k)) |
|
break |
|
|
|
return result |
|
|
|
def giListLookup( giList ): |
|
#takes a list of gi and returns dictionary mapping gi to taxaID |
|
taxonomy = {} |
|
for line in open(PATH+"gi_taxid_nucl.dmp"): |
|
(gi, taxid) = line.split() |
|
if gi in giList: |
|
taxonomy[gi] = int(taxid) |
|
return taxonomy |
|
|
|
def giLookup( gid ): |
|
result = -1 |
|
''' |
|
#commands = ["grep", "-Pm", "1", '"^'+str(gid)+'\'+'t"', PATH+"gi_taxid_nucl.dmp", "|", "cut", "-f2"] |
|
commands = ["grep", "-Pm", "1", '"^'+str(gid)+'"', PATH+"gi_taxid_nucl.dmp", "|", "cut", "-f2"] |
|
|
|
print >> sys.stderr, commands |
|
#temp = "" |
|
#for item in commands: |
|
# temp += item+" " |
|
#print >> sys.stderr, temp |
|
print >> sys.stderr, " ".join(commands) |
|
|
|
proc = Popen(commands, stdout=PIPE, bufsize=1) |
|
#proc = Popen(temp, stdout=PIPE, bufsize=1) |
|
for line in iter(proc.stdout.readline, b''): |
|
result = int(line) |
|
proc.communicate() |
|
''' |
|
commands = ["grep", "-Pm", "1", '"^'+str(gid)+'"', PATH+"gi_taxid_nucl.dmp", "|", "cut", "-f2", ">", "temp"] |
|
os.system(" ".join(commands)) |
|
for line in open("temp", "r"): |
|
result = int(line) |
|
return result |
|
|
|
def getSpeciesID( taxaID ): |
|
result = 0 |
|
currID = taxaID |
|
if not taxaID in g_nodes: |
|
result = -1 |
|
else: |
|
while currID != 1 and currID != 131567: |
|
if g_nodes[currID][1] == "species": |
|
result = currID |
|
break |
|
currID = g_nodes[currID][0] |
|
return result |
|
|
|
def getGID( text ): |
|
splits = text[1:].split("|") |
|
for k in range(len(splits)-1): |
|
if re.search("gi",splits[k]): |
|
return splits[k+1] |
|
return "-1" |
|
|
|
if __name__== "__main__": |
|
query = 4932 |
|
queryName = "Saccharomyces cerevisiae" |
|
|
|
names = buildNames() |
|
(nodes, levels) = buildNodes() |
|
|
|
taxonomy = buildLineage(query, names, nodes) |
|
|
|
result = str(query)+"\t"+queryName+"|" |
|
|
|
for level in LINNAEUS_FILTER: |
|
if level in taxonomy: |
|
result += "\t"+str(taxonomy[level][0])+"\t"+taxonomy[level][1]+"|" |
|
else: |
|
result += "\tnone\tnone" |
|
|
|
print result |
Assuming the following still works: Automatically Getting The Ncbi Taxonomy Id From The Genbank Identifier
You could also use
taxonomy.sh
from BBMap. You will first need to create a local copy of the tree usingtaxtree.sh
andnames.dump
file from NCBI FTP.