|
#wdecoster |
|
|
|
import sys, os |
|
from Bio import SeqIO |
|
|
|
def createDict(tsvfile): |
|
'''Input is a tsv file and returned is a dictionary with key=chromosome and value=list of tuples with positions''' |
|
if os.path.isfile(tsvfile): |
|
chrdict = { 'chr1' : [], 'chr2' : [], 'chr3' : [], 'chr4' : [], 'chr5' : [], 'chr6' : [], |
|
'chr7' : [], 'chr8' : [], 'chr9' : [], 'chr10' : [],'chr11' : [], 'chr12' : [], |
|
'chr13' : [], 'chr14' : [], 'chr15' : [], 'chr16' : [], 'chr17' : [], 'chr18' : [], |
|
'chr19' :[], 'chr20' : [], 'chr21' : [], 'chr22' : [], 'chrX' : [], 'chrY' : [], 'chrM' : []} |
|
for key in chrdict.keys(): |
|
if not os.path.isfile(dbdir + key + '.fa'): |
|
sys.exit("Couldn't find fasta file for {}, expected {} in {}".format(dictkey, dictkey + '.fa', dbdir)) |
|
with open(tsvfile, 'r') as input: |
|
for line in input.readlines(): |
|
linelist = line.strip().split('\t') |
|
try: |
|
chrdict['chr' + linelist[0]].append((int(linelist[1]), int(linelist[2]))) |
|
except KeyError: |
|
try: |
|
chrdict[linelist[0]].append((int(linelist[1]), int(linelist[2]))) |
|
except KeyError: |
|
if line.startswith('#'): |
|
continue |
|
if line.startswith('chromosome\tbegin\tend\ttype\tref'): |
|
continue |
|
else: |
|
sys.exit("Unexpected input line: {}".format(' '.join(linelist[:4]))) |
|
except ValueError: |
|
sys.exit("Unexpected input line: {}".format('\t'.join(linelist[:4]))) |
|
for key, value in chrdict.items(): |
|
if len(value) == 0: |
|
del chrdict[key] |
|
return(chrdict) |
|
else: |
|
sys.exit("The path to {} is incorrect or the file doesn't exist".format(tsvfile)) |
|
|
|
def natural_sort(l): |
|
import re |
|
'''Performs natural sorting of the input list, suitable for chromosomes''' |
|
convert = lambda text: int(text) if text.isdigit() else text.lower() |
|
alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] |
|
return sorted(l, key = alphanum_key) |
|
|
|
def addrefnucl(dictkey): |
|
fasta = SeqIO.read(dbdir + dictkey + '.fa', "fasta") |
|
for position in tsvdict[dictkey]: |
|
referencenucleotide = fasta.seq[position[0]:position[1]].upper() |
|
print("{}".format('\t'.join([dictkey, str(position[0]), str(position[1]), str(referencenucleotide) + '/' + str(referencenucleotide)]))) |
|
|
|
dbdir = sys.argv[2] |
|
tsvdict = createDict(sys.argv[1]) |
|
|
|
for key in natural_sort(tsvdict.keys()): |
|
addrefnucl(key) |
Do you know a bit of Python? I have a script which almost does what you want.
Thank you for reply. I am scripting more in bash and awk.. But maybe I would understand..
How big is your file? Wondering whether to do everything in memory or stream.
It could be about 500000 rows and about 20 MB.