Hello,
I'm trying to find the overlap where neither the 5' start NOR the 3' end coordinates match with each other based on a reference file and a gene prediction file. However, my nested for loop is giving me 8000+ results, when I need only ~80. I'm not sure how to find the exact start/stop positions either that match these conditionals. My input files are a .gb file and a .glimmer file.
def main():
#initialize variables for counts
gb_count = 0
glimmer_count = 0
exact_count = 0
five_prime_count = 0
three_prime_count = 0
no_matches_count = 0
#protein_id list
protein_id = []
#initialize lists for start/stop coordinates
reference = []
prediction = []
prediction_start = []
prediction_end = []
reference_start = []
reference_end = []
three_prime_agree = []
five_prime_agree = []
#read in GeneBank file
for line in open('.gb file'):
line = line.rstrip()
if "protein_id=" in line:
pro_id = line.split("=")
pro_id = pro_id[1].replace('"','')
protein_id.append(pro_id)
elif "CDS" in line:
if "join" in line:
continue
elif "/translation" in line:
continue
elif "P" in line:
continue
elif "complement" in line:
value = " ".join(line.split()).replace('CDS','').replace("(",'').replace(")",'').split("complement")
newValue = value[1].split("..")
ref_start = newValue[1].replace('>','').replace("' '", '').replace('<','')
reference_start.append(ref_start)
ref_end = newValue[0]
reference_end.append(ref_end)
gb_count += 1
else:
test = " ".join(line.split()).replace('CDS','').split("..")
ref_start = test[0].replace(">",'').replace(" ", '')
reference_start.append(ref_start)
ref_end = test[1]
reference_end.append(ref_end)
gb_count += 1
reference.append({'refstart': ref_start, 'refend': ref_end})
# Read in Glimmer/Prediction file
for l in open('.glimmer file'):
l = l.rstrip()
#get header
if l.startswith(">"):
l = l.split(" ")
seq = l[1].split(" ")
pre_start = seq[0]
prediction_start.append(pre_start)
pre_end = seq[1]
prediction_end.append(pre_end)
glimmer_count += 1
prediction.append({'predictionStart': pre_start, 'predictionEnd': pre_end})
#3' overlap
for pre in prediction:
for ref in reference:
if (ref['refend'] == pre['predictionEnd']) and (ref['refstart'] != pre['predictionStart']):
three_prime_count += 1
test_pre.append("3' Agree")
print("So far at least 3' ends match", three_prime_count)
## Neither overlap for pre in prediction: for ref in reference: if (ref['refend'] != pre['predictionEnd']) and (ref['refstart'] != pre['predictionStart']): no_matches_count += 1 print("No matches", no_matches_count)