Overlap of Start/Stop coordinates
0
0
Entering edit mode
4.1 years ago

Hello,

I'm trying to find the overlap where neither the 5' start NOR the 3' end coordinates match with each other based on a reference file and a gene prediction file. However, my nested for loop is giving me 8000+ results, when I need only ~80. I'm not sure how to find the exact start/stop positions either that match these conditionals. My input files are a .gb file and a .glimmer file.

def main():

#initialize variables for counts
gb_count = 0
glimmer_count = 0
exact_count = 0
five_prime_count = 0 
three_prime_count = 0
no_matches_count = 0

#protein_id list
protein_id = []

#initialize lists for start/stop coordinates
reference = []
prediction = []
prediction_start = []
prediction_end = []
reference_start = []
reference_end = []
three_prime_agree = []
five_prime_agree = []




#read in GeneBank file
for line in open('.gb file'):

    line = line.rstrip()

    if "protein_id=" in line:
        pro_id = line.split("=")
        pro_id = pro_id[1].replace('"','')
        protein_id.append(pro_id)

    elif "CDS" in line:
        if "join" in line:
            continue

        elif "/translation" in line:
            continue

        elif "P" in line:
            continue

        elif "complement" in line:
            value = " ".join(line.split()).replace('CDS','').replace("(",'').replace(")",'').split("complement")
            newValue = value[1].split("..")
            ref_start = newValue[1].replace('>','').replace("' '", '').replace('<','')
            reference_start.append(ref_start)
            ref_end = newValue[0]
            reference_end.append(ref_end)
            gb_count += 1


        else:
            test = " ".join(line.split()).replace('CDS','').split("..")
            ref_start = test[0].replace(">",'').replace(" ", '')
            reference_start.append(ref_start)
            ref_end = test[1]
            reference_end.append(ref_end)
            gb_count += 1
        reference.append({'refstart': ref_start, 'refend': ref_end})

# Read in Glimmer/Prediction file
for l in open('.glimmer file'):
    l = l.rstrip()

    #get header
    if l.startswith(">"):
        l = l.split("  ")
        seq = l[1].split(" ")
        pre_start = seq[0]
        prediction_start.append(pre_start)
        pre_end = seq[1]
        prediction_end.append(pre_end)
        glimmer_count += 1
        prediction.append({'predictionStart': pre_start, 'predictionEnd': pre_end})

    #3' overlap
 for pre in prediction:
     for ref in reference:
         if (ref['refend'] == pre['predictionEnd']) and (ref['refstart'] != pre['predictionStart']):
             three_prime_count += 1
             test_pre.append("3' Agree")
             print("So far at least 3' ends match", three_prime_count)

## Neither overlap for pre in prediction: for ref in reference: if (ref['refend'] != pre['predictionEnd']) and (ref['refstart'] != pre['predictionStart']): no_matches_count += 1 print("No matches", no_matches_count)

python nested for loops • 509 views
ADD COMMENT

Login before adding your answer.

Traffic: 2727 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6