orf finder
0
0
Entering edit mode
2.5 years ago
e5cf5013 • 0

How can I find out which reading frame produces the final protein? Is there a way to search all of the reading frames?

import re

filename = input('Enter name of file to parse: ')
sequences = []  # (description, sequence) pairs, in file order
descr = None    # header of the record currently being read
seq = ''        # bases collected for the current record

# Parse the multi-FASTA file named by the user.
with open(filename) as file:
    # Iterate the file directly instead of looping on readline()[:-1]:
    # a blank line no longer ends parsing early, and the last character of a
    # final line that lacks a trailing newline is no longer cut off.
    for line in file:
        line = line.rstrip('\r\n')
        if not line:
            continue  # skip blank lines between or after records
        if line[0] == '>':
            if descr is not None:  # flush the previous record, if any
                sequences.append((descr, seq))
            # NOTE(review): str() of the split list stores the header as the
            # text "['name']" rather than "name"; kept unchanged so existing
            # consumers of `sequences` see the same values.
            descr = str(line[1:].split('>'))
            seq = ''  # start a new sequence
        else:
            seq += line

# Store the last record. Nothing is stored when the file had no header line;
# in that case `seq` still holds whatever sequence text was read.
if descr is not None:
    sequences.append((descr, seq))


def find_all_starts(seq):
    """Return the positions of every 'atg' start codon in ``seq``.

    The search is case-sensitive, so ``seq`` must already be lowercase.
    Positions are 0-based offsets of the codon's first base, in ascending
    order, returned as a tuple (empty when there is no start codon).
    """
    start_codon = re.compile('atg')
    return tuple(hit.start() for hit in start_codon.finditer(seq))

# Ad-hoc call on the last sequence read from the file. The returned tuple is
# discarded. `seq` is passed exactly as read, and the start-codon pattern
# only matches lowercase 'atg'.
find_all_starts(seq)


def find_first_in_register_stop(seq):
    """
    Locate the first stop codon (taa, tag or tga) of a lowercase seq that
    lies in the same reading frame as index 0.

    Returns the index one past the codon's last base (always a multiple of
    three), or -1 when no in-frame stop codon exists.
    """
    stop_codon = re.compile('(taa|tag|tga)')

    # A codon is in register exactly when it ends on a multiple of three.
    in_frame_ends = (
        hit.end() for hit in stop_codon.finditer(seq) if hit.end() % 3 == 0
    )
    return next(in_frame_ends, -1)
# Ad-hoc call on the last sequence read from the file. The return value is
# discarded. `seq` is passed exactly as read, and the stop-codon pattern
# only matches lowercase bases.
find_first_in_register_stop(seq)



def all_orfs(seq):
    """Collect the open reading frames of ``seq``, longest first.

    The sequence is lowercased before searching, so input case does not
    matter. Each ORF is reported as a ``(start, stop)`` pair where ``start``
    is the index of the 'atg' and ``stop`` is one past the end of the first
    in-frame stop codon; ``start % 3`` is therefore the ORF's reading frame.
    When several start codons share a stop, only the first (longest) one is
    kept. Start codons without an in-frame stop are ignored.

    Returns a tuple of pairs sorted by decreasing length, with ties broken
    by larger start index first.
    """
    lowered = seq.lower()

    seen_stops = set()
    found = []  # (length, start, stop) so that sorting orders by length

    for begin in find_all_starts(lowered):
        length = find_first_in_register_stop(lowered[begin:])
        if length == -1:
            continue  # no in-frame stop after this start codon

        end = begin + length
        # Starts are visited left to right, so a repeated stop belongs to a
        # shorter ORF nested inside one that is already recorded.
        if end in seen_stops:
            continue

        seen_stops.add(end)
        found.append((length, begin, end))

    found.sort(reverse=True)
    return tuple((begin, end) for _, begin, end in found)

# Ad-hoc call on the last sequence read from the file; the returned tuple of
# (start, stop) pairs is discarded.
all_orfs(seq)



def longest_orf(seq):
    """Return the longest ORF found in ``seq``, or '' when there is none.

    The ORF is sliced out of ``seq`` itself, so the result keeps the
    caller's letter case and includes the terminating stop codon.
    """
    candidates = all_orfs(seq)
    if not candidates:
        return ''

    # all_orfs() orders its result longest first.
    begin, end = candidates[0]
    return seq[begin:end]
# Longest ORF of the last sequence read from the file ('' when none was
# found). The slice is taken from `seq` as read, so it keeps the file's
# letter case.
final_orf = longest_orf(seq)



# Standard genetic code keyed by lowercase DNA codon; '_' marks a stop codon.
_CODON_TABLE = {
    'ata': 'I', 'atc': 'I', 'att': 'I', 'atg': 'M',
    'aca': 'T', 'acc': 'T', 'acg': 'T', 'act': 'T',
    'aac': 'N', 'aat': 'N', 'aaa': 'K', 'aag': 'K',
    'agc': 'S', 'agt': 'S', 'aga': 'R', 'agg': 'R',
    'cta': 'L', 'ctc': 'L', 'ctg': 'L', 'ctt': 'L',
    'cca': 'P', 'ccc': 'P', 'ccg': 'P', 'cct': 'P',
    'cac': 'H', 'cat': 'H', 'caa': 'Q', 'cag': 'Q',
    'cga': 'R', 'cgc': 'R', 'cgg': 'R', 'cgt': 'R',
    'gta': 'V', 'gtc': 'V', 'gtg': 'V', 'gtt': 'V',
    'gca': 'A', 'gcc': 'A', 'gcg': 'A', 'gct': 'A',
    'gac': 'D', 'gat': 'D', 'gaa': 'E', 'gag': 'E',
    'gga': 'G', 'ggc': 'G', 'ggg': 'G', 'ggt': 'G',
    'tca': 'S', 'tcc': 'S', 'tcg': 'S', 'tct': 'S',
    'ttc': 'F', 'ttt': 'F', 'tta': 'L', 'ttg': 'L',
    'tac': 'Y', 'tat': 'Y', 'taa': '_', 'tag': '_',
    'tgc': 'C', 'tgt': 'C', 'tga': '_', 'tgg': 'W',
}


def translate(seq, output_file="my_protein.fasta"):
    """Translate a DNA sequence into a one-letter protein string.

    Args:
        seq: DNA sequence read from index 0 in codons of three bases.
            Upper- and lowercase bases are both accepted.
        output_file: Path of the FASTA file the protein is written to, or
            ``None`` to skip writing. Defaults to "my_protein.fasta".

    Returns:
        The protein string, with '_' for each stop codon. An empty string is
        returned, and no file is written, when ``len(seq)`` is not a
        multiple of three or ``seq`` is empty.

    Raises:
        KeyError: If a codon contains anything other than a, c, g or t.
    """
    if len(seq) % 3 != 0:
        return ""

    # longest_orf() hands back the ORF in the input's original case, while
    # the codon table is keyed in lowercase.
    seq = seq.lower()
    protein = "".join(
        _CODON_TABLE[seq[i:i + 3]] for i in range(0, len(seq), 3)
    )

    # Write the finished protein once, instead of reopening and rewriting
    # the file after every codon.
    if protein and output_file is not None:
        with open(output_file, "w") as translated_protein_file:
            translated_protein_file.write(">CLAUD\n")
            translated_protein_file.write(protein)

    return protein
# Translate the selected ORF. The returned protein string is discarded here;
# translate() itself writes the protein to "my_protein.fasta".
translate(final_orf)
python orf frame • 608 views
ADD COMMENT

Login before adding your answer.

Traffic: 2740 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6