Question

orf finder

0

Entering edit mode

2.5 years ago

e5cf5013 • 0

How can I find out which reading frame produces the final protein? Is there a way to set all of the frames?

import re

filename = input('Enter name of file to parse: ')
sequences = []  # (description, sequence) pairs, in file order
descr = None    # header of the record being read; None until the first '>'
seq = ''        # sequence text accumulated for the current record

# Parse the multi-FASTA file named by the user.
with open(filename) as file:
    # Iterate the file directly instead of slicing the last character off each
    # readline(): slicing drops a real base when the final line has no
    # trailing newline, and a blank line would end a `while line:` loop early.
    for raw_line in file:
        line = raw_line.strip()
        if not line:
            continue  # blank lines carry no data
        if line[0] == '>':
            # Close the previous record (also keeps data that appeared
            # before any header, stored with a description of None).
            if descr is not None or seq:
                sequences.append((descr, seq))
            descr = line[1:]  # header text without the leading '>'
            seq = ''  # start a new sequence
        else:
            seq += line

# Store the last record; skipped when the file held no data at all.
if descr is not None or seq:
    sequences.append((descr, seq))


def find_all_starts(seq):
    """Return a tuple with the index of every 'atg' in seq, in ascending order.

    Matching is case-sensitive, so seq is expected to be lowercase already.
    An empty tuple is returned when seq contains no start codon.
    """
    start_codon = re.compile('atg')
    return tuple(hit.start() for hit in start_codon.finditer(seq))

# NOTE(review): the returned tuple is discarded, so this call has no effect on
# the rest of the script. `seq` is the last record read from the input file
# and has not been lower-cased here, while find_all_starts matches 'atg' only.
find_all_starts(seq)


def find_first_in_register_stop(seq):
    """
    Locate the first stop codon (taa, tag or tga) of a lowercase seq that
    lies in the reading frame beginning at index 0.

    Returns the index just past that stop codon (always a multiple of
    three), or -1 when the frame contains no stop codon.
    """
    # A codon is in register exactly when its end index is divisible by three.
    in_frame_ends = (
        hit.end()
        for hit in re.finditer('(taa|tag|tga)', seq)
        if hit.end() % 3 == 0
    )

    # First in-frame hit wins; -1 signals that none was found.
    return next(in_frame_ends, -1)
# NOTE(review): the return value is discarded, so this call has no effect on
# the rest of the script. `seq` has not been lower-cased at this point, while
# the function only matches lowercase stop codons.
find_first_in_register_stop(seq)



def all_orfs(seq):
    """Return every ORF of seq as (start, stop) index pairs, longest first.

    The sequence is lower-cased before searching. `start` is the index of an
    'atg' and `stop` is the index just past the first in-register stop codon
    after it, so seq[start:stop] includes the stop codon. When several start
    codons reach the same stop, only the earliest (longest) one is reported.
    ORFs of equal length are ordered by descending start index.
    """
    lowered = seq.lower()

    # Start codons arrive in ascending order, so the first start seen for a
    # given stop belongs to the longest ORF ending there; later starts that
    # reach the same stop are nested inside it and are skipped.
    longest_by_stop = {}
    for begin in find_all_starts(lowered):
        length = find_first_in_register_stop(lowered[begin:])
        if length == -1:
            continue  # no stop codon in this frame after this start

        end = begin + length
        if end not in longest_by_stop:
            longest_by_stop[end] = (length, begin, end)

    # Sort on (length, start, stop) descending, then drop the length field.
    ranked = sorted(longest_by_stop.values(), reverse=True)
    return tuple((begin, end) for _, begin, end in ranked)

# NOTE(review): the returned tuple of ORFs is discarded; longest_orf() below
# calls all_orfs() again on its own.
all_orfs(seq)



def longest_orf(seq):
    """Return the longest ORF found in seq, or '' when there is none.

    The result is sliced from seq exactly as passed in, so its letter case
    is unchanged even though the ORF search itself is case-insensitive.
    """
    ranked = all_orfs(seq)
    if not ranked:
        return ''

    # all_orfs orders its result longest first.
    begin, end = ranked[0]
    return seq[begin:end]
# Longest ORF of the last record read from the input file ('' if none found).
# The slice keeps the letter case of `seq`, because all_orfs lower-cases only
# its own local copy.
final_orf = longest_orf(seq)



# Standard genetic code keyed by lowercase DNA codon; '_' marks a stop codon.
_CODON_TABLE = {
    'ata': 'I', 'atc': 'I', 'att': 'I', 'atg': 'M',
    'aca': 'T', 'acc': 'T', 'acg': 'T', 'act': 'T',
    'aac': 'N', 'aat': 'N', 'aaa': 'K', 'aag': 'K',
    'agc': 'S', 'agt': 'S', 'aga': 'R', 'agg': 'R',
    'cta': 'L', 'ctc': 'L', 'ctg': 'L', 'ctt': 'L',
    'cca': 'P', 'ccc': 'P', 'ccg': 'P', 'cct': 'P',
    'cac': 'H', 'cat': 'H', 'caa': 'Q', 'cag': 'Q',
    'cga': 'R', 'cgc': 'R', 'cgg': 'R', 'cgt': 'R',
    'gta': 'V', 'gtc': 'V', 'gtg': 'V', 'gtt': 'V',
    'gca': 'A', 'gcc': 'A', 'gcg': 'A', 'gct': 'A',
    'gac': 'D', 'gat': 'D', 'gaa': 'E', 'gag': 'E',
    'gga': 'G', 'ggc': 'G', 'ggg': 'G', 'ggt': 'G',
    'tca': 'S', 'tcc': 'S', 'tcg': 'S', 'tct': 'S',
    'ttc': 'F', 'ttt': 'F', 'tta': 'L', 'ttg': 'L',
    'tac': 'Y', 'tat': 'Y', 'taa': '_', 'tag': '_',
    'tgc': 'C', 'tgt': 'C', 'tga': '_', 'tgg': 'W',
}


def translate(seq: str) -> str:
    """Translate a DNA sequence into a protein string and save it as FASTA.

    The sequence is read codon by codon from index 0. Input may be upper or
    lower case. Stop codons are rendered as '_'.

    Side effect: when at least one codon was translated, the protein is
    written to 'my_protein.fasta' in the current directory under the header
    '>CLAUD', overwriting any existing file.

    Args:
        seq: DNA sequence whose length should be a multiple of three.

    Returns:
        The translated protein, or '' when seq is empty or its length is not
        a multiple of three (in which case no file is written).

    Raises:
        KeyError: if a codon contains a character other than a, c, g or t.
    """
    # The codon table is lowercase, while FASTA input (and therefore the ORF
    # sliced from it) is commonly uppercase.
    seq = seq.lower()

    if len(seq) % 3 != 0:
        return ''

    protein = ''.join(
        _CODON_TABLE[seq[i:i + 3]] for i in range(0, len(seq), 3)
    )

    # Write the file once, after translation has finished, instead of
    # reopening and rewriting it for every codon.
    if protein:
        my_protein_file = "my_protein.fasta"
        with open(my_protein_file, "w") as translated_protein_file:
            translated_protein_file.write(">CLAUD\n")
            translated_protein_file.write(protein)

    return protein
# Translate the longest ORF. The returned protein string is discarded here;
# the visible result is the 'my_protein.fasta' file that translate() writes.
translate(final_orf)

python orf frame • 608 views

ADD COMMENT • link updated 2.5 years ago by GenoMax 141k • written 2.5 years ago by e5cf5013 • 0