Question

Rna Splicing Python Script

0

Entering edit mode

10.0 years ago

gnovikov87 • 0

Below you can find the simple script for the introns detection along the given DNA sequence

from re import match

f= open('data.txt', 'r').readlines()[1::2]
u = [element[:-1] for element in f]
chain=u[0]
exons=[]

#detect introns in chains
for I in u:
    for j in range(len(chain)):
        if i==chain: #pass sequence itself
            pass
        else:
            if match(i, chain[j::]):
                print "Intron %s has been detected in %d position" % (i, j)
                #Statement for place selection consisted of chain without current intron to the exons list

where the input data.txt consists of chain in upper instance and intron sub-strings as the further instances

>Rosalind_10 # chain
ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG
>Rosalind_12 #intron
ATCGGTCGAA
>Rosalind_15 #intron
ATCGGTCGAGCGTGT

Now I'm looking for the best definition of the slice statement which will select exon sequence only (chain with deleted intron in found position) in the each loop and place it to the exon lists. How It could be done easily ?

Thanks for help,
Gleb

python • 7.8k views

ADD COMMENT • link updated 13 months ago by Ram 43k • written 10.0 years ago by gnovikov87 • 0

score 0 · Answer 1 · 2014-04-12

This uses biopython to make things a bit easier, but you can see how the general principals apply to what you already have:

#!/usr/bin/env python
from Bio import SeqIO

handle = open("data.txt", "r")
lines = list(SeqIO.parse(handle, "fasta"))
chain = lines[0]
lines = lines[1:]
last_start=0
exons = []
for line in lines :
    start = chain.seq.find(line.seq)
    if(start > 0) :
        print("Intron %s has been detected in %i position" % (line.seq, start+1))
        exons.append([chain.seq[last_start:start]])
        last_start = start + len(line.seq)
exons.append([chain.seq[last_start:]])
print(exons)

UPDATE: The above version assumes that your introns are ordered, since that's how things were in your original example. Since you apparently also have cases where that's not true, the idea would be to just put the introns in an array, sort that, and then iterate through as in the example above. BTW, there's no need to write a function to translate things.

Here's one possible implementation:

#!/usr/bin/env python
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

handle = open("data.txt", "r")
lines = list(SeqIO.parse(handle, "fasta"))
chain = lines[0]
lines = lines[1:]

#Determine where the introns are
introns = []
for line in lines :
    start = chain.seq.find(line.seq)
    if(start > 0) :
        introns.append([start,start + len(line.seq)-1])
        print("Intron %s has been detected in %i position" % (line.seq, start+1))
introns.append([start,start + len(line.seq)-1])
introns.sort()

last_start=0
ORF = Seq("",generic_dna)
#Add the exons to the ORF
for intron in introns :
    ORF += chain[last_start:intron[0]]
    last_start = intron[1]+1
ORF += chain[last_start:]
print(ORF.seq.translate())

score 0 · Answer 2 · 2014-04-13

Many thanks for suggestions!

Today I've tried to make script for the conversion of the spliced transcripts to the protein sequence using above bio pythone code. Here I passed def function for the conversion of the RNA to protein (It works good).

This script works good for short sequences provided in first example but made wrong results for the long chain with many introns like in the below example. Please keep in mind that the "exons_seq" variable which consist of list of only sequences of every exons also include some empty elements which might be source of error.

from Bio import SeqIO



def translate_dna(sequence):
    gencode = { 'ATA':'I',
    'ATC':'I', 'ATT':'I', 'ATG':'M', 'ACA':'T', 'ACC':'T', 'ACG':'T',
    'ACT':'T', 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 'AGC':'S',
    'AGT':'S', 'AGA':'R', 'AGG':'R', 'CTA':'L', 'CTC':'L', 'CTG':'L',
    'CTT':'L', 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAC':'H',
    'CAT':'H', 'CAA':'Q', 'CAG':'Q', 'CGA':'R', 'CGC':'R', 'CGG':'R',
    'CGT':'R', 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'GCA':'A',
    'GCC':'A', 'GCG':'A', 'GCT':'A', 'GAC':'D', 'GAT':'D', 'GAA':'E',
    'GAG':'E', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'TCA':'S',
    'TCC':'S', 'TCG':'S', 'TCT':'S', 'TTC':'F', 'TTT':'F', 'TTA':'L',
    'TTG':'L', 'TAC':'Y', 'TAT':'Y', 'TAA':'', 'TAG':'', 'TGC':'C',
    'TGT':'C', 'TGA':'', 'TGG':'W', }
    #print sequence
    proteinseq = ''

    for n in range(0,len(sequence),3):
        if gencode.has_key(sequence[n:n+3]) == True:
            proteinseq += gencode[sequence[n:n+3]]
    print proteinseq


#slicing using biopython- for testing
handle = open("data.txt", "r")
lines = list(SeqIO.parse(handle, "fasta"))

chain = lines[0]
lines = lines[1:]
last_start=0
exons = []
for line in lines :
    start = chain.seq.find(line.seq)
    if(start > 0) :
        print("Intron %s has been detected in %i position" % (line.seq, start+1))
        exons.append([chain.seq[last_start:start]])
        last_start = start + len(line.seq)
exons.append([chain.seq[last_start:]])

otvet2=""
exons_seq = [' '.join([c.tostring() for c in lst]) for lst in exons]
xz="".join([otvet2+s for s in exons_seq])

translate_dna(xz)

here you can see real data.txt

>Rosalind_5942
ATGCGAGGTGTCGGTAACCCAGTCCGATGAAGGAGGAGCCGATAGATCTTCCCCGGGATT
ACATAAGGTTCTCTCCGGGACCATTCAAGTCCGCATTAGCGCGAGGCGGCTTACAACGTC
AAAGAATTGGTTACTTCGCTTTGGAAGCAATTCTGTTGCATGTGCCCCTGGTAATTAGGA
AAGTCTCTGAGGCCGGGCAATGCGACCAGCGCGATGATGGAAGATCATCCACCACTCTTG
GGATTTCAGAAGGATCAAGGTAACCGGGTTATGCGTTTATGTGTTGGTAATGCGTGTAAC
CGTCGGCCATGGCTCTAACATTGAAGAGGAATAGGGCTAGGCTGGCCACACTAGCTGTCG
AAAGTATTGTTGCTCTTGTCGGTCCCTGATAAGCCTCCATTCGATGGATAGTGCGACGTG
TTGGCAGCAACTACTCACCTAACTATCTACCTTGACGAGATAGATGTTTTTTGTTCTCCC
GCATCCTACAGGCGCACTTTGGTACGCAGTCTCAGCACTATTGTCCACAGCAGTGTAGTA
TATTGCCATGCAAAAGGTTCATGTCGATAGAGTCTATGTTAGGTCTGACAACCTGCCAAT
GTGGAGATTCTTTGCTTCTACGGCGATAAGTGCCAAATAGTTGGGCTAAACTGGCTTAGT
TTAGCCGAAGGATTGTCTGAGCTTTCTACTCGGTCAAAGGCGATGGGACCGTTCCCTTTG
ACGTCCGGTCCACTAGACAGAACCACGATGCCTACAACCCCTTGAGTACAATCAGCACAT
GATTCTGTCTTCTCCGCAGTTGGCTTTTAACAATCTTAGTAGTCCTGGAAGGCAATATTG
TATGGAGTCTTTAACGAGCGGGATGGCTCCTTGCTTCTTCTTGACGTCGAGTGTTACGTT
TGTTGTAGACTTGTTGCGAGTTGGGAATTCAATAGGGTACGTATATAG
>Rosalind_2822
TCCTACAGGCGCACTTTGGT
>Rosalind_8608
CTAAACTGGCTTAGTTTAGCCGAAGGAT
>Rosalind_5876
GAAGGCAATATTGTATGGAGTCTTT
>Rosalind_2322
TCCACCACTCTTGGGATTTCAGAAG
>Rosalind_9365
ACAACGTCAAAGAATTG
>Rosalind_9254
TGATTCTGTCTTCTC
>Rosalind_6250
TTCATGTCGATAGAGTCTATGTTAGGTCTGACAACCTGCCAATGTG
>Rosalind_4863
CGATGGGACCGTTCCCTTTGAC
>Rosalind_2573
TGGTAATTAGGAAAGTCTCTGAGGCCGGGCAATGC
>Rosalind_9081
TGTTGGCAGCAACTACT
>Rosalind_9138
TGGCTCTAACA
>Rosalind_8270
TTGCTTCTTCTTGACGTCGAGTGT
>Rosalind_8468
TGAAGGAGGAGCCGATAGATCTTCCCCGGGATTA
>Rosalind_9912
GTTGCTCTTGTCGGTCCCTGATAAGCCTCCAT