Question

Deleted:how to slice out required gene sequence from multiple sequences at once?

0

Entering edit mode

2.7 years ago

iqrajaved323 • 0

I want to slice out the SRK gene from multiple sequences that I have collected together in one Fasta file. I know the starting and ending position of the gene in all the sequences. I have used the primers that tells the position. I need to extract the region with less than 1200bp length. for example, I have two primers SLGF (forward primer) and PSB (reverse primer) that gives the information about starting and ending position of my required gene in every file individually. if I want to cut the sequence from the file 3P02 than I take the start position at 3464 (SLGF attached at this location) and end position at 4475 because ( PSB is attached at this location in the same file). but if I want to cut all the required genes having length less than 1200bp from all the files simultaneously rather than one by one how i can do that I am trying to slicing individually and it is working also but it is time consuming. I want a script that slice the SRK gene from all the sequences at once. can any one help me out in this? I only need to use two primers SLGF and PSB.

import os

from Bio import SearchIO

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

### Test sequence:
fasta = "/Users/Lenovo/Desktop/all_concat.fa" # concatenated all capsella slocus sequences with: 'cat *.fas > all_concat.fa'

### Primer sequences I tested
primers = {
    "SLGF" : "AGAACCTATGCATGGGTTGC",
    "SLGR" : "ATCTGACATAAAGATCTTGACC",
    "PSA" : "AGAACACTTGTATCTCCCGGT",
    "PSB" : "CAATCTGACATAAAGATCTTG",
    "SRK2F" : "ACGGGTGTGTGTATTTGGACTGGA",
    "SRK4R" : "TTTCGGTGGCTTTGACAACAG",
    "SRK5R" : "ACAGCTTCTAACTCTATCAATGGA",
    "SRK_G23957" : "CTTCGAGCTTGGTTTCTTCA", 
    "PUB8_G10720" : "TTAATCTCCACTCTCATCTCTC",
    "PUB8_G10721" : "CTGATTCCTTTCTCTCCCTATC",
    "SRK1A" : "TTTCAACAACAACTCACAAGA ",
    "SRK3B" : "GTATAAACTCGAAATGCCTAG",
    "SRK6B" : "CTTACCCGAGTTGTAATCCTAC",
    "Aly137" : "GTGACTTCCCAAGAAAATTG",
    "SRK9A" : "GAGGTCACTGACACGTTCCTG",
    "SRK12A" : "CCAGATATGCGACAATGGACC",
    "SRK14B" : "CAGAGATGCAAAGATGGAAAG",
    "SRK16A" : "TTGGTTTGTTCACCATCTTGG",
    "SRK20A" : "AACAATGGGAATACATGGTCTC",
    "SRK22A" : "ACACGTTCTTAATGACCAACAAG",
    "SLGR" : "ATCTGACATAAAGATCTTGACC",
    "AP8F" : "ACGGGAACTCTCAAAATATCCG",
    "SRK5R" : "ACAGCTTCTAACTCTATCAATGGA",
}

primer_lengths = {k:len(v) for k, v in primers.items()}

records = SeqIO.parse(open("/Users/Lenovo/Desktop/all_concat.fa", "r"), "fasta")

SRK_sequences = {}
SRK_positions = {
    "Cgr-B-H6-5I19_corrected_ARK3-PUB8":[(178, 789)],
    "Cgr-B-H7-9M04_corrected_ARK3-PUB8":[(8896, 10080)],
    "Cgr-B-H6-3P02_corrected_ARK3-PUB8":[(3219, 4511), (5764, 6567)],
    "Cgr-B-H5-19I12_corrected_ARK3-PUB8":[(4100, 5254), (3, 743)],
    "Cgr-B-H5-14J22_corrected_ARK3-PUB8":[(4498, 5451)],
    "Cgr-B-H4-6I24_corrected_ARK3-PUB8":[(22298, 23251)],
    "Cgr-B-H4-1G22_corrected_ARK3-PUB8":[(4101, 5255), (184, 744)],
    "Cgr-B-H3-15B12_corrected_ARK3-PUB8":[(14326, 15480)],
    "Cgr-B-H3-7N24_corrected_ARK3-PUB8":[(22040, 22993)],
    "Cgr-B-H2-12I07_corrected_ARK3-PUB8":[(14340, 15494)],
    "Cgr-B-H2-1H03_corrected_ARK3-PUB8":[(4422, 5375)],
} # based on BLASTx

for r in records :
    try :
        tup = SRK_positions[r.id]
        for pos in tup :
            SRK_sequences[r.id + "_{}-{}".format(*pos)] = r.seq[pos[0]:pos[1]]
    except :
        print(r.id + " was not found!")

# put detected SRK sequences in fasta file
sequences = [SeqRecord(seq=srk, id=name, description="{}".format(name)) for name, srk in SRK_sequences.items()]

f = open("srk_sequences.fasta", "w")
SeqIO.write(sequences, f, "fasta")
f.close()

# put primers in fasta file
sequences = [ SeqRecord(seq=Seq(primer), id=name, description="Primer {}".format(name)) for name, primer in primers.items() ]

f = open("primers.fasta", "w")
SeqIO.write(sequences, f, "fasta")
f.close()

### Perform commands to establish fasta file as a BLAST'able DB

dbname = os.path.splitext(os.path.basename(fasta))[0] + "_db"
command = "makeblastdb -in {} -dbtype nucl -out {} -hash_index".format(fasta, dbname)
print(command)
os.system(command)

### BLAST primer sequence against 

query = "primers.fasta" # query is our primers
dbname = os.path.splitext(os.path.basename(fasta))[0] + "_db"
###Best to put the output somewhere specific.
blast_output = os.path.splitext(os.path.basename(fasta))[0] + "_primers.out"

command = "blastn -query {} -task blastn-short -db {} -out {} -outfmt 5 -num_alignments 10".format(query, dbname, blast_output)
print(command)
os.system(command)

output = str(blast_output)
print(output)

blast_results = SearchIO.parse(output, "blast-xml")
for res in blast_results :
    #print(res.hits)
    #print(dir(res.hits[0][0]))
    for hit in res.hits :
        lengths = []
        hits = []
        for hsp in hit :
            #print(hsp)
            lengths.append(int(hsp.query_span))
            hits.append(hsp)

        sorted_hits = [x for _, x in sorted(zip(lengths, hits), key=lambda pair: pair[0], reverse=True)]
        #print(sorted_hits)
        #print(dir(sorted_hits[0]))
        #print(sorted_hits[0])
        for el in sorted_hits[:3] :
            print(el.query_id, el.hit.id[:13], str(el.query_span) + "/" + str(primer_lengths[el.query.id]), el.hit_range, el.evalue)
        print("---------------------------------------------")

here is the out put

SLGF Cgr-B-H6-3P02 20/20 (3464, 3484) 0.000497564
SLGF Cgr-B-H6-3P02 10/20 (9161, 9171) 1.89381
---------------------------------------------
SLGF Cgr-B-H7-9M04 14/20 (9949, 9963) 1.89381
SLGF Cgr-B-H7-9M04 10/20 (17529, 17539) 1.89381
SLGF Cgr-B-H7-9M04 9/20 (6392, 6401) 7.48317
---------------------------------------------
SLGF Cgr-B-H6-5I19 9/20 (2290, 2299) 7.48317
SLGF Cgr-B-H6-5I19 9/20 (9057, 9066) 7.48317
---------------------------------------------
SLGF Cgr-B-H5-14J2 9/20 (16070, 16079) 7.48317
---------------------------------------------
SLGF Cgr-B-H4-6I24 9/20 (11665, 11674) 7.48317
---------------------------------------------
SLGF Cgr-B-H3-7N24 9/20 (11685, 11694) 7.48317
---------------------------------------------
SLGF Cgr-B-H2-1H03 9/20 (15868, 15877) 7.48317
---------------------------------------------
PSA Cgr-B-H2-1H03 17/21 (3937, 3954) 8.23148
PSA Cgr-B-H2-1H03 10/21 (3228, 3238) 2.0832
PSA Cgr-B-H2-1H03 9/21 (8467, 8476) 8.23148
---------------------------------------------
PSB Cgr-B-H5-19I1 17/21 (5218, 5235) 0.000138514
PSB Cgr-B-H5-19I1 10/21 (15122, 15132) 2.0832
PSB Cgr-B-H5-19I1 9/21 (7894, 7903) 8.23148
---------------------------------------------
PSB Cgr-B-H4-1G22 17/21 (5219, 5236) 0.000138514
PSB Cgr-B-H4-1G22 10/21 (15151, 15161) 2.0832
PSB Cgr-B-H4-1G22 9/21 (7901, 7910) 8.23148
---------------------------------------------
PSB Cgr-B-H3-15B1 17/21 (14344, 14361) 0.000138514
PSB Cgr-B-H3-15B1 10/21 (4399, 4409) 2.0832
PSB Cgr-B-H3-15B1 9/21 (7260, 7269) 8.23148
---------------------------------------------
PSB Cgr-B-H2-12I0 17/21 (14358, 14375) 0.000138514
PSB Cgr-B-H2-12I0 10/21 (4401, 4411) 2.0832
PSB Cgr-B-H2-12I0 9/21 (7264, 7273) 8.23148
---------------------------------------------
PSB Cgr-B-H7-9M04 14/21 (8923, 8937) 0.00854551
PSB Cgr-B-H7-9M04 13/21 (12931, 12944) 8.23148
PSB Cgr-B-H7-9M04 9/21 (8317, 8326) 8.23148
---------------------------------------------
PSB Cgr-B-H6-3P02 18/21 (4475, 4493) 0.00854551
---------------------------------------------
PSB Cgr-B-H6-5I19 17/21 (17170, 17187) 8.23148
PSB Cgr-B-H6-5I19 13/21 (7245, 7258) 8.23148
PSB Cgr-B-H6-5I19 10/21 (10116, 10126) 2.0832
---------------------------------------------
PSB Cgr-B-H5-14J2 9/21 (17437, 17446) 8.23148
PSB Cgr-B-H5-14J2 9/21 (17438, 17447) 8.23148
PSB Cgr-B-H5-14J2 9/21 (21886, 21895) 8.23148
---------------------------------------------
PSB Cgr-B-H4-6I24 9/21 (5857, 5866) 8.23148
PSB Cgr-B-H4-6I24 9/21 (10301, 10310) 8.23148
PSB Cgr-B-H4-6I24 9/21 (10302, 10311) 8.23148
---------------------------------------------
PSB Cgr-B-H3-7N24 9/21 (5860, 5869) 8.23148
PSB Cgr-B-H3-7N24 9/21 (10330, 10339) 8.23148
PSB Cgr-B-H3-7N24 9/21 (10331, 10340) 8.23148
---------------------------------------------

I am trying to do like this but need a script that slice the required gene from all 11 sequences at once.

concat_fasta = "/Users/Lenovo/Desktop/all_concat.fa"  
with open(concat_fasta, "r") as concat_in:  
    lines = concat_in.readlines()  
    for i in range(0, len(lines)):  
        if "3P02" in lines[i]:  
            SRK = lines[i +1]  
print(SRK[3464:4475])

python • 572 views

ADD COMMENT • link updated 2.7 years ago by Ram 43k • written 2.7 years ago by iqrajaved323 • 0