Deleted:how to slice out required gene sequence from multiple sequences at once?
0
0
Entering edit mode
2.7 years ago

I want to slice out the SRK gene from multiple sequences that I have collected together in one Fasta file. I know the starting and ending position of the gene in all the sequences. I have used the primers that tells the position. I need to extract the region with less than 1200bp length. for example, I have two primers SLGF (forward primer) and PSB (reverse primer) that gives the information about starting and ending position of my required gene in every file individually. if I want to cut the sequence from the file 3P02 than I take the start position at 3464 (SLGF attached at this location) and end position at 4475 because ( PSB is attached at this location in the same file). but if I want to cut all the required genes having length less than 1200bp from all the files simultaneously rather than one by one how i can do that I am trying to slicing individually and it is working also but it is time consuming. I want a script that slice the SRK gene from all the sequences at once. can any one help me out in this? I only need to use two primers SLGF and PSB.

import os

from Bio import SearchIO

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

### Test sequence:
fasta = "/Users/Lenovo/Desktop/all_concat.fa" # concatenated all capsella slocus sequences with: 'cat *.fas > all_concat.fa'

### Primer sequences I tested
primers = {
    "SLGF" : "AGAACCTATGCATGGGTTGC",
    "SLGR" : "ATCTGACATAAAGATCTTGACC",
    "PSA" : "AGAACACTTGTATCTCCCGGT",
    "PSB" : "CAATCTGACATAAAGATCTTG",
    "SRK2F" : "ACGGGTGTGTGTATTTGGACTGGA",
    "SRK4R" : "TTTCGGTGGCTTTGACAACAG",
    "SRK5R" : "ACAGCTTCTAACTCTATCAATGGA",
    "SRK_G23957" : "CTTCGAGCTTGGTTTCTTCA", 
    "PUB8_G10720" : "TTAATCTCCACTCTCATCTCTC",
    "PUB8_G10721" : "CTGATTCCTTTCTCTCCCTATC",
    "SRK1A" : "TTTCAACAACAACTCACAAGA ",
    "SRK3B" : "GTATAAACTCGAAATGCCTAG",
    "SRK6B" : "CTTACCCGAGTTGTAATCCTAC",
    "Aly137" : "GTGACTTCCCAAGAAAATTG",
    "SRK9A" : "GAGGTCACTGACACGTTCCTG",
    "SRK12A" : "CCAGATATGCGACAATGGACC",
    "SRK14B" : "CAGAGATGCAAAGATGGAAAG",
    "SRK16A" : "TTGGTTTGTTCACCATCTTGG",
    "SRK20A" : "AACAATGGGAATACATGGTCTC",
    "SRK22A" : "ACACGTTCTTAATGACCAACAAG",
    "SLGR" : "ATCTGACATAAAGATCTTGACC",
    "AP8F" : "ACGGGAACTCTCAAAATATCCG",
    "SRK5R" : "ACAGCTTCTAACTCTATCAATGGA",
}

primer_lengths = {k:len(v) for k, v in primers.items()}

records = SeqIO.parse(open("/Users/Lenovo/Desktop/all_concat.fa", "r"), "fasta")

SRK_sequences = {}
SRK_positions = {
    "Cgr-B-H6-5I19_corrected_ARK3-PUB8":[(178, 789)],
    "Cgr-B-H7-9M04_corrected_ARK3-PUB8":[(8896, 10080)],
    "Cgr-B-H6-3P02_corrected_ARK3-PUB8":[(3219, 4511), (5764, 6567)],
    "Cgr-B-H5-19I12_corrected_ARK3-PUB8":[(4100, 5254), (3, 743)],
    "Cgr-B-H5-14J22_corrected_ARK3-PUB8":[(4498, 5451)],
    "Cgr-B-H4-6I24_corrected_ARK3-PUB8":[(22298, 23251)],
    "Cgr-B-H4-1G22_corrected_ARK3-PUB8":[(4101, 5255), (184, 744)],
    "Cgr-B-H3-15B12_corrected_ARK3-PUB8":[(14326, 15480)],
    "Cgr-B-H3-7N24_corrected_ARK3-PUB8":[(22040, 22993)],
    "Cgr-B-H2-12I07_corrected_ARK3-PUB8":[(14340, 15494)],
    "Cgr-B-H2-1H03_corrected_ARK3-PUB8":[(4422, 5375)],
} # based on BLASTx

for r in records :
    try :
        tup = SRK_positions[r.id]
        for pos in tup :
            SRK_sequences[r.id + "_{}-{}".format(*pos)] = r.seq[pos[0]:pos[1]]
    except :
        print(r.id + " was not found!")

# put detected SRK sequences in fasta file
sequences = [SeqRecord(seq=srk, id=name, description="{}".format(name)) for name, srk in SRK_sequences.items()]

f = open("srk_sequences.fasta", "w")
SeqIO.write(sequences, f, "fasta")
f.close()

# put primers in fasta file
sequences = [ SeqRecord(seq=Seq(primer), id=name, description="Primer {}".format(name)) for name, primer in primers.items() ]

f = open("primers.fasta", "w")
SeqIO.write(sequences, f, "fasta")
f.close()

### Perform commands to establish fasta file as a BLAST'able DB

dbname = os.path.splitext(os.path.basename(fasta))[0] + "_db"
command = "makeblastdb -in {} -dbtype nucl -out {} -hash_index".format(fasta, dbname)
print(command)
os.system(command)

### BLAST primer sequence against 

query = "primers.fasta" # query is our primers
dbname = os.path.splitext(os.path.basename(fasta))[0] + "_db"
###Best to put the output somewhere specific.
blast_output = os.path.splitext(os.path.basename(fasta))[0] + "_primers.out"

command = "blastn -query {} -task blastn-short -db {} -out {} -outfmt 5 -num_alignments 10".format(query, dbname, blast_output)
print(command)
os.system(command)

output = str(blast_output)
print(output)

blast_results = SearchIO.parse(output, "blast-xml")
for res in blast_results :
    #print(res.hits)
    #print(dir(res.hits[0][0]))
    for hit in res.hits :
        lengths = []
        hits = []
        for hsp in hit :
            #print(hsp)
            lengths.append(int(hsp.query_span))
            hits.append(hsp)

        sorted_hits = [x for _, x in sorted(zip(lengths, hits), key=lambda pair: pair[0], reverse=True)]
        #print(sorted_hits)
        #print(dir(sorted_hits[0]))
        #print(sorted_hits[0])
        for el in sorted_hits[:3] :
            print(el.query_id, el.hit.id[:13], str(el.query_span) + "/" + str(primer_lengths[el.query.id]), el.hit_range, el.evalue)
        print("---------------------------------------------")

here is the out put

SLGF Cgr-B-H6-3P02 20/20 (3464, 3484) 0.000497564
SLGF Cgr-B-H6-3P02 10/20 (9161, 9171) 1.89381
---------------------------------------------
SLGF Cgr-B-H7-9M04 14/20 (9949, 9963) 1.89381
SLGF Cgr-B-H7-9M04 10/20 (17529, 17539) 1.89381
SLGF Cgr-B-H7-9M04 9/20 (6392, 6401) 7.48317
---------------------------------------------
SLGF Cgr-B-H6-5I19 9/20 (2290, 2299) 7.48317
SLGF Cgr-B-H6-5I19 9/20 (9057, 9066) 7.48317
---------------------------------------------
SLGF Cgr-B-H5-14J2 9/20 (16070, 16079) 7.48317
---------------------------------------------
SLGF Cgr-B-H4-6I24 9/20 (11665, 11674) 7.48317
---------------------------------------------
SLGF Cgr-B-H3-7N24 9/20 (11685, 11694) 7.48317
---------------------------------------------
SLGF Cgr-B-H2-1H03 9/20 (15868, 15877) 7.48317
---------------------------------------------
PSA Cgr-B-H2-1H03 17/21 (3937, 3954) 8.23148
PSA Cgr-B-H2-1H03 10/21 (3228, 3238) 2.0832
PSA Cgr-B-H2-1H03 9/21 (8467, 8476) 8.23148
---------------------------------------------
PSB Cgr-B-H5-19I1 17/21 (5218, 5235) 0.000138514
PSB Cgr-B-H5-19I1 10/21 (15122, 15132) 2.0832
PSB Cgr-B-H5-19I1 9/21 (7894, 7903) 8.23148
---------------------------------------------
PSB Cgr-B-H4-1G22 17/21 (5219, 5236) 0.000138514
PSB Cgr-B-H4-1G22 10/21 (15151, 15161) 2.0832
PSB Cgr-B-H4-1G22 9/21 (7901, 7910) 8.23148
---------------------------------------------
PSB Cgr-B-H3-15B1 17/21 (14344, 14361) 0.000138514
PSB Cgr-B-H3-15B1 10/21 (4399, 4409) 2.0832
PSB Cgr-B-H3-15B1 9/21 (7260, 7269) 8.23148
---------------------------------------------
PSB Cgr-B-H2-12I0 17/21 (14358, 14375) 0.000138514
PSB Cgr-B-H2-12I0 10/21 (4401, 4411) 2.0832
PSB Cgr-B-H2-12I0 9/21 (7264, 7273) 8.23148
---------------------------------------------
PSB Cgr-B-H7-9M04 14/21 (8923, 8937) 0.00854551
PSB Cgr-B-H7-9M04 13/21 (12931, 12944) 8.23148
PSB Cgr-B-H7-9M04 9/21 (8317, 8326) 8.23148
---------------------------------------------
PSB Cgr-B-H6-3P02 18/21 (4475, 4493) 0.00854551
---------------------------------------------
PSB Cgr-B-H6-5I19 17/21 (17170, 17187) 8.23148
PSB Cgr-B-H6-5I19 13/21 (7245, 7258) 8.23148
PSB Cgr-B-H6-5I19 10/21 (10116, 10126) 2.0832
---------------------------------------------
PSB Cgr-B-H5-14J2 9/21 (17437, 17446) 8.23148
PSB Cgr-B-H5-14J2 9/21 (17438, 17447) 8.23148
PSB Cgr-B-H5-14J2 9/21 (21886, 21895) 8.23148
---------------------------------------------
PSB Cgr-B-H4-6I24 9/21 (5857, 5866) 8.23148
PSB Cgr-B-H4-6I24 9/21 (10301, 10310) 8.23148
PSB Cgr-B-H4-6I24 9/21 (10302, 10311) 8.23148
---------------------------------------------
PSB Cgr-B-H3-7N24 9/21 (5860, 5869) 8.23148
PSB Cgr-B-H3-7N24 9/21 (10330, 10339) 8.23148
PSB Cgr-B-H3-7N24 9/21 (10331, 10340) 8.23148
---------------------------------------------

I am trying to do like this but need a script that slice the required gene from all 11 sequences at once.

concat_fasta = "/Users/Lenovo/Desktop/all_concat.fa"  
with open(concat_fasta, "r") as concat_in:  
    lines = concat_in.readlines()  
    for i in range(0, len(lines)):  
        if "3P02" in lines[i]:  
            SRK = lines[i +1]  
print(SRK[3464:4475])  
python • 572 views
ADD COMMENT
This thread is not open. No new answers may be added
Traffic: 2275 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6