I want to slice out the SRK gene from multiple sequences that I have collected together in one Fasta file. I know the starting and ending position of the gene in all the sequences. I have used the primers that tells the position. I need to extract the region with less than 1200bp length. for example, I have two primers SLGF (forward primer) and PSB (reverse primer) that gives the information about starting and ending position of my required gene in every file individually. if I want to cut the sequence from the file 3P02 than I take the start position at 3464 (SLGF attached at this location) and end position at 4475 because ( PSB is attached at this location in the same file). but if I want to cut all the required genes having length less than 1200bp from all the files simultaneously rather than one by one how i can do that I am trying to slicing individually and it is working also but it is time consuming. I want a script that slice the SRK gene from all the sequences at once. can any one help me out in this? I only need to use two primers SLGF and PSB.
import os
from Bio import SearchIO
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
### Test sequence:
fasta = "/Users/Lenovo/Desktop/all_concat.fa" # concatenated all capsella slocus sequences with: 'cat *.fas > all_concat.fa'
### Primer sequences I tested
primers = {
"SLGF" : "AGAACCTATGCATGGGTTGC",
"SLGR" : "ATCTGACATAAAGATCTTGACC",
"PSA" : "AGAACACTTGTATCTCCCGGT",
"PSB" : "CAATCTGACATAAAGATCTTG",
"SRK2F" : "ACGGGTGTGTGTATTTGGACTGGA",
"SRK4R" : "TTTCGGTGGCTTTGACAACAG",
"SRK5R" : "ACAGCTTCTAACTCTATCAATGGA",
"SRK_G23957" : "CTTCGAGCTTGGTTTCTTCA",
"PUB8_G10720" : "TTAATCTCCACTCTCATCTCTC",
"PUB8_G10721" : "CTGATTCCTTTCTCTCCCTATC",
"SRK1A" : "TTTCAACAACAACTCACAAGA ",
"SRK3B" : "GTATAAACTCGAAATGCCTAG",
"SRK6B" : "CTTACCCGAGTTGTAATCCTAC",
"Aly137" : "GTGACTTCCCAAGAAAATTG",
"SRK9A" : "GAGGTCACTGACACGTTCCTG",
"SRK12A" : "CCAGATATGCGACAATGGACC",
"SRK14B" : "CAGAGATGCAAAGATGGAAAG",
"SRK16A" : "TTGGTTTGTTCACCATCTTGG",
"SRK20A" : "AACAATGGGAATACATGGTCTC",
"SRK22A" : "ACACGTTCTTAATGACCAACAAG",
"SLGR" : "ATCTGACATAAAGATCTTGACC",
"AP8F" : "ACGGGAACTCTCAAAATATCCG",
"SRK5R" : "ACAGCTTCTAACTCTATCAATGGA",
}
primer_lengths = {k:len(v) for k, v in primers.items()}
records = SeqIO.parse(open("/Users/Lenovo/Desktop/all_concat.fa", "r"), "fasta")
SRK_sequences = {}
SRK_positions = {
"Cgr-B-H6-5I19_corrected_ARK3-PUB8":[(178, 789)],
"Cgr-B-H7-9M04_corrected_ARK3-PUB8":[(8896, 10080)],
"Cgr-B-H6-3P02_corrected_ARK3-PUB8":[(3219, 4511), (5764, 6567)],
"Cgr-B-H5-19I12_corrected_ARK3-PUB8":[(4100, 5254), (3, 743)],
"Cgr-B-H5-14J22_corrected_ARK3-PUB8":[(4498, 5451)],
"Cgr-B-H4-6I24_corrected_ARK3-PUB8":[(22298, 23251)],
"Cgr-B-H4-1G22_corrected_ARK3-PUB8":[(4101, 5255), (184, 744)],
"Cgr-B-H3-15B12_corrected_ARK3-PUB8":[(14326, 15480)],
"Cgr-B-H3-7N24_corrected_ARK3-PUB8":[(22040, 22993)],
"Cgr-B-H2-12I07_corrected_ARK3-PUB8":[(14340, 15494)],
"Cgr-B-H2-1H03_corrected_ARK3-PUB8":[(4422, 5375)],
} # based on BLASTx
for r in records :
try :
tup = SRK_positions[r.id]
for pos in tup :
SRK_sequences[r.id + "_{}-{}".format(*pos)] = r.seq[pos[0]:pos[1]]
except :
print(r.id + " was not found!")
# put detected SRK sequences in fasta file
sequences = [SeqRecord(seq=srk, id=name, description="{}".format(name)) for name, srk in SRK_sequences.items()]
f = open("srk_sequences.fasta", "w")
SeqIO.write(sequences, f, "fasta")
f.close()
# put primers in fasta file
sequences = [ SeqRecord(seq=Seq(primer), id=name, description="Primer {}".format(name)) for name, primer in primers.items() ]
f = open("primers.fasta", "w")
SeqIO.write(sequences, f, "fasta")
f.close()
### Perform commands to establish fasta file as a BLAST'able DB
dbname = os.path.splitext(os.path.basename(fasta))[0] + "_db"
command = "makeblastdb -in {} -dbtype nucl -out {} -hash_index".format(fasta, dbname)
print(command)
os.system(command)
### BLAST primer sequence against
query = "primers.fasta" # query is our primers
dbname = os.path.splitext(os.path.basename(fasta))[0] + "_db"
###Best to put the output somewhere specific.
blast_output = os.path.splitext(os.path.basename(fasta))[0] + "_primers.out"
command = "blastn -query {} -task blastn-short -db {} -out {} -outfmt 5 -num_alignments 10".format(query, dbname, blast_output)
print(command)
os.system(command)
output = str(blast_output)
print(output)
blast_results = SearchIO.parse(output, "blast-xml")
for res in blast_results :
#print(res.hits)
#print(dir(res.hits[0][0]))
for hit in res.hits :
lengths = []
hits = []
for hsp in hit :
#print(hsp)
lengths.append(int(hsp.query_span))
hits.append(hsp)
sorted_hits = [x for _, x in sorted(zip(lengths, hits), key=lambda pair: pair[0], reverse=True)]
#print(sorted_hits)
#print(dir(sorted_hits[0]))
#print(sorted_hits[0])
for el in sorted_hits[:3] :
print(el.query_id, el.hit.id[:13], str(el.query_span) + "/" + str(primer_lengths[el.query.id]), el.hit_range, el.evalue)
print("---------------------------------------------")
here is the out put
SLGF Cgr-B-H6-3P02 20/20 (3464, 3484) 0.000497564
SLGF Cgr-B-H6-3P02 10/20 (9161, 9171) 1.89381
---------------------------------------------
SLGF Cgr-B-H7-9M04 14/20 (9949, 9963) 1.89381
SLGF Cgr-B-H7-9M04 10/20 (17529, 17539) 1.89381
SLGF Cgr-B-H7-9M04 9/20 (6392, 6401) 7.48317
---------------------------------------------
SLGF Cgr-B-H6-5I19 9/20 (2290, 2299) 7.48317
SLGF Cgr-B-H6-5I19 9/20 (9057, 9066) 7.48317
---------------------------------------------
SLGF Cgr-B-H5-14J2 9/20 (16070, 16079) 7.48317
---------------------------------------------
SLGF Cgr-B-H4-6I24 9/20 (11665, 11674) 7.48317
---------------------------------------------
SLGF Cgr-B-H3-7N24 9/20 (11685, 11694) 7.48317
---------------------------------------------
SLGF Cgr-B-H2-1H03 9/20 (15868, 15877) 7.48317
---------------------------------------------
PSA Cgr-B-H2-1H03 17/21 (3937, 3954) 8.23148
PSA Cgr-B-H2-1H03 10/21 (3228, 3238) 2.0832
PSA Cgr-B-H2-1H03 9/21 (8467, 8476) 8.23148
---------------------------------------------
PSB Cgr-B-H5-19I1 17/21 (5218, 5235) 0.000138514
PSB Cgr-B-H5-19I1 10/21 (15122, 15132) 2.0832
PSB Cgr-B-H5-19I1 9/21 (7894, 7903) 8.23148
---------------------------------------------
PSB Cgr-B-H4-1G22 17/21 (5219, 5236) 0.000138514
PSB Cgr-B-H4-1G22 10/21 (15151, 15161) 2.0832
PSB Cgr-B-H4-1G22 9/21 (7901, 7910) 8.23148
---------------------------------------------
PSB Cgr-B-H3-15B1 17/21 (14344, 14361) 0.000138514
PSB Cgr-B-H3-15B1 10/21 (4399, 4409) 2.0832
PSB Cgr-B-H3-15B1 9/21 (7260, 7269) 8.23148
---------------------------------------------
PSB Cgr-B-H2-12I0 17/21 (14358, 14375) 0.000138514
PSB Cgr-B-H2-12I0 10/21 (4401, 4411) 2.0832
PSB Cgr-B-H2-12I0 9/21 (7264, 7273) 8.23148
---------------------------------------------
PSB Cgr-B-H7-9M04 14/21 (8923, 8937) 0.00854551
PSB Cgr-B-H7-9M04 13/21 (12931, 12944) 8.23148
PSB Cgr-B-H7-9M04 9/21 (8317, 8326) 8.23148
---------------------------------------------
PSB Cgr-B-H6-3P02 18/21 (4475, 4493) 0.00854551
---------------------------------------------
PSB Cgr-B-H6-5I19 17/21 (17170, 17187) 8.23148
PSB Cgr-B-H6-5I19 13/21 (7245, 7258) 8.23148
PSB Cgr-B-H6-5I19 10/21 (10116, 10126) 2.0832
---------------------------------------------
PSB Cgr-B-H5-14J2 9/21 (17437, 17446) 8.23148
PSB Cgr-B-H5-14J2 9/21 (17438, 17447) 8.23148
PSB Cgr-B-H5-14J2 9/21 (21886, 21895) 8.23148
---------------------------------------------
PSB Cgr-B-H4-6I24 9/21 (5857, 5866) 8.23148
PSB Cgr-B-H4-6I24 9/21 (10301, 10310) 8.23148
PSB Cgr-B-H4-6I24 9/21 (10302, 10311) 8.23148
---------------------------------------------
PSB Cgr-B-H3-7N24 9/21 (5860, 5869) 8.23148
PSB Cgr-B-H3-7N24 9/21 (10330, 10339) 8.23148
PSB Cgr-B-H3-7N24 9/21 (10331, 10340) 8.23148
---------------------------------------------
I am trying to do like this but need a script that slice the required gene from all 11 sequences at once.
concat_fasta = "/Users/Lenovo/Desktop/all_concat.fa"
with open(concat_fasta, "r") as concat_in:
lines = concat_in.readlines()
for i in range(0, len(lines)):
if "3P02" in lines[i]:
SRK = lines[i +1]
print(SRK[3464:4475])