Entering edit mode
4.4 years ago
moreirasarasousa
▴
10
Hi !
I'm trying to create a code, capable of sequencing 2 sequences, globaly and localy. This is what i have so far:
from Bio import SeqIO, Seq
from Bio.SeqIO import read
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna, generic_protein
from Bio.Align import substitution_matrices
# https://www.tutorialspoint.com/biopython/index.htm <- bons tutoriais!
# http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html <- parse das sequencias
# http://biopython.org/DIST/docs/api/Bio.Seq.Seq-class.html
# http://biopython.org/DIST/docs/tutorial/Tutorial.html
class Alignment():
def __init__(self, submat, gap=-1):
self.sm = submat
self.g = gap
self.seq1 = None
self.seq2 = None
self.S = None
self.T = None
def score_pos (self, c1, c2):
if c1 == "-" or c2=="-":
return self.g
else:
return self.sm[c1,c2]
def needleman_Wunsch(self, seq1, seq2):
if (seq1.alphabet != seq2.alphabet): return None
# ^ "AttributeError: 'Seq' object has no attribute 'seq_type'
# Solution: substituí seq_type por alphabet (atributo da classe Seq)
self.S = [[0]]
self.T = [[0]]
self.seq1 = seq1
self.seq2 = seq2
for j in range(1, len(seq2) + 1):
self.S[0].append(self.g * j)
self.T[0].append(3)
for i in range(1, len(seq1) + 1):
self.S.append([self.g * i])
self.T.append([2])
for i in range(0, len(seq1)):
for j in range(len(seq2)):
s1 = self.S[i][j] + self.score_pos(seq1[i], seq2[j])
s2 = self.S[i][j + 1] + self.g
s3 = self.S[i + 1][j] + self.g
self.S[i + 1].append(max(s1, s2, s3))
self.T[i + 1].append(max3t(s1, s2, s3))
return self.S[len(seq1)][len(seq2)]
class Protein(Alignment):
def __init__(self, submat, gap):
self.prot1 = None
self.prot2 = None
super().__init__(submat, gap)
# metodos não pertencentes à classe
def max3t (v1, v2, v3):
if v1 > v2:
if v1 > v3: return 1
else: return 3
else:
if v2 > v3: return 2
else: return 3
def read_submat_file(filename):
sm = {}
f = open(filename, "r")
line = f.readline()
tokens = line.split("\t")
ns = len(tokens)
alphabet = []
for i in range(0, ns):
alphabet.append(tokens[i][0])
for i in range(0, ns):
line = f.readline()
tokens = line.split("\t")
for j in range(0, len(tokens)):
k = alphabet[i] + alphabet[j]
sm[k] = int(tokens[j])
return sm
def printMat (mat):
for i in range(0, len(mat)):
print(mat[i])
#
# config data
subMat = substitution_matrices.load("BLOSUM62") # ler matriz de subs.
#subMat = read_submat_file("blosum62.mat")
gap = -3
# sequencias a alinhar
file1 = SeqIO.read("sequence1.fasta", "fasta")
file2 = SeqIO.read("sequence2.fasta", "fasta")
seq1 = file1.seq
seq2 = file2.seq
dna_obj = Alignment(subMat, gap)
res = dna_obj.needleman_Wunsch(seq1, seq2) # res dá um numero
# o resto dá erro:
S = res[0] # IndexError: invalid index to scalar variable.
T = res[1]
print("Score of optimal alignment:", S[len(seq1)][len(seq2)])
dna_obj.print_mat(S)
dna_obj.print_mat(T)
align_obj = dna_obj.recover_align(T, seq1, seq2)
print(align_obj[0])
print(align_obj[1])
Recently i understood that, i need to add this part : return self.S[len(seq1)][len(seq2)] -> ref self.S, self.T, in order to correct and error in the substitution_matrices.load function.
But i dont know what to do next. Can anybody help me please ?!?!
Is this for an assignment or something? I assume you know Biopython already has a very capable pairwise sequence alignment module (since you're already importing matrices)?
Any reason you're reinventing the wheel without saving yourself a dependency?
Yes, it is for an assignment.
All right you created the scoring matrix with your
needleman_Wunsch
function and you get the optimal score. But following lines are not neededres
variable is not matrix just the scoreIf you want to print the matrixes it should be something like following or printMat function needs be inside Alignment class
And in the following lines are for the aligned strings but recover_align function is missing in the Alignment class you need to write that function
Thank you, i will check it out.
No problem. I updated my answer with some additional detail of your code.