Question

Python script for sequence alignment not running

0

Entering edit mode

4.1 years ago

er.doug.ragnar ▴ 20

def read_Fasta(filename):
    from re import sub, search
    res = []
    sequence = None
    info = None
    fh = open(filename)
    for line in fh:
        if search(">.*", line):
            if sequence is not None and info is not None and sequence != "":
                res.append(sequence)
            info = line
            sequence = ""
        else:
            if sequence is None:
                return None
            else:
                sequence += sub("\s", "", line)
    if sequence is not None and info is not None and sequence != "":
        res.append(sequence)
    fh.close()
    return res

def read_submat_file(filename):
    sm = {}
    f = open(filename, "r")
    line = f.readline()
    tokens = line.split("\t")
    ns = len(tokens)
    alphabet = []
    for i in range(0, ns):
        alphabet.append(tokens[i][0])
    for i in range(0, ns):
        line = f.readline()
        tokens = line.split("\t")
        for j in range(0, len(tokens)):
            k = alphabet[i] + alphabet[j]
            sm[k] = int(tokens[j])
    return sm

def needleman_wunsch (seq1, seq2, sm, g):
    """Global Alignment"""
    sm = read_submat_file(f)
    seq1 = read_Fasta("A.fasta")
    seq2 = read_Fasta("B.fasta")
    S = [[0]]
    T = [[0]]
    # initialize gaps in rows
    for j in range(1, len(seq2)+1):
        S[0].append(g * j)
        T[0].append(3)
    # initialize gaps in cols
    for i in range(1, len(seq1)+1):
        S.append([g * i])
        T.append([2])
    # apply the recurrence to fill the matrices
    for i in range(0, len(seq1)):
        for j in range(len(seq2)):
            s1 = S[i][j] + score_pos(seq1[i], seq2[j], sm, g)
            s2 = S[i][j+1] + g
            s3 = S[i+1][j] + g
            S[i+1].append(max(s1, s2, s3))
            T[i+1].append(max3t(s1, s2, s3))
    return (S, T)

def test_DNA_GlobalAlign():
    sm = read_submat_file("blosum62.mat")
    seq1 = read_Fasta("A.fasta")
    seq2 = read_Fasta("B.fasta")
    p = -3
    res = needleman_wunsch(seq1, seq2, sm, p)
    S = res[0]
    T = res[1]
    print("Score of optimal alignment:", S[len(seq1)][len(seq2)])
    print_mat(S)
    print_mat(T)
    alig = recover_align(T, seq1, seq2)
    print(alig[0])
    print(alig[1])

test_DNA_GlobalAlign()

I have been trying to run the python script to align 2 fasta files, it reads the files but I can't seem to use the previous functions on the subsequent functions. I get these errors.

>>> test_DNA_GlobalAlign()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<stdin>", line 6, in test_DNA_GlobalAlign
  File "<stdin>", line 20, in needleman_wunsch
  File "<stdin>", line 6, in score_pos

python alignment • 970 views

ADD COMMENT • link updated 4.1 years ago by Mensur Dlakic ★ 27k • written 4.1 years ago by er.doug.ragnar ▴ 20

score 1 · Answer 1 · 2020-04-05

1

Entering edit mode

4.1 years ago

Mensur Dlakic ★ 27k

You probably need to delete these lines from needleman_wunsch:

sm = read_submat_file(f)
seq1 = read_Fasta("A.fasta")
seq2 = read_Fasta("B.fasta")

By the way, you can achieve this task with a dozen or so lines of code in Biopython.

ADD COMMENT • link 4.1 years ago by Mensur Dlakic ★ 27k