Entropy of DNA Sequences
10.3 years ago
mary • 0

Hi, I want to calculate the entropy of three sequences in Python:

CGTAA
AATCC
CCGCA

The formula for entropy is:

Entropy(A[:][i]) = − ∑_a p_ia · log(p_ia)
p_ia = C_ia / ∑_a' C_ia'

where p_ia is the probability of letter a in column i, and C_ia is the count of letter a in column A[:][i].

Example, from the above 3 sequences: the first column takes the values C, A and C, so ∑_a' C_ia' = 3. If a = 'C', then C_ia = 2, so p_ia = 2/3;

if a = 'A', then C_ia = 1, so p_ia = 1/3.

Entropy(A[:][0]) = −(2/3 · log(2/3) + 1/3 · log(1/3))

The same goes for the rest of the columns.
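
The per-column calculation described above can be sketched in a few lines of Python (this sketch is mine, not part of the original question; it assumes log base 2, the usual convention for sequence entropy):

```python
from math import log2
from collections import Counter

seqs = ["CGTAA", "AATCC", "CCGCA"]

def column_entropy(column):
    """Shannon entropy of one alignment column (log base 2 assumed)."""
    counts = Counter(column)
    total = sum(counts.values())
    return -sum((c / total) * log2(c / total) for c in counts.values())

# zip(*seqs) transposes the sequences into columns
entropies = [column_entropy(col) for col in zip(*seqs)]
# the first column (C, A, C) gives -(2/3*log2(2/3) + 1/3*log2(1/3)) ≈ 0.918
```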

python • 7.4k views
There's no question here. If you are trying to outsource work, please forward half your monthly paycheck to me and I will email you the solution.

10.3 years ago
pld 5.1k

Here is a functional approach to the problem; it was designed to scan long sequences, measuring the entropy with an n-length sliding window:

import sys
import numpy as np
import inspect
import itertools
##deal with X11 default issue 
#import matplotlib
#matplotlib.use('Agg')
from pylab import *
from matplotlib import pyplot as plt

#constants
EPSILON = np.float64(1.0*10**-50)
DNA = ['A','C','G','T']
GAP = '-'

#functions for applying functions
def apply(f, x):
    return f(x)

def mapply(f, x):
    return map(f, x)

def rapply(f, x, red):
    return reduce(red, map(f,x))

def np_apply(f, x, type):
    #print '#################'
    #print 'fun: ' + inspect.getsource(f)
    #print 'arg: ' + str(x)
    #print 'map: ' + str(map(f,x))
    #print 'np: ' + str(np.array(map(f,x), dtype = type))
    #print '@@@@@@@@@@@@@@@@@'
    return np.array(map(f, x), dtype = type)

def gen_apply(val): 
    return lambda f: apply(f, val)

def gen_mapply(vals):
    return lambda f: mapply(f, vals)

def gen_rapply(vals, red):
    return lambda f: rapply(f, vals, red)

#generates lambdas that return
#1 if the lambda argument matches
#the argument here else 0
def comparator(val, increment):
    return lambda x: increment if x == val else np.float64(0)

#generates comparators for all chars in alphabet
def gen_comparators(alpha, increment): 
    return map(lambda x: comparator(x, increment), alpha)

#functions for counting on sequences
def prior_norm_count(seqs):
    return np.float64(1.0 / (len(seqs) * len(seqs[0])))

#normalize counts for positional entropies
def post_norm_count(seqs):
    return np.float64(1.0 / len(seqs))

#generates a version of the apply function where the argument
#is valued to a list of characters present at the given 
#position in the alignment
def gen_position(seqs, pos, wlen): 
    return map(lambda x: x[pos:pos+wlen], seqs)

#generates positional functions for all positions in alignment
def gen_positions(seqs, alpha):
    return map(lambda x: gen_position(seqs, x, len(alpha[0])), 
                                                        xrange(len(seqs[0])))
#returns 1 if position has gaps else 0
def pos_gapped(pos):
    if True in map(lambda x: True if GAP in x else False, pos):
        return 1
    else:
        return 0

#returns a vector of positions containing gaps
def get_gaps(seqs, wlen):
    return np_apply(lambda x: x != None,
                    np_apply(lambda x: x if pos_gapped(gen_position(seqs, x, wlen)) == 1 else -1,
                             xrange(len(seqs[0])), np.int64),
                    np.int64)


#returns a vector for the count of each char in alpha for the provided
#position
def count_one_pos(comps, pos):
    return np_apply(lambda x: rapply(x, pos, lambda x,y:x+y), comps, 
                                                            np.float64)

#returns a matrix of the counts of each char in alpha for each
#position in the alignment
def count_all_pos(comps, seqs, alpha):
    return np_apply(lambda x: count_one_pos(comps, x), 
                                                 gen_positions(seqs, alpha),
                                                                 np.float64)

#returns a vector of the total counts for each char in alpha,
#keeping the position of the char for the position of the counts
def count_total(counts):
    return np_apply(lambda x: counts[:,x].sum(), xrange(len(counts[0])),
                                                              np.float64)
#functions for entropy calculations
#add epsilon to all values of a matrix
def add_epsilon(mat): return mat[:] + EPSILON

#calculate p*log2(p) values
def plogp(mat): return mat[:] * np.log2(mat[:])

#calculate prior entropy
def a_priori(seqs, alpha):
    return plogp(add_epsilon(count_total(count_all_pos(gen_comparators(alpha, prior_norm_count(seqs)), seqs, alpha)))).sum() * -1.0

#calculate positional entropy
def positional_entropy(seqs, alpha):
    return np_apply(lambda x: plogp(x).sum() * -1.0, add_epsilon(count_all_pos(gen_comparators(alpha, post_norm_count(seqs)), seqs, alpha)), np.float64)

#calculate positional ic
def positional_ic(seqs, alpha, prior):
    return prior - positional_entropy(seqs, alpha)[:]

#parse fasta formatted alignment
#returns a list of sequences
def fasta(indir):
    return [reduce(lambda x,y: x+y, x.split('\n')) for x in open(indir, 'rb').read().split('>')[1:]]

#generate alphabet of nmers
def gen_nmer(alpha, size):
    if size == 1: return alpha
    else: return [reduce(lambda x, y: x+y, x) for x in list(itertools.product(alpha, repeat=size))]

#main
def __main__():
    alpha = gen_nmer(DNA, int(sys.argv[2]))
    seqs  = fasta(sys.argv[1])
    count = prior_norm_count(seqs)
    prior = a_priori(seqs, alpha)
    post  = positional_ic(seqs, alpha, prior)

    #calculate gapped positions
    gpos = filter(lambda x: x != -1, get_gaps(seqs, int(sys.argv[2])))
    x    = np.array(xrange(len(seqs[0])))
    npos = filter(lambda x: x not in gpos, x)    

    #other statistics
    avg = np.empty(len(seqs[0]), dtype = np.float64)
    avg.fill(np.mean(post))
    std = np.empty(len(seqs[0]), dtype = np.float64)
    std.fill(np.std(post))

    #generate plots
    plt.figure(1)
    plt.plot(x[gpos], post[gpos], 'r', label = 'Gapped Positions') 
    plt.plot(x[npos], post[npos], 'g', label = 'Ungapped Positions')
    plt.plot(x, avg, 'b', label = 'Average IC')
    plt.plot(x, avg + std, 'y', label = 'Mean+1 std')
    plt.plot(x, avg - std, 'y', label = 'Mean-1 std')
    plt.savefig(sys.argv[3], dpi=1024, format='pdf')

#entry
__main__()

Karl does make a point, though not in the best way: if this is a homework problem, you should really try it on your own first. Calculating sequence entropy is a very basic task, but it can be a really fun coding problem. Because the calculations are so flexible, you can really explore different programming solutions, especially in Python, where you have lambdas, list comprehensions and some other fun tools. This is fun in Haskell as well.


This is just my opinion, and maybe you just did this for fun, trying to replicate some Haskell constructs, but by mixing Python practices that are frowned upon (filter, map, and lambda in place of list comprehensions) with numpy arrays whose strengths you don't take advantage of, you are showing the worst of both worlds in terms of algorithmic efficiency and readability. For example, what does this do:

def positional_entropy(seqs, alpha):
    return np_apply(lambda x: plogp(x).sum() * -1.0, add_epsilon(count_all_pos(gen_comparators(alpha, post_norm_count(seqs)), seqs, alpha)), np.float64)

I can only guess based on the name. You should be able to do this in simple, clear numpy syntax using things like summing over an axis, broadcasting, and other numpy goodies.
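
For instance, a vectorized positional entropy along those lines might look like this (my own sketch, not pld's code; it assumes a single-character DNA alphabet and ungapped sequences):

```python
import numpy as np

def positional_entropy_np(seqs, alphabet="ACGT"):
    """Per-column Shannon entropy, computed with axis sums and broadcasting."""
    arr = np.array([list(s) for s in seqs])               # (n_seqs, seq_len)
    # one 0/1 indicator layer per alphabet character, summed down the columns
    counts = np.array([(arr == a).sum(axis=0) for a in alphabet],
                      dtype=np.float64)                   # (4, seq_len)
    p = counts / counts.sum(axis=0)                       # broadcast divide
    eps = 1e-50                                           # avoid log2(0)
    return -(p * np.log2(p + eps)).sum(axis=0)

# positional_entropy_np(["CGTAA", "AATCC", "CCGCA"]) gives one entropy per column
```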

For example this function:

#generates lambdas that return
#1 if the lambda argument matches
#the argument here else 0
def comparator(val, increment):
    return lambda x: increment if x == val else np.float64(0)

If you want to test an array to see whether each element == val and get back 1 or 0, you can do it as a list comprehension:

[1 if x == val else 0 for x in arr]

or [int(x == val) for x in arr]

or using numpy (and assuming arr is a numpy array):

(arr == val).astype(int)
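
All three forms produce the same 0/1 indicator vector; a quick sanity check (my example array, not from the thread):

```python
import numpy as np

arr = np.array(list("CACGT"))
val = "C"

as_comprehension = [1 if x == val else 0 for x in arr]
as_cast          = [int(x == val) for x in arr]
as_numpy         = (arr == val).astype(int)

# all three agree: [1, 0, 1, 0, 0]
```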

again, just my 0.02.


Yep, as I said in my post, this was just an exercise. The goal was to play with dynamically generating functions, minimize variable assignment, and play with function composition. Obviously this code isn't anything more than a silly exercise, and obviously it isn't the most Pythonic or fastest way to do this; I'd hope a competent Python user would be able to see that.

I had originally intended to use the numpy arrays correctly, but they weren't playing well with the functional approach I was taking, which was the goal of the exercise. In a few places I do make use of numpy's ability to vectorize calculations.

Besides, it seems like OP is looking for homework, I didn't want to give it away too easily.


Fair enough. A deleted comment (not by you) said my reply was snooty; that was not my intent. My intent was to point out that there are more Pythonic and laconic ways of putting it.

You definitely did minimize variable assignment.


The major drawback to this approach is that thousands of dynamically generated functions can end up being called thousands of times, and in Python you really want to avoid blowing the stack by generating new frames so frequently. I profiled this on a dataset of 70 2 kbp sequences; it ends up with over a million calls to lambda functions alone, and tens of millions of calls in total (e.g. 90k calls to map).

The design is cool IMO, but this approach is really slow. Writing some of these list comprehensions was fun too; you can really get a feel for the language. It also allows for dynamically generating switch cases in a functional (but not efficient) way, though a generator would probably be better suited for that.
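
For the sliding-window part specifically, a generator keeps the frame count down because windows are yielded lazily instead of being built through chained map/lambda calls (a minimal sketch with hypothetical names, not the code from the answer above):

```python
def windows(seq, wlen):
    """Lazily yield every length-wlen window over seq."""
    for i in range(len(seq) - wlen + 1):
        yield seq[i:i + wlen]

# list(windows("CGTAA", 3)) -> ['CGT', 'GTA', 'TAA']
```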

I really do like this problem as an exercise, it is a great way to learn a new language and explore new aspects of old languages. It has room for just about every basic facet of your typical language.

