How to calculate differential coverage between two groups?
0
0
Entering edit mode
3.1 years ago
O.rka ▴ 710

I have 2 groups: treatment and control. In the treatment group, we hypothesize that there is lower coverage on the 3' region of the CDS in the treatment. I've mapped all of the samples to the CDS sequences, converted to bed, and I'm kind of stuck.

There has to be a better way to statistically quantify if one group of samples has differential coverage on regions than another group.

I did the following but it seems full of assumptions (note, I hacked this together very quick so bear with me):

from collections import *
import sys, os, glob
import numpy as np
import soothsayer_utils as syu # This is a utility package I wrote
from scipy import stats

# Get cds sequences
seqs = syu.read_fasta("../assembly_run-25/eco.genes.fna")
data = defaultdict(dict)

# Initialize arrays filled zeros for each gene in each sample
for fp in glob.glob("*.bam.bedtools.cov"): # The output of bedtools coverage
    id_sample = fp.split(".bam")[0]
    for id_gene, length in seqs.map(len).items():
        data[id_sample][id_gene] = np.zeros(length)

# Get the regions covered by the read and add 1
for fp in glob.glob("*.bam.bedtools.cov"):
    id_sample = fp.split(".bam")[0]
    for line in syu.pv(syu.read_textfile(fp), id_sample):
        fields = line.split("\t")
        if fields[0] != ".":
            id_gene = fields[0]
            start = int(fields[1]) - 1
            end = int(fields[2])
            data[id_sample][id_gene][start:end] += 1

# Get the total number of reads per sample
sample_to_depth = defaultdict(int)
for fp in glob.glob("*.bam.bedtools.cov"):
    id_sample = fp.split(".bam")[0]
    with open(fp, "r") as f:
        for i, line in sy.pv(enumerate(f), id_sample):
            sample_to_depth[id_sample] += 1

# Create 2 dictionaries, one per group, and then make
# key=gene and value=array of coverages
data_solvents = defaultdict(list)
data_treatments = defaultdict(list)

solvents = ["DH2O_BR3-PE-P42-1_S42_001", "DH2O_BR2-PE-P28-1_S28_001", "DH2O_BR1-PE-P14-1_S14_001"]
treatments = ["NU_C_12_BR2-PE-P24-1_S24_001", "NU_C_12_BR3-PE-P38-1_S38_001", "NU_C_12_BR1-PE-P10-1_S10_001"]

for id_sample, coverages in data.items():
    if id_sample in solvents:
        depth = sample_to_depth[id_sample]
        for id_gene, covs in coverages.items():
#             covs = covs/depth
            data_solvents[id_gene].append(covs)
    if id_sample in treatments:
        depth = sample_to_depth[id_sample]
        for id_gene, covs in coverages.items():
#             covs = covs/depth
            data_treatments[id_gene].append(covs)

for d in [data_solvents, data_treatments]:
    for id_gene, covs in d.items():
        A = np.stack(covs, axis=0)
        d[id_gene] = A.mean(axis=0)

# Get medians for percentils
def f(x):
    n = len(x)
    p_25 = int(n*0.25)
    p_50 = int(n*0.50)
    p_75 = int(n*0.75)
    p_100 = int(n*1.0)

    percentiles = [0, p_25, p_50, p_75, p_100]
    output = [0,0,0,0]
    for i in range(len(percentiles)-1):
        output[i] = np.median(x[percentiles[i]:percentiles[i+1]])
    return np.asarray(output)

output_solvents = dict()
output_treatments = dict()


for id_gene, medians in data_solvents.items():
    output_solvents[id_gene] = f(medians)

for id_gene, medians in data_treatments.items():
    output_treatments[id_gene] = f(medians)

# Organize into DataFrames
df_solvents = pd.DataFrame(output_solvents, index=["0-25", "25-50", "50-75", "75-100"]).T.dropna(how="all", axis=0)
df_treatments = pd.DataFrame(output_treatments, index=["0-25", "25-50", "50-75", "75-100"]).T.dropna(how="all", axis=0)

output_p = dict()
output_diff = dict()
for column in df_solvents.columns:
    u = df_solvents[column]
    v = df_treatments[column]
    output_p[column] = stats.wilcoxon(u, v)[-1]
    output_diff[column] = v.median() - u.median()
output_p
# {'0-25': 2.7590531087162958e-117,
#  '25-50': 5.163056594586972e-151,
#  '50-75': 4.137767887055633e-149,
#  '75-100': 5.673772630059623e-134}

df_solvents
#   0-25    25-50   50-75   75-100
# EG11277   0.333333    0.333333    0.333333    0.333333
# EG10998   209.333333  171.000000  250.000000  227.500000
# EG10999   241.000000  125.333333  422.666667  238.833333
# EG11000   257.000000  293.333333  246.333333  204.333333
# G6081 10.833333   61.333333   144.666667  114.333333
differential bed bam rnaseq coverage • 570 views
ADD COMMENT

Login before adding your answer.

Traffic: 2822 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6