How to calculate differential coverage between two groups?
0
0
Entering edit mode
15 months ago
O.rka ▴ 600

I have 2 groups: treatment and control. In the treatment group, we hypothesize that there is lower coverage on the 3' region of the CDS in the treatment. I've mapped all of the samples to the CDS sequences, converted to bed, and I'm kind of stuck.

There has to be a better way to statistically quantify if one group of samples has differential coverage on regions than another group.

I did the following but it seems full of assumptions (note, I hacked this together very quick so bear with me):

from collections import *
import sys, os, glob
import numpy as np
import soothsayer_utils as syu # This is a utility package I wrote
from scipy import stats

# Get cds sequences
data = defaultdict(dict)

# Initialize arrays filled zeros for each gene in each sample
for fp in glob.glob("*.bam.bedtools.cov"): # The output of bedtools coverage
id_sample = fp.split(".bam")
for id_gene, length in seqs.map(len).items():
data[id_sample][id_gene] = np.zeros(length)

for fp in glob.glob("*.bam.bedtools.cov"):
id_sample = fp.split(".bam")
fields = line.split("\t")
if fields != ".":
id_gene = fields
start = int(fields) - 1
end = int(fields)
data[id_sample][id_gene][start:end] += 1

# Get the total number of reads per sample
sample_to_depth = defaultdict(int)
for fp in glob.glob("*.bam.bedtools.cov"):
id_sample = fp.split(".bam")
with open(fp, "r") as f:
for i, line in sy.pv(enumerate(f), id_sample):
sample_to_depth[id_sample] += 1

# Create 2 dictionaries, one per group, and then make
# key=gene and value=array of coverages
data_solvents = defaultdict(list)
data_treatments = defaultdict(list)

solvents = ["DH2O_BR3-PE-P42-1_S42_001", "DH2O_BR2-PE-P28-1_S28_001", "DH2O_BR1-PE-P14-1_S14_001"]
treatments = ["NU_C_12_BR2-PE-P24-1_S24_001", "NU_C_12_BR3-PE-P38-1_S38_001", "NU_C_12_BR1-PE-P10-1_S10_001"]

for id_sample, coverages in data.items():
if id_sample in solvents:
depth = sample_to_depth[id_sample]
for id_gene, covs in coverages.items():
#             covs = covs/depth
data_solvents[id_gene].append(covs)
if id_sample in treatments:
depth = sample_to_depth[id_sample]
for id_gene, covs in coverages.items():
#             covs = covs/depth
data_treatments[id_gene].append(covs)

for d in [data_solvents, data_treatments]:
for id_gene, covs in d.items():
A = np.stack(covs, axis=0)
d[id_gene] = A.mean(axis=0)

# Get medians for percentils
def f(x):
n = len(x)
p_25 = int(n*0.25)
p_50 = int(n*0.50)
p_75 = int(n*0.75)
p_100 = int(n*1.0)

percentiles = [0, p_25, p_50, p_75, p_100]
output = [0,0,0,0]
for i in range(len(percentiles)-1):
output[i] = np.median(x[percentiles[i]:percentiles[i+1]])
return np.asarray(output)

output_solvents = dict()
output_treatments = dict()

for id_gene, medians in data_solvents.items():
output_solvents[id_gene] = f(medians)

for id_gene, medians in data_treatments.items():
output_treatments[id_gene] = f(medians)

# Organize into DataFrames
df_solvents = pd.DataFrame(output_solvents, index=["0-25", "25-50", "50-75", "75-100"]).T.dropna(how="all", axis=0)
df_treatments = pd.DataFrame(output_treatments, index=["0-25", "25-50", "50-75", "75-100"]).T.dropna(how="all", axis=0)

output_p = dict()
output_diff = dict()
for column in df_solvents.columns:
u = df_solvents[column]
v = df_treatments[column]
output_p[column] = stats.wilcoxon(u, v)[-1]
output_diff[column] = v.median() - u.median()
output_p
# {'0-25': 2.7590531087162958e-117,
#  '25-50': 5.163056594586972e-151,
#  '50-75': 4.137767887055633e-149,
#  '75-100': 5.673772630059623e-134}

df_solvents
#   0-25    25-50   50-75   75-100
# EG11277   0.333333    0.333333    0.333333    0.333333
# EG10998   209.333333  171.000000  250.000000  227.500000
# EG10999   241.000000  125.333333  422.666667  238.833333
# EG11000   257.000000  293.333333  246.333333  204.333333
# G6081 10.833333   61.333333   144.666667  114.333333

differential bed bam rnaseq coverage • 351 views