Making use of phased haplotypes in rare variant burden testing

Hello all,

I have a GWAS cohort and a WGS cohort that is a subset of the GWAS cohort.

I have identified all the lead SNPs from the GWAS data (larger n) and have delineated the region of linkage disequilibrium around each lead SNP in the study.

Further, I have phased the WGS genomes. As such, I can determine which variants are, and which are not, in phase with the lead SNPs in the study.
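
For a concrete picture of that phase check, here is a minimal sketch (the genotype strings are made up; in practice they would come from the sample columns of the phased VCF):

def split_gt(gt):
    # phased GT like '0|1' -> [0, 1]; allele 0 = REF, 1 = ALT
    return [int(a) for a in gt.split("|")]

def in_phase(lead_gt, rare_gt, lead_risk=1, rare_alt=1):
    # True if some haplotype carries both the lead risk allele
    # and the rare alternate allele.
    return any(l == lead_risk and r == rare_alt
               for l, r in zip(split_gt(lead_gt), split_gt(rare_gt)))

for lead_gt, rare_gt in [("0|1", "0|1"), ("0|1", "1|0"), ("1|1", "0|0")]:
    print(lead_gt, rare_gt, in_phase(lead_gt, rare_gt))
# 0|1 0|1 True   (rare allele on the risk haplotype)
# 0|1 1|0 False  (rare allele on the other haplotype)
# 1|1 0|0 False  (sample carries no rare allele)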

Right now, my intention is to do this by parsing the VCF files and reformatting them into SKAT-O input. My question is: are there existing workflows that port phased genomes directly into a rare variant association tool, or is one or more of these steps DIY?
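
To illustrate what that reformatting amounts to, here is a rough sketch (assuming simple biallelic records; the sample and variant names are invented). SKAT-style tools want an n_samples x n_variants dosage matrix plus a phenotype vector:

def dosage(gt):
    # '0|1' or '1/1' -> count of alternate alleles; None if missing
    alleles = gt.replace("|", "/").split("/")
    if "." in alleles:
        return None
    return sum(int(a) for a in alleles)

# toy data laid out as in a VCF: one row per variant, one column per sample
gts_by_variant = {
    "chr1:1001": ["0|1", "0|0", "1|1"],
    "chr1:1042": ["0|0", "0|1", "0|1"],
}
samples = ["s1", "s2", "s3"]
variants = sorted(gts_by_variant)

# transpose to the samples x variants orientation SKAT expects for Z
Z = [[dosage(gts_by_variant[v][j]) for v in variants]
     for j in range(len(samples))]
print(Z)  # [[1, 0], [0, 1], [2, 1]]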

For clarity, what I mean by rare variant association tools is roughly what is summarized in the Lee/Lin review paper in AJHG.

Tags: NGS, Phasing, Burden, GWAS, Haplotypes
brentp:

I wrote a script (hapskat.py, with a companion hapskat.R) to do exactly this:

from __future__ import print_function
from collections import Counter
import toolshed as ts
import itertools as it
import click
import sys
import re
import array
from pyper import R as Rpyper

R = Rpyper(use_numpy=False)
R.use_dict = True
R('source("src/hapskat.R")')


@click.command()
@click.argument('vcf', type=click.Path(exists=True))
@click.argument('cov', type=click.Path(exists=True))
@click.argument('formula')
@click.option('--min-snps', default=4, help="don't test a block unless it"
              " has at least this many variants.")
@click.option('--max-missing', default=10, help="exclude variants with more"
              " missing samples than this.")
def hapskat(vcf, cov, formula, min_snps, max_missing):
    fmt = "{chrom}\t{start}\t{end}\t{chrom}:{start}-{end}\t\t{f_p_value}\t{p_value}\t{n_tested}\t{haplotypes}\t{rare_haplotypes}"
    block_iter = read_blocks(vcf, min_snps, max_missing)
    header = next(block_iter)
    samples = header[9:]
    print(R("null.obj <- hap.skat.null(%s, '%s')" % (formula, cov)),
          file=sys.stderr)
    R('wgts <- NULL')
    print("#" + fmt.replace("}", "").replace("{", ""))
    for i, block in enumerate(block_iter):
        Z, rare_haps = merge_rare(getZ(block, header))
        assert [r[0] for r in Z[1:]] == samples, "bad order"
        # remove column and row labels before sending to R.
        Zsub = [row[1:] for row in Z[1:]]
        send_array(Zsub)
        # R['wgts'] = wgts
        r = R("res = NA; Z = read.bin('/tmp/t.bin'); res = hap.skat(Z, null.obj, y, wgts)").split("\n")
        r = [x.strip() for x in r if x.strip() and not x.startswith('try')]
        if r:
            print("\n".join(r), file=sys.stderr)
        res = R['res']
        locs = get_locs(block)
        if len(locs) != len(set(locs)):
            print(len(locs), len(set(locs)), file=sys.stderr)
            print(res, file=sys.stderr)
        res['chrom'] = locs[0].split(":")[0]
        res['start'] = locs[0].split(":")[1]
        res['end'] = locs[-1].split(":")[1]
        res['haplotypes'] = "|".join(Z[0][i] for i in range(1, len(Z[0]))
                                     if not Z[0][i].startswith("rare"))
        res['rare_haplotypes'] = "|".join(rare_haps) or "NA"
        print(fmt.format(**res))
    print(i + 1, "tests", file=sys.stderr)


def merge_rare(Z):
    # Z has row and column headers and is of shape n_samples * n_haplotypes.
    # use 1/sqrt(2 * n_samples) as cutoff--from SKAT_CommonRare.
    # e.g. with 500 samples the cutoff is about 1/sqrt(1000) ~= 0.032.
    rare_cutoff = 1.0 / (2.0 * len(Z)) ** 0.5
    col_freqs = [sum(row[i] for row in Z[1:]) / float(len(Z) - 1)
                 for i in range(1, len(Z[0]))]
    rare_idxs = [i for i, f in enumerate(col_freqs) if f < rare_cutoff]
    # since we know that Z has the most common columns first, we can simplify
    # the stuff below to just use the first rare column.
    # just check that we don't get, e.g. 2,4,5 instead of 2,3,4,5
    if rare_idxs == []:
        return Z, []
    assert rare_idxs == list(range(min(rare_idxs), max(rare_idxs) + 1))
    Znew = [[Z[0][0]] + [Z[0][i + 1] for i in range(rare_idxs[0])] + ['rare_sum']]
    for row in Z[1:]:
        # the row label and the common haplotypes.
        new_row = [row[0]] + [row[i + 1] for i in range(rare_idxs[0])]
        # the merged (summed) rare haplotypes go to a single column
        new_row.append(min(2, sum(row[i + 1] for i in rare_idxs)))
        Znew.append(new_row)
    # return merged data along with the list of rare haplotypes.
    return Znew, [Z[0][i + 1] for i in rare_idxs]


def getZ(block, header):
    sample_haps = []
    sample_haps.extend(get_phases(block, i) for i in range(9, len(header)))
    c = Counter()
    for row in sample_haps:
        c.update(row)
    # TODO: handle haplotypes containing 'X' (genotype of NA within hap)
    Z = [['sample'] + ["%s" % x[0] for x in c.most_common()]]
    for i, row in enumerate(sample_haps):
        # e.g. if a sample has the same hap from both parents then
        # it gets a score of 1 for that haplotype (0.5 per copy).
        Z.append([header[9 + i]] + [0.5 * row.count(h) for h in Z[0][1:]])
    return Z


def tryint(x):
    try:
        return int(x)
    except ValueError:
        # missing genotypes ('.') index the trailing 'X' in refalts.
        return -1


def get_locs(block):
    return ["%s:%i" % (b[0], int(b[1])) for b in block]


def get_phases(block, idx, sep=re.compile(r"\||/")):
    """
    use the 0|1 in the genotype column to look up the ref/alt bases;
    returns the two haplotype strings for one sample.
    """
    refalts = [b[3].split(",") + b[4].split(",") + ["X"] for b in block]
    genosM = [tryint(sep.split(v[idx].split(":")[0])[0]) for v in block]
    genosP = [tryint(sep.split(v[idx].split(":")[0])[-1]) for v in block]
    return ("_".join([refalts[i][gm] for i, gm in enumerate(genosM)]),
            "_".join([refalts[i][gp] for i, gp in enumerate(genosP)]))


def is_phased(toks):
    return any("|" in t.split(":")[0] for t in toks[9:])


def n_missing(toks):
    return sum(t.split(":")[0] in (".", "./.") for t in toks[9:])


def read_blocks(vcf, min_block_size, max_missing_samples):
    """
    A haplotype block starts at a line containing "/"; the following lines
    in the same block all contain "|". This function takes a VCF and
    yields blocks.
    """
    fh, header = ts.reader(vcf, header=False), [None]
    while header[0] != "#CHROM":
        header = next(fh)
    header[0] = "CHROM"
    yield header
    last_un = None
    for phased, block in it.groupby(ts.reader(fh, header=False), is_phased):
        if not phased:
            last_un = list(block)[-1]
            continue
        assert last_un
        block = [toks for toks in it.chain([last_un], block)
                 if n_missing(toks) <= max_missing_samples]
        # subtract 1 for the unphased line that starts the block.
        if len(block) - 1 < min_block_size:
            continue
        last_un = None
        yield block


def send_array(arr, fh=open('/tmp/t.bin', 'wb')):
    # the default argument keeps a single file handle open across calls.
    fh.seek(0)
    # send the shape as two 8-byte ints ('l' is 8 bytes on 64-bit linux,
    # matching size=8 in read.bin on the R side).
    array.array('l', [len(arr), len(arr[0])]).tofile(fh)
    # send the data as doubles.
    array.array('d', [i for row in arr for i in row]).tofile(fh)
    fh.flush()


if __name__ == "__main__":
    hapskat()

And here is the R side, hapskat.R (sourced by the script as src/hapskat.R):

library(SKAT)
set.seed(1)

hap.skat.null = function(formula, covs, ...){
    covs = read.delim(covs)
    # make the phenotype available to the python side's calls as 'y'.
    assign("y", covs[[as.character(formula[[2]])]], envir = .GlobalEnv)
    m = SKAT_Null_Model(formula, data=covs, out_type="D")
    return(m)
}

hap.skat = function(Z, null.obj, y, wgts=NULL, ...){
    r = try(SKAT(Z, null.obj, weights=wgts, is_dosage=TRUE,
                 kernel=ifelse(is.null(wgts), "linear", "linear.weighted"),
                 method="liu"))
    if(!inherits(r, "try-error")){
        res = list(p_value=r$p.value, rho=r$param$rho_est,
                   n_tested=r$param$n.marker.test,
                   p_value_resampling=r$p.value.resampling)
    } else {
        res = list(p_value=NaN, rho=NaN, n_tested=NaN, p_value_resampling=NaN)
    }
    if("X1" %in% names(null.obj)){
        res$f_p_value = ftest(null.obj$X1, Z, y)
    } else {
        res$f_p_value = ftest(null.obj$re1$X1, Z, y)
    }
    res
}

ftest = function(X1, Z, y){
    # TODO: convert into formula so we can get f.test
    # compare covariates-only and covariates-plus-haplotypes logistic
    # models and report the likelihood-ratio p-value.
    X1 = data.frame(X1)
    colnames(Z) = paste0("Z", 1:ncol(Z))
    colnames(X1)[1] = "intercept"
    xnames = colnames(X1)
    X1$yyy = y
    m0 = as.formula(sprintf("yyy ~ 0 + %s", paste(xnames, collapse=" + ")))
    fit0 = glm(m0, data=X1, family=binomial(link=logit))
    mz = as.formula(sprintf("yyy ~ 0 + %s + %s",
                            paste(xnames, collapse=" + "),
                            paste(colnames(Z), collapse=" + ")))
    X2 = cbind(X1, Z)
    fitz = glm(mz, data=X2, family=binomial(link=logit))
    r = anova(fit0, fitz, test="Chisq")
    return(r[2, "Pr(>Chi)"])
}

read.bin = function(bin.file){
    # read the matrix written by python's send_array: two 8-byte ints for
    # the dimensions, then the values as doubles in row-major order.
    conn = file(bin.file, 'rb')
    mdims = readBin(conn, what=integer(), size=8, n=2)
    nrow = mdims[1]
    ncol = mdims[2]
    dat = readBin(conn, what=numeric(), size=8, n=nrow * ncol)
    close(conn)
    matrix(dat, nrow=nrow, ncol=ncol, byrow=TRUE)
}

It parses the VCF in Python and sends the haplotype blocks to R using PypeR. It can be a starting point for the parsing and haplotype construction, or you can use it as-is to get p-values from SKAT.
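
For reference, a hypothetical invocation would look something like this (the file names and formula are made up; judging from hap.skat.null above, the covariates file should be tab-delimited and the left-hand side of the formula should name one of its columns):

python hapskat.py phased.vcf covariates.tsv "status ~ age + sex" --min-snps 4 --max-missing 10

One line per haplotype block, with the SKAT and likelihood-ratio p-values, goes to stdout; R's diagnostics go to stderr.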

There are a lot of methods for testing haplotypes, but I think this poster does a nice job of comparing them.


Brent -

Thank you very much for sharing your code - very kind of you. I will accept this answer after a little time elapses; I do not want to discourage other answers.

Note that the closing parenthesis was captured as part of the hyperlink, so the link fails unless it is edited.
