Making use of phased haplotypes in rare variant burden testing

Hello all,

I have a GWAS cohort and a WGS cohort that is a subset of the GWAS cohort.

I have identified all the lead SNPs from the GWAS data (larger n) and have delineated the region of linkage disequilibrium around each lead SNP in the study.

Further, I have phased the WGS genomes. As such, I can determine which variants are, and which are not, in phase with the lead SNPs in the study.
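
For a concrete picture of that phase check, here is a minimal sketch (the genotype strings are made up; in practice they would come from the sample columns of the phased VCF):

def split_gt(gt):
    # phased GT like '0|1' -> [0, 1]; allele 0 = REF, 1 = ALT
    return [int(a) for a in gt.split("|")]

def in_phase(lead_gt, rare_gt, lead_risk=1, rare_alt=1):
    # True if some haplotype carries both the lead risk allele
    # and the rare alternate allele.
    return any(l == lead_risk and r == rare_alt
               for l, r in zip(split_gt(lead_gt), split_gt(rare_gt)))

for lead_gt, rare_gt in [("0|1", "0|1"), ("0|1", "1|0"), ("1|1", "0|0")]:
    print(lead_gt, rare_gt, in_phase(lead_gt, rare_gt))
# 0|1 0|1 True   (rare allele on the risk haplotype)
# 0|1 1|0 False  (rare allele on the other haplotype)
# 1|1 0|0 False  (sample carries no rare allele)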

Right now, my intention is to do this by parsing the VCF files and reformatting them into SKAT-O input. My question is: are there existing workflows that port phased genomes directly into a rare variant association tool, or is one or more of these steps DIY?
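
To illustrate what that reformatting amounts to, here is a rough sketch (assuming simple biallelic records; the sample and variant names are invented). SKAT-style tools want an n_samples x n_variants dosage matrix plus a phenotype vector:

def dosage(gt):
    # '0|1' or '1/1' -> count of alternate alleles; None if missing
    alleles = gt.replace("|", "/").split("/")
    if "." in alleles:
        return None
    return sum(int(a) for a in alleles)

# toy data laid out as in a VCF: one row per variant, one column per sample
gts_by_variant = {
    "chr1:1001": ["0|1", "0|0", "1|1"],
    "chr1:1042": ["0|0", "0|1", "0|1"],
}
samples = ["s1", "s2", "s3"]
variants = sorted(gts_by_variant)

# transpose to the samples x variants orientation SKAT expects for Z
Z = [[dosage(gts_by_variant[v][j]) for v in variants]
     for j in range(len(samples))]
print(Z)  # [[1, 0], [0, 1], [2, 1]]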

For clarity, what I mean by rare variant association tools is roughly what is summarized in the Lee/Lin review paper in AJHG.

Tags: NGS, Phasing, Burden, GWAS, Haplotypes
brentp:

I wrote a script (hapskat.py, with a companion hapskat.R) to do exactly this:

from __future__ import print_function
from collections import Counter
import toolshed as ts
import itertools as it
import click
import sys
import re
import array
from pyper import R as Rpyper

R = Rpyper(use_numpy=False)
R.use_dict = True
R('source("src/hapskat.R")')


@click.command()
@click.argument('vcf', type=click.Path(exists=True))
@click.argument('cov', type=click.Path(exists=True))
@click.argument('formula')
@click.option('--min-snps', default=4, help="don't test a block unless it"
              " has at least this many variants.")
@click.option('--max-missing', default=10, help="exclude variants with more"
              " missing samples than this.")
def hapskat(vcf, cov, formula, min_snps, max_missing):
    fmt = "{chrom}\t{start}\t{end}\t{chrom}:{start}-{end}\t\t{f_p_value}\t{p_value}\t{n_tested}\t{haplotypes}\t{rare_haplotypes}"
    block_iter = read_blocks(vcf, min_snps, max_missing)
    header = next(block_iter)
    samples = header[9:]
    print(R("null.obj <- hap.skat.null(%s, '%s')" % (formula, cov)),
          file=sys.stderr)
    R('wgts <- NULL')
    print("#" + fmt.replace("}", "").replace("{", ""))
    for i, block in enumerate(block_iter):
        Z, rare_haps = merge_rare(getZ(block, header))
        assert [r[0] for r in Z[1:]] == samples, "bad order"
        # remove column and row labels before sending to R.
        Zsub = [row[1:] for row in Z[1:]]
        send_array(Zsub)
        # R['wgts'] = wgts
        r = R("res = NA; Z = read.bin('/tmp/t.bin'); res = hap.skat(Z, null.obj, y, wgts)").split("\n")
        r = [x.strip() for x in r if x.strip() and not x.startswith('try')]
        if r:
            print("\n".join(r), file=sys.stderr)
        res = R['res']
        locs = get_locs(block)
        if len(locs) != len(set(locs)):
            print(len(locs), len(set(locs)), file=sys.stderr)
            print(res, file=sys.stderr)
        res['chrom'] = locs[0].split(":")[0]
        res['start'] = locs[0].split(":")[1]
        res['end'] = locs[-1].split(":")[1]
        res['haplotypes'] = "|".join(Z[0][i] for i in range(1, len(Z[0]))
                                     if not Z[0][i].startswith("rare"))
        res['rare_haplotypes'] = "|".join(rare_haps) or "NA"
        print(fmt.format(**res))
    print(i + 1, "tests", file=sys.stderr)


def merge_rare(Z):
    # Z has row and column headers and is of shape n_samples * n_haplotypes.
    # use 1/sqrt(2 * n_samples) as cutoff--from SKAT_CommonRare.
    # e.g. with 500 samples the cutoff is about 1/sqrt(1000) ~= 0.032.
    rare_cutoff = 1.0 / (2.0 * len(Z)) ** 0.5
    col_freqs = [sum(row[i] for row in Z[1:]) / float(len(Z) - 1)
                 for i in range(1, len(Z[0]))]
    rare_idxs = [i for i, f in enumerate(col_freqs) if f < rare_cutoff]
    # since we know that Z has the most common columns first, we can simplify
    # the stuff below to just use the first rare column.
    # just check that we don't get, e.g. 2,4,5 instead of 2,3,4,5
    if rare_idxs == []:
        return Z, []
    assert rare_idxs == list(range(min(rare_idxs), max(rare_idxs) + 1))
    Znew = [[Z[0][0]] + [Z[0][i + 1] for i in range(rare_idxs[0])] + ['rare_sum']]
    for row in Z[1:]:
        # the row label and the common haplotypes.
        new_row = [row[0]] + [row[i + 1] for i in range(rare_idxs[0])]
        # the merged (summed) rare haplotypes go to a single column
        new_row.append(min(2, sum(row[i + 1] for i in rare_idxs)))
        Znew.append(new_row)
    # return merged data along with the list of rare haplotypes.
    return Znew, [Z[0][i + 1] for i in rare_idxs]


def getZ(block, header):
    sample_haps = []
    sample_haps.extend(get_phases(block, i) for i in range(9, len(header)))
    c = Counter()
    for row in sample_haps:
        c.update(row)
    # TODO: handle haplotypes containing 'X' (genotype of NA within hap)
    Z = [['sample'] + ["%s" % x[0] for x in c.most_common()]]
    for i, row in enumerate(sample_haps):
        # e.g. if a sample has the same hap from both parents then
        # it gets a score of 1 for that haplotype (0.5 per copy).
        Z.append([header[9 + i]] + [0.5 * row.count(h) for h in Z[0][1:]])
    return Z


def tryint(x):
    try:
        return int(x)
    except ValueError:
        # missing genotypes ('.') index the trailing 'X' in refalts.
        return -1


def get_locs(block):
    return ["%s:%i" % (b[0], int(b[1])) for b in block]


def get_phases(block, idx, sep=re.compile(r"\||/")):
    """
    use the 0|1 in the genotype column to look up the ref/alt bases;
    returns the two haplotype strings for one sample.
    """
    refalts = [b[3].split(",") + b[4].split(",") + ["X"] for b in block]
    genosM = [tryint(sep.split(v[idx].split(":")[0])[0]) for v in block]
    genosP = [tryint(sep.split(v[idx].split(":")[0])[-1]) for v in block]
    return ("_".join([refalts[i][gm] for i, gm in enumerate(genosM)]),
            "_".join([refalts[i][gp] for i, gp in enumerate(genosP)]))


def is_phased(toks):
    return any("|" in t.split(":")[0] for t in toks[9:])


def n_missing(toks):
    return sum(t.split(":")[0] in (".", "./.") for t in toks[9:])


def read_blocks(vcf, min_block_size, max_missing_samples):
    """
    A haplotype block starts at a line containing "/"; the following lines
    in the same block all contain "|". This function takes a VCF and
    yields blocks.
    """
    fh, header = ts.reader(vcf, header=False), [None]
    while header[0] != "#CHROM":
        header = next(fh)
    header[0] = "CHROM"
    yield header
    last_un = None
    for phased, block in it.groupby(ts.reader(fh, header=False), is_phased):
        if not phased:
            last_un = list(block)[-1]
            continue
        assert last_un
        block = [toks for toks in it.chain([last_un], block)
                 if n_missing(toks) <= max_missing_samples]
        # subtract 1 for the unphased line that starts the block.
        if len(block) - 1 < min_block_size:
            continue
        last_un = None
        yield block


def send_array(arr, fh=open('/tmp/t.bin', 'wb')):
    # the default argument keeps a single file handle open across calls.
    fh.seek(0)
    # send the shape as two 8-byte ints ('l' is 8 bytes on 64-bit linux,
    # matching size=8 in read.bin on the R side).
    array.array('l', [len(arr), len(arr[0])]).tofile(fh)
    # send the data as doubles.
    array.array('d', [i for row in arr for i in row]).tofile(fh)
    fh.flush()


if __name__ == "__main__":
    hapskat()

And here is the R side, hapskat.R (sourced by the script as src/hapskat.R):

library(SKAT)
set.seed(1)

hap.skat.null = function(formula, covs, ...){
    covs = read.delim(covs)
    # make the phenotype available to the python side's calls as 'y'.
    assign("y", covs[[as.character(formula[[2]])]], envir = .GlobalEnv)
    m = SKAT_Null_Model(formula, data=covs, out_type="D")
    return(m)
}

hap.skat = function(Z, null.obj, y, wgts=NULL, ...){
    r = try(SKAT(Z, null.obj, weights=wgts, is_dosage=TRUE,
                 kernel=ifelse(is.null(wgts), "linear", "linear.weighted"),
                 method="liu"))
    if(!inherits(r, "try-error")){
        res = list(p_value=r$p.value, rho=r$param$rho_est,
                   n_tested=r$param$n.marker.test,
                   p_value_resampling=r$p.value.resampling)
    } else {
        res = list(p_value=NaN, rho=NaN, n_tested=NaN, p_value_resampling=NaN)
    }
    if("X1" %in% names(null.obj)){
        res$f_p_value = ftest(null.obj$X1, Z, y)
    } else {
        res$f_p_value = ftest(null.obj$re1$X1, Z, y)
    }
    res
}

ftest = function(X1, Z, y){
    # TODO: convert into formula so we can get f.test
    # compare covariates-only and covariates-plus-haplotypes logistic
    # models and report the likelihood-ratio p-value.
    X1 = data.frame(X1)
    colnames(Z) = paste0("Z", 1:ncol(Z))
    colnames(X1)[1] = "intercept"
    xnames = colnames(X1)
    X1$yyy = y
    m0 = as.formula(sprintf("yyy ~ 0 + %s", paste(xnames, collapse=" + ")))
    fit0 = glm(m0, data=X1, family=binomial(link=logit))
    mz = as.formula(sprintf("yyy ~ 0 + %s + %s",
                            paste(xnames, collapse=" + "),
                            paste(colnames(Z), collapse=" + ")))
    X2 = cbind(X1, Z)
    fitz = glm(mz, data=X2, family=binomial(link=logit))
    r = anova(fit0, fitz, test="Chisq")
    return(r[2, "Pr(>Chi)"])
}

read.bin = function(bin.file){
    # read the matrix written by python's send_array: two 8-byte ints for
    # the dimensions, then the values as doubles in row-major order.
    conn = file(bin.file, 'rb')
    mdims = readBin(conn, what=integer(), size=8, n=2)
    nrow = mdims[1]
    ncol = mdims[2]
    dat = readBin(conn, what=numeric(), size=8, n=nrow * ncol)
    close(conn)
    matrix(dat, nrow=nrow, ncol=ncol, byrow=TRUE)
}

It parses the VCF in Python and sends the haplotype blocks to R using PypeR. It can be a starting point for the parsing and haplotype construction, or you can use it as-is to get p-values from SKAT.
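
For reference, a hypothetical invocation would look something like this (the file names and formula are made up; judging from hap.skat.null above, the covariates file should be tab-delimited and the left-hand side of the formula should name one of its columns):

python hapskat.py phased.vcf covariates.tsv "status ~ age + sex" --min-snps 4 --max-missing 10

One line per haplotype block, with the SKAT and likelihood-ratio p-values, goes to stdout; R's diagnostics go to stderr.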

There are a lot of methods for testing haplotypes, but I think this poster does a nice job of comparing them.


Brent -

Thank you very much for sharing your code - very kind of you. I will accept this answer after a little time elapses; I do not want to discourage other answers.

Note that the closing parenthesis was captured as part of the hyperlink, so the link fails unless it is edited.
