|
from __future__ import print_function |
|
from collections import Counter |
|
import toolshed as ts |
|
import itertools as it |
|
import click |
|
import sys |
|
import re |
|
import array |
|
from math import log10 |
|
from pyper import R as Rpyper |
|
|
|
R = Rpyper(use_numpy=False) |
|
R.use_dict = True |
|
R('source("src/hapskat.R")') |
|
|
|
@click.command() |
|
@click.argument('vcf', type=click.Path(exists=True)) |
|
@click.argument('cov', type=click.Path(exists=True)) |
|
@click.argument('formula') |
|
@click.option('--min-snps', default=4, help="don't test a block unless it" |
|
" has at least this many variants.") |
|
@click.option('--max-missing', default=10, help="exclude variants with more" |
|
" missing variants than this.") |
|
def hapskat(vcf, cov, formula, min_snps, max_missing): |
|
|
|
fmt = "{chrom}\t{start}\t{end}\t{chrom}:{start}-{end}\t\t{f_p_value}\t{p_value}\t{n_tested}\t{haplotypes}\t{rare_haplotypes}" |
|
block_iter = read_blocks(vcf, min_snps, max_missing) |
|
header = next(block_iter) |
|
samples = header[9:] |
|
print(R("null.obj <- hap.skat.null(%s, '%s')" % (formula, cov)), |
|
file=sys.stderr) |
|
R('wgts <- NULL') |
|
print("#" + fmt.replace("}", "").replace("{", "")) |
|
for i, block in enumerate(block_iter): |
|
Z, rare_haps = merge_rare(getZ(block, header)) |
|
assert [r[0] for r in Z[1:]] == samples, ("bad order") |
|
# remove column and row labels |
|
Zsub = [row[1:] for row in Z[1:]] |
|
send_array(Zsub) |
|
#R['wgts'] = wgts |
|
r = R("res = NA; Z = read.bin('/tmp/t.bin'); res = hap.skat(Z, null.obj, y, wgts)").split("\n") |
|
r = [x.strip() for x in r if x.strip() and not x.startswith('try')] |
|
if r: |
|
print("\n".join(r), file=sys.stderr) |
|
res = R['res'] |
|
locs = get_locs(block) |
|
if len(locs) != len(set(locs)): |
|
print(len(locs), len(set(locs)), file=sys.stderr) |
|
print(res, file=sys.stderr) |
|
res['chrom'] = locs[0].split(":")[0] |
|
res['start'] = locs[0].split(":")[1] |
|
res['end'] = locs[-1].split(":")[1] |
|
res['haplotypes'] = "|".join(Z[0][i] for i in range(1, len(Z[0])) if |
|
not Z[0][i].startswith("rare")) |
|
res['rare_haplotypes'] = "|".join(rare_haps) or "NA" |
|
print(fmt.format(**res)) |
|
|
|
print(i + 1, "tests", file=sys.stderr) |
|
|
|
def merge_rare(Z): |
|
# Z is row and column headers is of shape n_samples * n_haplotypes. |
|
|
|
# use 1/sqrt(2 * n_samples) as cutoff--from SKAT_CommonRare. |
|
rare_cutoff = 1.0 / (2.0 * len(Z)) ** 0.5 |
|
col_freqs = [sum(row[i] for row in Z[1:]) / float(len(Z) - 1) for i in range(1, len(Z[0]))] |
|
rare_idxs = [i for i, f in enumerate(col_freqs) if f < rare_cutoff] |
|
# since we know that Z has the most common columns first, we can simplify |
|
# the stuff below to just use the first rare column |
|
# just check that we don't get, e.g. 2,4,5 instaed of 2,3,4,5 |
|
if True: #rare_idxs == []: |
|
return Z, [] |
|
assert rare_idxs == range(min(rare_idxs), max(rare_idxs) + 1) |
|
|
|
Znew = [[Z[0][0]] + [Z[0][i + 1] for i in range(rare_idxs[0])] + ['rare_sum']] |
|
for row in Z[1:]: |
|
# the row label and the common haplotypes. |
|
new_row = [row[0]] + [row[i + 1] for i in range(rare_idxs[0])] |
|
# the merged (summed) rare haplotypes go to a single column |
|
new_row.append(min(2, sum(row[i + 1] for i in rare_idxs))) |
|
Znew.append(new_row) |
|
|
|
# return merged data along with the list of rare haplotypes. |
|
return Znew, [Z[0][i + 1] for i in rare_idxs] |
|
|
|
def getZ(block, header): |
|
sample_haps = [] |
|
sample_haps.extend((get_phases(block, i) for i in range(9, len(header)))) |
|
c = Counter() |
|
for row in sample_haps: |
|
c.update(row) |
|
# TODO: handle haplotypes containing 'X' (genotype of NA within hap) |
|
|
|
Z = [['sample'] + ["%s" % x[0] for x in c.most_common()]] |
|
for i, row in enumerate(sample_haps): |
|
# e.g. if it has same hap from both parents then |
|
# it will have a score of Z for that haplotype |
|
Z.append([header[9 + i]] + [0.5 * row.count(h) for h in Z[0][1:]]) |
|
return Z |
|
|
|
def tryint(x): |
|
try: return int(x) |
|
except ValueError: return -1 |
|
|
|
def get_locs(block): |
|
return ["%s:%i" % (b[0], int(b[1])) for b in block] |
|
|
|
def get_phases(block, idx, sep = re.compile("\||/")): |
|
""" |
|
use the 0|1 in the genotype column to look up the ref/alt bases |
|
""" |
|
refalts = [b[3].split(",") + b[4].split(",") + ["X"] for b in block] |
|
genosM = [tryint(sep.split(v[idx].split(":")[0])[0]) for v in block] |
|
genosP = [tryint(sep.split(v[idx].split(":")[0])[-1]) for v in block] |
|
return "_".join([refalts[i][gm] for i, gm in enumerate(genosM)]),\ |
|
"_".join([refalts[i][gp] for i, gp in enumerate(genosP)]) |
|
|
|
def is_phased(toks): |
|
return any("|" in t.split(":")[0] for t in toks[9:]) |
|
|
|
def n_missing(toks): |
|
return sum(t in (".", "./.") for t in toks[9:]) |
|
|
|
def read_blocks(vcf, min_block_size, max_missing_samples): |
|
""" |
|
A haplotype block always contain "/", then the following lines in the |
|
same block all contain "|". A line with "/" indicates the start of a |
|
new block. This function takes a VCF and yields blocks |
|
""" |
|
|
|
fh, header = ts.reader(vcf, header=False), [None] |
|
while header[0] != "#CHROM": |
|
header = next(fh) |
|
header[0] = "CHROM" |
|
yield header |
|
|
|
last_un = None |
|
for phased, block in it.groupby(ts.reader(fh, header=False), is_phased): |
|
if not phased: |
|
last_un = list(block)[-1] |
|
continue |
|
assert last_un |
|
block = [toks for toks in it.chain([last_un], block) if n_missing(toks) <= |
|
max_missing_samples] |
|
# subtract 1 for row name. |
|
if len(block) - 1 < min_block_size: continue |
|
last_un = None |
|
yield block |
|
|
|
def send_array(arr, fh=open('/tmp/t.bin', 'w')): |
|
fh.seek(0) |
|
# send shape as int |
|
array.array('l', [len(arr), len(arr[0])]).tofile(fh) |
|
# send the data as double |
|
array.array('d', [i for row in arr for i in row]).tofile(fh) |
|
fh.flush() |
|
|
|
if __name__ == "__main__": |
|
hapskat() |
Brent -
Thank you very much for sharing your code - very kind of you. I will accept this answer after a little time elapses, I do not want to discourage other answers.
Note that you have the close parenthesis as part of the hyperlink, causing the link to fail without modification.