Question

High ref mismatch rate after liftOver from 23andme hg19 to hg38

0

Entering edit mode

15 months ago

berndmann ▴ 10

I lifted some 23andme files from hg19 to hg38 using the following workflow in R calling samtools,plink and liftOver:

library(tidyverse)

#set working directory to data directory
trio_wd <- str_glue(here::here(),'/trio/K/')

#create file list for raw data
file_list <- str_c(trio_wd,dir(trio_wd)) %>% str_extract('genome.+\\d.txt') %>% str_extract('^(?:(?!admix).)+$') %>%
  unique() %>%
  {.[!is.na(.)]}  %>% str_c(trio_wd,.)

#liftover loop
for (x in file_list){
  #extract sample name
  name = str_extract(x,'(?<=genome_)(.*?)(?=(_v5|_Full))') %>% str_replace('_','')
  #recode to vcf
  call = (str_glue("plink --23file {x} {name} --snps-only just-acgt --recode vcf --out {trio_wd}{name}"))
  system(call)
  #recode to uscf bed format 
  #https://www.researchgate.net/post/How_can_I_create_a_UCSC_BED_file_from_plink_data
  call = str_c("grep -v '^#' ", trio_wd,name,".vcf | awk -F '\t' '{print $1,($2-1),$2,$3}' | sed -e 's/^/chr/' > ", trio_wd,name,".ucsc.bed ")
  system(call)
  #iftover
  #correct chain file : https://groups.google.com/a/soe.ucsc.edu/g/genome/c/BYVXWMpOEjc
  system(str_glue("liftOver  {trio_wd}{name}.ucsc.bed /reference/hg19ToHg38.over.chain.gz {trio_wd}{name}_liftOver.bed  {trio_wd}{name}unlifted.bed"))

  #update positions
#read in the original vcf file
vcf <- read_table(str_glue( "{trio_wd}{name}.vcf"), skip = 30)
#read in the liftOver output
lift <- read_table(str_glue("{trio_wd}{name}_liftOver.bed"),col_names = F)
#update the positions of the original vcf file to the lifted positions
vcf %>% filter(ID %in% lift$X4) %>% mutate(POS = lift$X3,
                                           `#CHROM` = lift$X1) %>% write_tsv(str_glue( "{trio_wd}{name}_lift.vcf"), col_names = T)
#recode to plink
call = str_glue("plink -vcf {trio_wd}{name}_lift.vcf --make-bed --aec --chr 1-22 -out {trio_wd}{name}_lift")
system(call)
}

#create list of lifted files
file_list <- list.files(trio_wd) %>%  str_extract('(.+((k|K)|P))_lift') %>%
  unique %>% {.[!is.na(.)]} %>% str_c(trio_wd,.)

#main loop for data cleaning 
for (x in file_list){
name = str_extract(x,'[A-Za-z]+(?=((k|K)|P)).+')
call = str_glue("plink2 --bfile {trio_wd}{name} --set-all-var-ids '@_#' --make-bed -out {trio_wd}{name}_fixvar")
system(call)
call = str_glue("plink2 --bfile {trio_wd}{name}_fixvar --rm-dup --make-bed -out {trio_wd}{name}_dup")
system(call)
#a few (<10 SNPs): 'duplicate IDs with inconsistent genotype data or variant information', remove if there
if(file.exists(str_glue("{trio_wd}{name}_dup.rmdup.mismatch"))){
  call = str_glue("plink -bfile {trio_wd}{name}_fixvar --exclude {trio_wd}{name}_dup.rmdup.mismatch --make-bed -out {trio_wd}{name}_dup_mis")
  system(call)
  call = str_glue("plink2 --bfile {trio_wd}{name}_dup_mis --set-all-var-ids '@_#' --make-bed -out {trio_wd}{name}_dup_mis_id")
  system(call)
  call = str_glue("plink2 --bfile {trio_wd}{name}_dup_mis_id --rm-dup --make-bed -out {trio_wd}{name}_dup")
  system(call)
}
#ensure that there are no positions with same allele coding
call = str_c("awk '$5==$6 {print $2}' " ,trio_wd, name, "_dup.bim >", trio_wd, name, "_dup_noalle.txt")
system(call)
call = str_glue("plink -bfile {trio_wd}{name}_dup --exclude {trio_wd}{name}_dup_noalle.txt  --make-bed -out {trio_wd}{name}_dup_fixal")
system(call)
}


file_list <- list.files(trio_wd) %>%  str_extract('.+_dup_fixal.bed$') %>%
  str_remove('.bed')  %>%
    {.[!is.na(.)]} %>% str_c(trio_wd,.)
write_lines(file_list,str_glue('{trio_wd}merge.txt'))

# try to merge to single file
system(str_glue("plink --merge-list {trio_wd}merge.txt --make-bed --out {trio_wd}K"))

for(x in file_list){
  system(str_glue("plink --bfile {x} --exclude {trio_wd}K-merge.missnp --make-bed --out {str_replace(x, 'fixal', 'subset')}"))
}

file_list <- list.files(trio_wd) %>%  str_extract('.+_dup_subset.bed') %>%
  str_remove('.bed')  %>%
  {.[!is.na(.)]} %>% str_c(trio_wd,.)
write_lines(file_list,str_glue('{trio_wd}merge.txt'))

# merge to single file
system(str_glue("plink --merge-list {trio_wd}merge.txt --make-bed --out {trio_wd}K"))

If do some sanity check afterwards to verify that the liftover process went well I tried the fixref plugin from bcftools to get an overview of possible ref mismatches:

#use some filtering to improve genotype rates
     call <- str_glue('plink --bfile {trio_wd}K  --snps-only just-acgt -geno 0.05 --recode vcf bgz --out {trio_wd}K ')
      system(call)
#index the compressed vcf file
      call = str_glue('bcftools index -f {trio_wd}K.vcf.gz')
      system(call)
#call fixref
      call = str_glue("bcftools +fixref {trio_wd}K.vcf.gz -- -f reference/GRCh38_full_analysis_set_plus_decoy_hla_modified.fa")
      system(call)

The results can be seen in the following screenshot:

fixref plugin result

This clearly looks to me that something went wrong but I can't figure out what the possible problem here could be. Do I use an incorrect hg38 reference file? Did I use some wrong filter? Is the liftOver procedure incorrect?

liftOver plink 23andme bcftools-fixref • 1.6k views

ADD COMMENT • link updated 10 months ago by Ram 43k • written 15 months ago by berndmann ▴ 10

0

Entering edit mode

make sure you are not 1 off Cheat Sheet For One-Based Vs Zero-Based Coordinate Systems

ADD REPLY • link 15 months ago by Jeremy Leipzig 22k

0

Entering edit mode

So when I recode from plink format to vcf I need to adjust my positions using e.g this since bed is 0-based and vcf is 1-based?

ADD REPLY • link 15 months ago by berndmann ▴ 10

0

Entering edit mode

i think plink knows what vcf is, and ($2-1) looks ok, but maybe something downstream

ADD REPLY • link 15 months ago by Jeremy Leipzig 22k

0

Entering edit mode

plink 1.9 merge will sometimes swap ref/alt alleles when they were previously set correctly, so it is necessary to postprocess its output in some manner.

ADD REPLY • link 15 months ago by chrchang523 10k

score 1 · Answer 1 · 2023-01-17

Notice that when you use BCFtools +fixref it will swap the genotypes for SNPs, but not for indels (if there are any in your 23andMe file). Also, it will not do the right thing in those few cases where neither of the original alleles matches the new GRCh38 reference, as it cannot add a new allele. You can use BCFtools and my new liftover implementation here (binaries available here). After installing the binaries and the required resources with the following commands:

wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz
gunzip human_g1k_v37.fasta.gz
samtools faidx human_g1k_v37.fasta
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
gunzip hg38.fa.gz
samtools faidx hg38.fa
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz

You can get this done very quickly with this command:

zcat genome_XXXXXX_XXXXXX_v5_Full_yyyymmddhhmmss.zip | \
bcftools convert -Ou --tsv2vcf - --columns ID,CHROM,POS,AA --fasta-ref human_g1k_v37.fasta --samples XXXXXX_XXXXXX | \
bcftools +liftover -Ou -- -s human_g1k_v37.fasta -f hg38.fa -c hg19ToHg38.over.chain.gz | \
bcftools sort -Oz -o genome_XXXXXX_XXXXXX_v5_Full_yyyymmddhhmmss.hg38.vcf.gz
bcftools index --force --tbi genome_XXXXXX_XXXXXX_v5_Full_yyyymmddhhmmss.hg38.vcf.gz

The command output will look something like this:

Writing to /tmp/bcftools.XXXXXX
Warning: source contig chrM has length 16569 in the VCF and length 16571 in the chain file
Rows total:     546556
Rows skipped:   132
Missing GTs:    3318
Hom RR:     265504
Het RA:     167683
Hom AA:     109918
Het AA:     1
Lines   total/swapped/reference added/rejected: 546424/1109/602/121
Merging 1 temporary files
Cleaning
Done

Which is a mix of the outputs of BCFtools commands convert/+liftover/sort. As shown, for a few hundred SNPs it is actually necessary to add a new reference allele