Filter SNPs within VCF / .ChromR object using a custom list of mouse ensembl ids.
2
0
Entering edit mode
2.2 years ago
Kyle ▴ 10

Hi,

I have some mouse strain specific vcf files from ftp://ftp-mouse.sanger.ac.uk/. I've aligned them to the GRCm38_68 reference genome and created chromR objects. Is there a simple way to filter these SNPs (either unlaigned as vcfs or as the Chrom.R objects) using a custom list of mouse emsemble gene ids?

Code that I'm trying to use is below:

# Packages
#install.packages("vcfR")
#BiocManager::install("org.Mm.eg.db")
#BiocManager::install("TxDb.Mmusculus.UCSC.mm10.knownGene")

library(vcfR)
library(org.Mm.eg.db)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)

# Load vcfs and  reference seq
bl6_mouse <- read.vcfR("C57BL_6NJ.mgp.v5.snps.dbSNP142.vcf.gz", verbose = FALSE )
C3H_mouse <- read.vcfR("C3H_HeH.mgp.v5.snps.dbSNP142.vcf.gz", verbose = FALSE )
C57BL6J_ref <- ape::read.dna("GRCm38_68.fa", format="fasta")

# TF list from embflow
embflow_tfs <- read.table("all_tfs_list.txt", sep='\t', header = T)

# convert to ensemble mouse IDs
embflow_ensmu <- mapIds(org.Mm.eg.db, embflow_tfs$x,"ENSEMBL","SYMBOL")


# BL6 SNPs
BL6_chrom <- create.chromR(name='Bl6_Supercontig', vcf=bl6_mouse, seq=C57BL6J_ref)
BL6_chrom <- masker(BL6_chrom, min_DP = 0, max_DP = 450)
BL6_chrom <- proc.chromR(BL6_chrom, verbose=TRUE)
chromoqc(BL6_chrom, dp.alpha=20)

# C3H SNPs
C3H_chrom <- create.chromR(name='C3H_Supercontig', vcf=C3H_mouse, seq=C57BL6J_ref)
C3H_chrom <- masker(C3H_chrom, min_DP = 0, max_DP = 27)
C3H_chrom <- proc.chromR(C3H_chrom, verbose=TRUE)

chromoqc(C3H_chrom, dp.alpha=20)

Kind Regards,
Kyle Drover

vcfR R • 655 views
ADD COMMENT
1
Entering edit mode
2.2 years ago
abedkurdi10 ▴ 190

Dear Kyle,

You can use bcftools tool in command line.

ADD COMMENT
0
Entering edit mode
2.2 years ago
Kyle ▴ 10

So just for anybody else working on data from the Mouse Genomes Project (ftp://ftp-mouse.sanger.ac.uk/) - there is an obscure annotation file within the web data folder.

From there you can filter using VariantAnnotation:locateVaraits function.

Code below (assumes annotations file is a .csv with the mouse gene symbols as a column called 'Symbol'):

library(VariantAnnotation)
library(org.Mm.eg.db)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)

load_data <- function(control_vcf_file_path, test_vcf_file_path){
   #' load test and control SNPs and remove like SNPs from the dataset
   ctrl_vcf <- readVcf(open(VcfFile(file = control_vcf_file_path,
                               index = "mouse-snps-all.annots.vcf.gz.tbi")))

   test_vcf <- readVcf(open(VcfFile(file = test_vcf_file_path,
                               index = "mouse-snps-all.annots.vcf.gz.tbi")))
   # Filter Control SNPs from the test SNP dataset
   # This also removes duplicate SNPs
   vcf <- test_vcf[!ctrl_vcf@rowRanges@ranges %in% test_vcf@rowRanges@ranges]
   return(vcf)
   }


get_annos <- function(csv_file_path){
    #' Finds the appropriate column of gene IDs, 
    #' extracts the gene_IDs and converts them to Ensemble Mouse IDs

    raw_anno <- read.csv(csv_file_path, stringsAsFactors=F)

    entrezIDs <- mapIds(org.Mm.eg.db, raw_anno$Symbol,"ENTREZID","SYMBOL")

    return(entrezIDs)
}

SNP_filter <- function(vcf, entrez_ID_list, var_type){
     #load known all mouse genes
     txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene

     #match chromosome names to txdb
     seqlevels(vcf) <- c("chr1", "chr10", "chr11", "chr12",
                  "chr13", "chr14", "chr15", "chr16",
                  "chr17", "chr18", "chr19", "chr2",
                  "chr3", "chr4", "chr5", "chr6", 
                  "chr7", "chr8", "chr9","chrM", "chrX", 
                  "chrY")

     # Locate Variants
     loc <- locateVariants(rowRanges(vcf), txdb, var_type)

     # filter by mouse ensembl gene IDs
     loc[mcols(loc)$GENEID %in% na.omit(entrez_ID_list)]

 }

 SNP_data <- load_data("C57BL_6NJ.mgp.v5.snps.dbSNP142.vcf.gz","C3H_HeH.mgp.v5.snps.dbSNP142.vcf.gz")
 annos <- get_annos("Nodal_Signalling_GO.csv")
 SNP_filter <- SNP_filter(SNP_data, annos, CodingVariants())
ADD COMMENT

Login before adding your answer.

Traffic: 2390 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6