how to get genomic locations of Sequence patterns in R
0
1
Entering edit mode
6 months ago
Alewa ▴ 170

Hi, my current implementation returns the indexes of input genomic range where "CG" are found but i want to get the genomic location matching the pattern. pls see code and example data. thank you in advance for your help

getMotif_sites <- function(ref_genome=NULL, grList=NULL, context="CG"){
        library(Biostrings)
  # Load ref genome of choice 
  genome_pkg <- switch(ref_genome,
                   mm10 = "BSgenome.Mmusculus.UCSC.mm10",
                   hg38 = "BSgenome.Hsapiens.UCSC.hg38",
                   stop("Invalid ref_genome specified")
  )

    # Load the actual genome data
    library(genome_pkg, character.only = TRUE)
    genome_data <- get(genome_pkg)


#get seq of the granges in question x=grList[[1]]
    seqGrList <- lapply(grList, function(x) {
        # print(seqnames(x))
        seqGr <- getSeq(genome_data, x) #get seqs from ref genome
        motif_locations <- Biostrings::vmatchPattern(context, seqGr) #get cpg sites


        # Calculate the starts based on the ends and the width
ends <- motif_locations@ends[[1]]
width <- nchar(context)
starts <- ends - width + 1

# Convert to GRanges
        gr <- GRanges(seqnames = seqnames(x),
                      ranges = IRanges(start = starts, end = ends))

        })

        return(GRangesList(seqGrList))
}

data

>dput(gr)
new("GRanges", seqnames = new("Rle", values = structure(1L, levels = "chr1", class = "factor"), 
    lengths = 1L, elementMetadata = NULL, metadata = list()), 
    ranges = new("IRanges", start = 177728813L, width = 2000L, 
        NAMES = NULL, elementType = "ANY", elementMetadata = NULL, 
        metadata = list()), strand = new("Rle", values = structure(3L, levels = c("+", 
    "-", "*"), class = "factor"), lengths = 1L, elementMetadata = NULL, 
        metadata = list()), seqinfo = new("Seqinfo", seqnames = "chr1", 
        seqlengths = NA_integer_, is_circular = NA, genome = NA_character_), 
    elementMetadata = new("DFrame", rownames = NULL, nrows = 1L, 
        elementType = "ANY", elementMetadata = NULL, metadata = list(), 
        listData = structure(list(), names = character(0))), 
    elementType = "ANY", metadata = list())
R GenomicRanges Biostrings bioconductor • 323 views
ADD COMMENT

Login before adding your answer.

Traffic: 1827 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6