count number of GC for a given genomic ranges
2
0
Entering edit mode
9 months ago
Alewa ▴ 170

I have Granges with start and end but I want to count number of gc in stretch of DNA. however, i'm not able to properly subset bsgenome to be able to do that. here's my approach.

> gr_pro
GRanges object with 3 ranges and 2 metadata columns:
      seqnames              ranges strand |      symbol                    key
         <Rle>           <IRanges>  <Rle> | <character>            <character>
  [1]     chr6 143510692-143512691      - |       FUCA2 FUCA2_ENSG0000000103..
  [2]     chr6   53615972-53617971      - |        GCLC GCLC_ENSG00000001084..
  [3]     chr6   41071944-41073943      + |        NFYA NFYA_ENSG00000001167..
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

##command used
lapply(gr_pro, function(x) Biostrings::countPattern("CG", BSgenome.Hsapiens.UCSC.hg38::Hsapiens[[x]]))

Error in h(simpleError(msg, call)) : 
  error in evaluating the argument 'subject' in selecting a method for function 'countPattern': error in evaluating the argument 'i' in selecting a method for function '[[': GRanges objects don't support [[, as.list(), lapply(), or unlist() at
  the moment

pls see example data

> dput(head(gene_promoters_encode1kb_chr_subset[,c("symbol", "key")], 3))
new("GRanges", seqnames = new("Rle", values = structure(6L, .Label = c("chr1", 
"chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", 
"chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", 
"chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", 
"chrY"), class = "factor"), lengths = 3L, elementMetadata = NULL, 
    metadata = list()), ranges = new("IRanges", start = c(143510692L, 
53615972L, 41071944L), width = c(2000L, 2000L, 2000L), NAMES = NULL, 
    elementType = "ANY", elementMetadata = NULL, metadata = list()), 
    strand = new("Rle", values = structure(2:1, .Label = c("+", 
    "-", "*"), class = "factor"), lengths = 2:1, elementMetadata = NULL, 
        metadata = list()), seqinfo = new("Seqinfo", seqnames = c("chr1", 
    "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", 
    "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", 
    "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", 
    "chrY"), seqlengths = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_), is_circular = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA), genome = c(NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_)), elementMetadata = new("DFrame", rownames = NULL, 
        nrows = 3L, listData = list(symbol = c("FUCA2", "GCLC", 
        "NFYA"), key = c("FUCA2_ENSG00000001036.13", "GCLC_ENSG00000001084.12", 
        "NFYA_ENSG00000001167.14")), elementType = "ANY", elementMetadata = NULL, 
        metadata = list()), elementType = "ANY", metadata = list())
bsgenome biostrings epigenetics • 800 views
ADD COMMENT
2
Entering edit mode
9 months ago
ATpoint 82k

There might be a better option, but here is something that combines the getSet() method from BSgenome to retrieve the DNA sequence from a BSgenome object based on a GRanges object, and then uses alphabetFrequency from Biostrings to get the number of ATCGs per GRanges entry, returning GC content as a slot in the original GRanges object:

library(Biostrings)
library(BSgenome.Ecoli.NCBI.20080805)
library(GenomicRanges)

gr <- GRanges(seqnames = rep(seqnames(BSgenome.Ecoli.NCBI.20080805)[1], 3),
              ranges = IRanges(start = c(300, 30000, 100000),
                               end = c(1000, 50000, 200000)))

getGC <- function(genome, ranges){

  require(BSgenome)
  require(Biostrings)
  require(GenomicRanges)

  s <- BSgenome::getSeq(genome, ranges)
  a <- Biostrings::alphabetFrequency(s)

  gc <- apply(a[,c("G","C")], 1, sum)
  al <- apply(a[,c("A", "T", "C", "G")], 1, sum)
  ranges$gc.content <- 100*gc/al
  ranges

}

getGC(BSgenome.Ecoli.NCBI.20080805, gr)

GRanges object with 3 ranges and 1 metadata column:
       seqnames        ranges strand | gc.content
          <Rle>     <IRanges>  <Rle> |  <numeric>
  [1] NC_008253      300-1000      * |    53.9230
  [2] NC_008253   30000-50000      * |    52.4374
  [3] NC_008253 100000-200000      * |    51.0375
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

No idea how this scales for GRanges objects with a large number of rows, probably poorly.

ADD COMMENT
2
Entering edit mode
9 months ago
Papyrus ★ 2.9k

Just in case the OP was asking for occurrences of the "GC" sequence, instead of GC content, and using the same example as shown by ATpoint, you could do:

library(Biostrings)
library(BSgenome.Hsapiens.UCSC.hg38)
library(GenomicRanges)

# Example GR
gr <- GRanges(seqnames = rep(seqnames(BSgenome.Hsapiens.UCSC.hg38)[1], 3),
              ranges = IRanges(start = c(300, 30000, 100000),
                               end = c(1000, 50000, 200000)))

# Get genome sequences
seqs <- BSgenome::getSeq(BSgenome.Hsapiens.UCSC.hg38, gr)

# Count the pattern at each sequence
Biostrings::vcountPattern("GC",seqs)

# stringr alternative to count the pattern:
sapply(seqs,function(x){
  stringr::str_count(as.character(x),"GC")
})
ADD COMMENT
0
Entering edit mode

Indeed, I might have misunderstood the question.

ADD REPLY

Login before adding your answer.

Traffic: 1427 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6