Tutorial: Generating a count matrix from STAR counts in GDC v32.0 for RNA-Seq
0
1
Entering edit mode
10 weeks ago
dare_devil ★ 1.6k
## Load the required library
library('TCGAbiolinks')
## Project whose files are queried and downloaded below
project_name <- "TCGA-ACC"

## Describe the files wanted from the GDC: RNA-Seq gene expression
## quantification produced by the "STAR - Counts" workflow
query <- GDCquery(
  project = project_name,
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  experimental.strategy = "RNA-Seq",
  workflow.type = "STAR - Counts"
)

## Metadata table: first entry of the first component of the query object
metadata <- query[[1]][[1]]

## Fetch every file matched by the query through the GDC API
GDCdownload(query = query, method = "api")

## Build two gene-by-sample tables from the downloaded STAR files:
##   tpm_data_frame   - column 7 of every file (TPM values)
##   count_data_frame - column 4 of every file (counts)
## In both, the first column holds the gene identifiers (column 1 of the
## files) and every following column is one downloaded file, named after it.

## Directory where the downloaded data for this project is stored
main_dir <- file.path("GDCdata", project_name)

## Full paths of the downloaded tables. Only *.tsv files are kept so that any
## other file found under the project directory is not parsed as a count
## table. NOTE(review): assumes the STAR tables end in ".tsv" - confirm.
file_list <- list.files(main_dir, pattern = "\\.tsv$", recursive = TRUE,
                        full.names = TRUE)
n_files <- length(file_list)
if (n_files == 0) {
  stop("No .tsv files found under '", main_dir,
       "'; run the download step first.", call. = FALSE)
}

## One slot per file, allocated up front: growing a data frame with cbind()
## inside the loop copies it on every iteration
tpm_columns <- vector("list", n_files)
count_columns <- vector("list", n_files)
gene_ids <- NULL

for (i in seq_len(n_files)) {
  ## Read table
  star_tab <- read.table(file = file_list[i], sep = "\t", header = TRUE)
  ## Delete the first four rows, which don't contain useful per-gene values
  star_tab <- star_tab[-seq_len(4), ]

  ## Columns are combined by position, so every file must list the same
  ## genes in the same order as the first one
  if (is.null(gene_ids)) {
    gene_ids <- star_tab[, 1]
  } else if (!identical(star_tab[, 1], gene_ids)) {
    stop("Gene identifiers in '", file_list[i],
         "' do not match those in '", file_list[1], "'.", call. = FALSE)
  }

  tpm_columns[[i]] <- star_tab[, 7]
  count_columns[[i]] <- star_tab[, 4]

  ## Print progress from 0 to 1
  print(i / n_files)
}

## Label every sample column with its file name so it can be traced back to
## a sample. NOTE(review): presumably these names can be matched against the
## query metadata to obtain sample barcodes - verify before relying on it.
names(tpm_columns) <- basename(file_list)
names(count_columns) <- basename(file_list)

## check.names = FALSE keeps the file names intact as column names
tpm_data_frame <- do.call(
  data.frame,
  c(list(gene_id = gene_ids), tpm_columns, list(check.names = FALSE))
)
count_data_frame <- do.call(
  data.frame,
  c(list(gene_id = gene_ids), count_columns, list(check.names = FALSE))
)
TCGA Counts v32.0 Biolinks GDC STAR • 432 views
ADD COMMENT

Login before adding your answer.

Traffic: 707 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6