Question

Matched Paired Tumour-Normal DEA of BRCA using data downloaded using TCGAbiolinks

1

Entering edit mode

5.9 years ago

aluesley1 ▴ 60

I am trying to use BRCA data downloaded using TCGAbiolinks to do a differential expression analysis.

I want to do an analysis of the matched-paired tumour-normal samples but can't work out how to identify these cases from the samples I have.

This is the code I have so far and my attempt to subset the data -

# Query TCGA-BRCA (legacy = TRUE) for primary-tumour RNA-Seq gene expression
# quantification files from the Illumina HiSeq platform ("results" files).
query.BRCA.tumour <- GDCquery(
  project = "TCGA-BRCA",
  legacy = TRUE,
  data.category = "Gene expression",
  data.type = "Gene expression quantification",
  platform = "Illumina HiSeq",
  experimental.strategy = "RNA-Seq",
  sample.type = "Primary solid Tumor",
  file.type = "results"
)

# Download the files matched by the tumour query, 200 files per chunk.
GDCdownload(query.BRCA.tumour, files.per.chunk = 200)

# Prepare the downloaded tumour files as a SummarizedExperiment and save the
# prepared object to "BRCAtumour.rda".
prep.BRCA.tumour <- GDCprepare(
  query = query.BRCA.tumour,
  save = TRUE,
  summarizedExperiment = TRUE,
  save.filename = "BRCAtumour.rda"
)

# Query TCGA-BRCA (legacy = TRUE) for solid-tissue-normal RNA-Seq gene
# expression quantification files from the Illumina HiSeq platform.
query.BRCA.normal <- GDCquery(
  project = "TCGA-BRCA",
  legacy = TRUE,
  data.category = "Gene expression",
  data.type = "Gene expression quantification",
  platform = "Illumina HiSeq",
  experimental.strategy = "RNA-Seq",
  sample.type = "Solid Tissue Normal",
  file.type = "results"
)

# Download the files matched by the normal query, 200 files per chunk.
GDCdownload(query.BRCA.normal, files.per.chunk = 200)

# Prepare the downloaded normal files as a SummarizedExperiment and save the
# prepared object to "BRCAnormal.rda".
prep.BRCA.normal <- GDCprepare(
  query = query.BRCA.normal,
  save = TRUE,
  save.filename = "BRCAnormal.rda",
  summarizedExperiment = TRUE
)

# Keep only the normal samples whose patient ID also appears among the
# tumour samples. The mask is computed over prep.BRCA.normal's own samples.
Matched.Samples.Normal <- subset(prep.BRCA.normal, 
                                 select = colData(prep.BRCA.normal)$patient 
                                 %in% colData(prep.BRCA.tumour)$patient)


# Intended to keep the tumour samples that have a matched normal sample.
# NOTE(review): the mask below is computed from prep.BRCA.normal's patient
# IDs (one value per normal sample) but is applied to prep.BRCA.tumour, so
# it does not describe the tumour columns -- confirm the operand order.
Matched.Samples.Tumour<- subset(prep.BRCA.tumour, 
                                select = colData(prep.BRCA.normal)$patient 
                                %in% colData(prep.BRCA.tumour)$patient)

The download works perfectly and I have used the prep.BRCA objects for unmatched DEA without any trouble.

However, the results I get for Matched.Samples.Normal and Matched.Samples.Tumour are RangedSummarizedExperiments with the same number of samples as the original prep.BRCA.normal and prep.BRCA.tumour, rather than the expected 112 matched pairs that I know are available.

Can anyone shed some light as to why it isn't working and provide a solution?

Thank you.

TCGAbiolinks TCGA R • 3.2k views

ADD COMMENT • link updated 5.8 years ago by Kevin Blighe 87k • written 5.9 years ago by aluesley1 ▴ 60

score 3 · Answer 1 · 2018-06-21

Hello aluesley1,

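When you are subsetting your tumour data, you have the two sets of patient IDs in the wrong order. The logical vector passed to select must be computed over the samples of the object being subset, so for the tumour data you need to test the tumour patient IDs against the normal patient IDs, not the other way around. It should be: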

# Keep only the tumour samples whose patient ID also appears among the
# normal samples; the mask is computed over the tumour object's own samples.
Matched.Samples.Tumour <- subset(
  prep.BRCA.tumour,
  select = colData(prep.BRCA.tumour)$patient %in%
    colData(prep.BRCA.normal)$patient
)

# Check the dimensions of both matched subsets; the second number is the
# sample (column) count, which should agree between the two objects.
dim(Matched.Samples.Normal)
[1] 21022   113
dim(Matched.Samples.Tumour)
[1] 21022   113

So, there are 113 (?) matches. I have worked with this data a lot and recall seeing both 112 and 113 matched T-N pairs. I'm not sure why... the number may differ depending on the data type you are looking at.

# Strip everything from the "-01A"/"-01B" sample code to the end of each
# tumour column name, leaving only the leading patient barcode.
tumours <- gsub("-01[AB][0-9A-Z-]*$", "", colnames(Matched.Samples.Tumour))
  [1] "TCGA-BH-A0BW" "TCGA-BH-A1F0" "TCGA-GI-A2C9" "TCGA-BH-A1EV" "TCGA-E9-A1RB"
  [6] "TCGA-BH-A18V" "TCGA-BH-A0B5" "TCGA-E9-A1NA" "TCGA-BH-A0DK" "TCGA-BH-A1FR"
 [11] "TCGA-BH-A0BZ" "TCGA-BH-A0BQ" "TCGA-BH-A18R" "TCGA-E9-A1RF" "TCGA-E9-A1RC"
 [16] "TCGA-A7-A0DB" "TCGA-E2-A158" "TCGA-E9-A1N4" "TCGA-BH-A0DD" "TCGA-BH-A0B3"
 [21] "TCGA-E9-A1N5" "TCGA-BH-A1EU" "TCGA-E9-A1ND" "TCGA-BH-A204" "TCGA-BH-A1FE"
 [26] "TCGA-BH-A0E0" "TCGA-E9-A1RI" "TCGA-BH-A0AZ" "TCGA-BH-A0BJ" "TCGA-BH-A208"
 [31] "TCGA-A7-A13E" "TCGA-BH-A0BA" "TCGA-E9-A1R7" "TCGA-BH-A0BV" "TCGA-BH-A18K"
 [36] "TCGA-BH-A0C0" "TCGA-BH-A0BS" "TCGA-BH-A0HA" "TCGA-BH-A18M" "TCGA-BH-A0DT"
 [41] "TCGA-BH-A0DP" "TCGA-BH-A209" "TCGA-A7-A0CH" "TCGA-BH-A0DO" "TCGA-BH-A1FN"
 [46] "TCGA-AC-A2FM" "TCGA-BH-A1F2" "TCGA-E2-A1IG" "TCGA-E9-A1NG" "TCGA-BH-A18J"
 [51] "TCGA-A7-A0CE" "TCGA-BH-A1FC" "TCGA-BH-A1FG" "TCGA-E2-A1LB" "TCGA-E9-A1NF"
 [56] "TCGA-BH-A203" "TCGA-BH-A18N" "TCGA-BH-A18S" "TCGA-A7-A0D9" "TCGA-BH-A0AU"
 [61] "TCGA-AC-A2FB" "TCGA-E2-A1LH" "TCGA-BH-A0DQ" "TCGA-E2-A1L7" "TCGA-BH-A0H7"
 [66] "TCGA-BH-A0AY" "TCGA-BH-A0B7" "TCGA-E2-A153" "TCGA-BH-A1FJ" "TCGA-BH-A0DG"
 [71] "TCGA-E2-A15K" "TCGA-A7-A13G" "TCGA-BH-A18P" "TCGA-E2-A15I" "TCGA-BH-A18L"
 [76] "TCGA-BH-A0C3" "TCGA-BH-A0DH" "TCGA-BH-A0H5" "TCGA-BH-A18U" "TCGA-BH-A1FD"
 *et cetera*


# Strip everything from the "-11A"/"-11B" sample code to the end of each
# normal column name, leaving only the leading patient barcode.
normals <- gsub("-11[AB][0-9A-Z-]*$", "", colnames(Matched.Samples.Normal)) 
  [1] "TCGA-E2-A1L7" "TCGA-BH-A1FB" "TCGA-E9-A1RB" "TCGA-BH-A1F8" "TCGA-A7-A0CH"
  [6] "TCGA-BH-A1FM" "TCGA-BH-A18K" "TCGA-BH-A0DD" "TCGA-BH-A18Q" "TCGA-E9-A1RC"
 [11] "TCGA-BH-A0H7" "TCGA-BH-A0AY" "TCGA-E9-A1RD" "TCGA-BH-A0BQ" "TCGA-BH-A1EO"
 [16] "TCGA-GI-A2C8" "TCGA-E2-A153" "TCGA-E9-A1NF" "TCGA-BH-A18U" "TCGA-BH-A208"
 [21] "TCGA-BH-A1EU" "TCGA-BH-A0DQ" "TCGA-E2-A15I" "TCGA-BH-A0BC" "TCGA-BH-A1F2"
 [26] "TCGA-BH-A18J" "TCGA-BH-A1FN" "TCGA-BH-A1FD" "TCGA-BH-A1ET" "TCGA-BH-A0DP"
 [31] "TCGA-E9-A1NA" "TCGA-BH-A0H9" "TCGA-BH-A0DZ" "TCGA-BH-A0BV" "TCGA-BH-A0B3"
 [36] "TCGA-BH-A0B7" "TCGA-BH-A0C0" "TCGA-E2-A1IG" "TCGA-E9-A1NG" "TCGA-BH-A0BM"
 [41] "TCGA-E2-A1LB" "TCGA-BH-A0BA" "TCGA-BH-A18L" "TCGA-E2-A15M" "TCGA-BH-A0H5"
 [46] "TCGA-AC-A2FM" "TCGA-BH-A1FE" "TCGA-E2-A1LS" "TCGA-BH-A0HA" "TCGA-E9-A1N6"
 [51] "TCGA-BH-A1FC" "TCGA-E2-A1LH" "TCGA-BH-A203" "TCGA-BH-A0DG" "TCGA-BH-A0DH"
 [56] "TCGA-E9-A1N9" "TCGA-BH-A18P" "TCGA-BH-A18M" "TCGA-BH-A0AU" "TCGA-BH-A1EW"
 [61] "TCGA-AC-A23H" "TCGA-A7-A13G" "TCGA-BH-A0AZ" "TCGA-BH-A0B8" "TCGA-A7-A0D9"
 [66] "TCGA-BH-A0DO" "TCGA-E9-A1N4" "TCGA-BH-A0C3" "TCGA-E9-A1RH" "TCGA-E9-A1N5"
 [71] "TCGA-BH-A1FH" "TCGA-BH-A1F6" "TCGA-BH-A0B5" "TCGA-BH-A0BZ" "TCGA-A7-A0CE"
 [76] "TCGA-BH-A1F0" "TCGA-BH-A18R" "TCGA-E9-A1R7" "TCGA-BH-A18N" "TCGA-BH-A1FR"
 *et cetera*

# Compare the sorted patient barcodes position by position; a result that is
# TRUE everywhere means every tumour barcode has an identical normal barcode.
sort(tumours) == sort(normals)

  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE