Question

Limma returned only positive logFC values

0

Entering edit mode

10 months ago

melissachua90 ▴ 60

I want to obtain the upregulated and downregulated genes using limma. However, all the DEGs returned by my code have positive LogFC and none are downregulated (negative LogFC). This observation is consistent across multiple distinct dataframes. Is there something wrong with my code?

library(limma)
library(edgeR)

group<- factor(type, levels=c("1a", "1b", "1c", "2a", "2b", "2c"))
design <- model.matrix(~0+group)
colnames(design) <- c("samp1a", "samp1b", "samp1c", "samp2a", "samp2b", "samp2c")
rownames(design) <- colnames(df)

contrasts <- makeContrasts(samp1a-samp1b,samp1a-samp1c,samp1a-samp2a,samp1a-samp2b,samp1a-samp2c, samp1b-samp1c,samp1b-samp2a,samp1b-samp2b,samp1b-samp2c,  samp1c-samp2a,samp1c-samp2b,samp1c-samp2c,samp2a-samp2b,samp2a-samp2c,samp2b-samp2c,levels=design)

mrna.dge <- DGEList(counts=mat, group=group)
keep <- filterByExpr(mrna.dge, design)
mrna.dge <- mrna.dge[keep,,keep.lib.sizes=FALSE]
mrna.dge <- calcNormFactors(mrna.dge)
logCPM <- cpm(mrna.dge, log=T)

fit <- lmFit(logCPM, design)
efit <- eBayes(fit, trend=TRUE)
efit <- na.omit(efit)
top.ranked.mrna <- topTable(efit, coef=ncol(design), number=100000)

# Get top DEGs
deg <- top.ranked.mrna[top.ranked.mrna$adj.P.Val < 0.05, ]
deg <- deg[order(deg$adj.P.Val, decreasing=F),]

# upregulated and downregulated DEGs
deg.up <- deg[deg$logFC > 0,] 
deg.down <- deg[deg$logFC < 0,]

Data:

> dput(mat[1:20,1:20])
structure(c(391.94, 8, 1, 159.46, 40686.22, 0, 2583, 27, 0, 2110, 
1360, 36, 1, 0, 0, 0, 80, 1722, 2590, 4478, 68.91, 75, 0, 247.06, 
11009.03, 0, 4720, 16, 0, 2359, 2013, 53, 0, 0, 0, 0, 151, 2615, 
2562, 4518, 71.9, 28, 33, 516.7, 3180.79, 0, 2768, 108, 0, 2591, 
2388, 33, 0, 0, 0, 0, 199, 3338, 2715, 5936, 325.6, 47, 4, 151.49, 
16771.52, 0, 1689, 8, 0, 1744, 1366, 149, 0, 0, 0, 1, 81, 1407, 
3010, 4139, 70.42, 199, 1, 243.76, 8407.38, 0, 3618, 91, 0, 2569, 
1928, 123, 0, 0, 0, 0, 92, 2144, 3096, 4825, 234.21, 9, 0, 288.14, 
3785.26, 4, 2936, 38, 0, 2658, 1338, 24, 0, 0, 1, 0, 200, 1957, 
1180, 5419, 36.75, 126, 0, 725.92, 12247.11, 1, 2889, 103, 0, 
2983, 2877, 170, 0, 0, 0, 0, 695, 4046, 10246, 10136, 177.65, 
1, 0, 142.16, 79779.59, 1, 3792, 61, 2, 1669, 1588, 1, 0, 0, 
0, 0, 182, 1804, 1958, 6224, 17.19, 37, 0, 784.96, 3815.74, 0, 
3441, 5, 0, 1406, 1990, 38, 0, 0, 0, 0, 117, 2177, 3708, 4412, 
47.84, 4, 0, 342.64, 5265.48, 4, 5856, 16, 0, 2229, 1864, 79, 
0, 0, 0, 0, 111, 3320, 3034, 4593, 672.33, 120, 3, 121.91, 16235.8, 
14, 2354, 84, 0, 2234, 1971, 85, 0, 0, 0, 0, 294, 1954, 3901, 
7452, 205.07, 2, 59, 223.86, 19282.71, 0, 9455, 11, 0, 1980, 
3503, 1, 0, 0, 0, 0, 140, 2853, 3180, 7491, 253.67, 24, 3, 180.83, 
15562.65, 1, 4012, 328, 0, 2296, 1462, 32, 0, 0, 0, 0, 127, 2362, 
3604, 3815, 64.81, 1, 2, 245.23, 15341.74, 17, 1495, 0, 0, 1219, 
2128, 100, 0, 0, 0, 0, 203, 2410, 2823, 4972, 133.26, 2, 4, 300.43, 
17762.11, 0, 4043, 4, 0, 1801, 1391, 39, 0, 0, 0, 2, 121, 1685, 
2699, 3679, 60, 4, 1, 174.82, 7955.45, 0, 1776, 1, 0, 850, 539, 
46, 0, 0, 0, 0, 16, 1314, 1323, 1405, 131, 2, 4, 61.48, 15515.27, 
0, 2059, 4, 0, 889, 506, 15, 0, 0, 0, 0, 14, 850, 793, 1320, 
109.29, 5, 0, 203.74, 35661.02, 1, 5089, 145, 0, 2370, 1167, 
44, 0, 0, 0, 1, 298, 1984, 2649, 4904, 25.81, 437, 1, 458.92, 
9677.4, 4, 409, 18, 0, 2923, 1626, 45, 1, 0, 0, 0, 284, 2055, 
1867, 9169, 396.46, 5, 1, 240.29, 97226.72, 55, 1908, 2, 0, 1437, 
1086, 38, 1, 0, 0, 3, 97, 1769, 2562, 5130), dim = c(20L, 20L
), dimnames = list(c("A1BG", "A1CF", "A2BP1", "A2LD1", "A2M", 
"A2ML1", "A4GALT", "A4GNT", "AAA1", "AAAS", "AACS", "AACSL", 
"AADAC", "AADACL2", "AADACL3", "AADACL4", "AADAT", "AAGAB", "AAK1", 
"AAMP"), c("TCGA.2K.A9WE.01", "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", 
"TCGA.2Z.A9J5.01", "TCGA.2Z.A9J6.01", "TCGA.2Z.A9J7.01", "TCGA.2Z.A9J8.01", 
"TCGA.2Z.A9J9.01", "TCGA.2Z.A9JD.01", "TCGA.2Z.A9JE.01", "TCGA.2Z.A9JI.01", 
"TCGA.2Z.A9JJ.01", "TCGA.2Z.A9JL.01", "TCGA.2Z.A9JO.01", "TCGA.2Z.A9JP.01", 
"TCGA.2Z.A9JQ.01", "TCGA.2Z.A9JT.01", "TCGA.4A.A93W.01", "TCGA.4A.A93X.01", 
"TCGA.4A.A93Y.01")))

> dput(design[1:20,])
structure(c(0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 
1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), dim = c(20L, 
6L), dimnames = list(c("TCGA.2K.A9WE.01", "TCGA.2Z.A9J1.01", 
"TCGA.2Z.A9J3.01", "TCGA.2Z.A9J5.01", "TCGA.2Z.A9J6.01", "TCGA.2Z.A9J7.01", 
"TCGA.2Z.A9J8.01", "TCGA.2Z.A9J9.01", "TCGA.2Z.A9JD.01", "TCGA.2Z.A9JE.01", 
"TCGA.2Z.A9JI.01", "TCGA.2Z.A9JJ.01", "TCGA.2Z.A9JL.01", "TCGA.2Z.A9JO.01", 
"TCGA.2Z.A9JP.01", "TCGA.2Z.A9JQ.01", "TCGA.2Z.A9JT.01", "TCGA.4A.A93W.01", 
"TCGA.4A.A93X.01", "TCGA.4A.A93Y.01"), c("group1a", "group1b", 
"group1c", "group2a", "group2b", "group2c")))

edgeR differential-expression deg limma • 2.6k views

ADD COMMENT • link 9 months ago by melissachua90 ▴ 60

2

Entering edit mode

Please show the design matrix or the code used to create the design matrix.

The result you report is a common occurrence when people mistakingly construct a test for an intercept instead of for a logFC, and showing the design matrix will reveal whether or not that is true. If I had to guess, I suspect you may have used a group-means design matrix and combined it with a coefficient test instead of forming contrasts.

ADD REPLY • link 10 months ago by Gordon Smyth ★ 7.0k

0

Entering edit mode

I added the dput. Thanks for the reminder!

ADD REPLY • link 10 months ago by melissachua90 ▴ 60

0

Entering edit mode

Design, not counts. Show the model.matrix. Is this even raw counts what you have there in the dput rather than any normalized values?

ADD REPLY • link 10 months ago by ATpoint 81k

0

Entering edit mode

Yeah, I added the design matrix. design <- model.matrix(~0+group). The counts is log2-normalized raw data.

ADD REPLY • link 10 months ago by melissachua90 ▴ 60

0

Entering edit mode

It'd be helpful to know how many differentially expressed genes there are. If you've got a handful and they're all upregulated, I wouldn't think anything is amiss. If you've got thousands of DE genes, all upregulated, I would be very suspicious.

ADD REPLY • link 10 months ago by LChart 3.9k

0

Entering edit mode

Several hundred to 10000 DEGs. Would normalization affect logFC?

ADD REPLY • link 10 months ago by melissachua90 ▴ 60

0

Entering edit mode

It shouldn't, unless there's a misspecification in design. Is it the case that the library sizes are very different between the conditions you are comparing? Can you spot-check a few genes on the raw counts or CPMs?

ADD REPLY • link 10 months ago by LChart 3.9k

0

Entering edit mode

No the library sizes across conditions should be relatively homogenous because it's from the same dataset. I clustered the samples into subtypes and am now comparing the differential expression across the subtypes.

ADD REPLY • link 10 months ago by melissachua90 ▴ 60

score 2 · Accepted Answer · 2023-06-07

2

Entering edit mode

10 months ago

Gordon Smyth ★ 7.0k

No, your code isn't correct. You are testing one group mean equal to zero instead of testing for differences between two groups. I wonder what comparison you were trying to make?

Please see this article https://f1000research.com/articles/9-1444 or see the limma User's Guide to how to create a design matrix.

Your code has a few other problems:

The makeContrasts command couldn't possibly work because it refers to columns that don't exist in the design matrix
The code creates contrasts but never uses them.
It is not meaningful to apply na.omit to a limma fitted model object, nor is it at all necessary
cpm() can only be run on counts but the contents of mat don't look like counts. If the expresson values are already log-transformed or normalized in some way, then running cpm is nonsensical.
Same remarks for DGEList, filterByExpr and calcNormFactors. These functions can only be used on counts, not on normalized expression values.

Postscript

OP has changed the mat output in the question since my answer was posted so that it now shows counts intead of log-normalized quantities.

ADD COMMENT • link 10 months ago by Gordon Smyth ★ 7.0k

0

Entering edit mode

I clustered the samples into subtypes and am now comparing the differential expression across the subtypes.

ADD REPLY • link 10 months ago by melissachua90 ▴ 60

0

Entering edit mode

Bottom line is, if you are using an intercept-free model (that is what ~0+... is doing) then you have to use contrasts. In a "normal" model with intercept one of the groups would be the baseline, so using a coefficient is then testing that group from coef versus the baseline (baseline is the reference group). Right now you are testing whether the particular coef is not zero which makes no sense. It would be easiest to use contrasts here, to get all the comparison you want. Again, as noted by me and Gordon, this does not look like raw counts unless it comes from a pipeline like salmon/RSEM/tximport, be sure to double-check that. My recommendation is to rework the code, make a clean environment and ensure that the entire script runs without error or warnings so you can be sure you're not using any old variables in the environment. I say this because, as Gordon says, the current contrasts command should produce an error as your design columns have the prefix "group", while the contrast command uses "samp", so if this works then because there is some leftover old design variable in the environment.

ADD REPLY • link 10 months ago by ATpoint 81k

0

Entering edit mode

The mat is log2-normalized raw data. Is it feasible? Or should I feed in counts?

ADD REPLY • link 10 months ago by melissachua90 ▴ 60

0

Entering edit mode

Raw counts, raw meaning not manipulated by any transformation. See limma and edgeR manual, it clearly covers that for using limma-trend and these edgeR normalizations.

ADD REPLY • link 10 months ago by ATpoint 81k

0

Entering edit mode

Thank you for your comments. I've updated my code:

group <- factor(type, levels=c("1a", "1b", "1c", "2a", "2b", "2c"))
design <- model.matrix(~0+group)
colnames(design) <- c("group1a", "group1b", "group1c", "group2a", "group2b", "group2c")
rownames(design) <- colnames(mat)

contrasts <- makeContrasts(group1a-group1b,group1a-group1c,group1a-group2a,group1a-group2b,group1a-group2c,
                           group1b-group1c,group1b-group2a,group1b-group2b,group1b-group2c,
                           group1c-group2a,group1c-group2b,group1c-group2c,
                           group2a-group2b,group2a-group2c,
                           group2b-group2c,
                           levels=design)

mrna.dge <- DGEList(counts=mat, group=group)
keep <- filterByExpr(mrna.dge, design)
mrna.dge <- mrna.dge[keep,,keep.lib.sizes=FALSE]
mrna.dge <- calcNormFactors(mrna.dge)

v <- voom(mrna.dge, design, plot=TRUE)
fit <- lmFit(v, design)
efit <- eBayes(fit, trend=TRUE)
top.ranked.mrna <- topTable(efit, coef=ncol(design), number=100000)

Data:

> dput(mat[1:20,1:20])
structure(c(391.94, 8, 1, 159.46, 40686.22, 0, 2583, 27, 0, 2110, 
1360, 36, 1, 0, 0, 0, 80, 1722, 2590, 4478, 68.91, 75, 0, 247.06, 
11009.03, 0, 4720, 16, 0, 2359, 2013, 53, 0, 0, 0, 0, 151, 2615, 
2562, 4518, 71.9, 28, 33, 516.7, 3180.79, 0, 2768, 108, 0, 2591, 
2388, 33, 0, 0, 0, 0, 199, 3338, 2715, 5936, 325.6, 47, 4, 151.49, 
16771.52, 0, 1689, 8, 0, 1744, 1366, 149, 0, 0, 0, 1, 81, 1407, 
3010, 4139, 70.42, 199, 1, 243.76, 8407.38, 0, 3618, 91, 0, 2569, 
1928, 123, 0, 0, 0, 0, 92, 2144, 3096, 4825, 234.21, 9, 0, 288.14, 
3785.26, 4, 2936, 38, 0, 2658, 1338, 24, 0, 0, 1, 0, 200, 1957, 
1180, 5419, 36.75, 126, 0, 725.92, 12247.11, 1, 2889, 103, 0, 
2983, 2877, 170, 0, 0, 0, 0, 695, 4046, 10246, 10136, 177.65, 
1, 0, 142.16, 79779.59, 1, 3792, 61, 2, 1669, 1588, 1, 0, 0, 
0, 0, 182, 1804, 1958, 6224, 17.19, 37, 0, 784.96, 3815.74, 0, 
3441, 5, 0, 1406, 1990, 38, 0, 0, 0, 0, 117, 2177, 3708, 4412, 
47.84, 4, 0, 342.64, 5265.48, 4, 5856, 16, 0, 2229, 1864, 79, 
0, 0, 0, 0, 111, 3320, 3034, 4593, 672.33, 120, 3, 121.91, 16235.8, 
14, 2354, 84, 0, 2234, 1971, 85, 0, 0, 0, 0, 294, 1954, 3901, 
7452, 205.07, 2, 59, 223.86, 19282.71, 0, 9455, 11, 0, 1980, 
3503, 1, 0, 0, 0, 0, 140, 2853, 3180, 7491, 253.67, 24, 3, 180.83, 
15562.65, 1, 4012, 328, 0, 2296, 1462, 32, 0, 0, 0, 0, 127, 2362, 
3604, 3815, 64.81, 1, 2, 245.23, 15341.74, 17, 1495, 0, 0, 1219, 
2128, 100, 0, 0, 0, 0, 203, 2410, 2823, 4972, 133.26, 2, 4, 300.43, 
17762.11, 0, 4043, 4, 0, 1801, 1391, 39, 0, 0, 0, 2, 121, 1685, 
2699, 3679, 60, 4, 1, 174.82, 7955.45, 0, 1776, 1, 0, 850, 539, 
46, 0, 0, 0, 0, 16, 1314, 1323, 1405, 131, 2, 4, 61.48, 15515.27, 
0, 2059, 4, 0, 889, 506, 15, 0, 0, 0, 0, 14, 850, 793, 1320, 
109.29, 5, 0, 203.74, 35661.02, 1, 5089, 145, 0, 2370, 1167, 
44, 0, 0, 0, 1, 298, 1984, 2649, 4904, 25.81, 437, 1, 458.92, 
9677.4, 4, 409, 18, 0, 2923, 1626, 45, 1, 0, 0, 0, 284, 2055, 
1867, 9169, 396.46, 5, 1, 240.29, 97226.72, 55, 1908, 2, 0, 1437, 
1086, 38, 1, 0, 0, 3, 97, 1769, 2562, 5130), dim = c(20L, 20L
), dimnames = list(c("A1BG", "A1CF", "A2BP1", "A2LD1", "A2M", 
"A2ML1", "A4GALT", "A4GNT", "AAA1", "AAAS", "AACS", "AACSL", 
"AADAC", "AADACL2", "AADACL3", "AADACL4", "AADAT", "AAGAB", "AAK1", 
"AAMP"), c("TCGA.2K.A9WE.01", "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01", 
"TCGA.2Z.A9J5.01", "TCGA.2Z.A9J6.01", "TCGA.2Z.A9J7.01", "TCGA.2Z.A9J8.01", 
"TCGA.2Z.A9J9.01", "TCGA.2Z.A9JD.01", "TCGA.2Z.A9JE.01", "TCGA.2Z.A9JI.01", 
"TCGA.2Z.A9JJ.01", "TCGA.2Z.A9JL.01", "TCGA.2Z.A9JO.01", "TCGA.2Z.A9JP.01", 
"TCGA.2Z.A9JQ.01", "TCGA.2Z.A9JT.01", "TCGA.4A.A93W.01", "TCGA.4A.A93X.01", 
"TCGA.4A.A93Y.01")))

> dput(design[1:20,])
structure(c(0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 
1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), dim = c(20L, 
6L), dimnames = list(c("TCGA.2K.A9WE.01", "TCGA.2Z.A9J1.01", 
"TCGA.2Z.A9J3.01", "TCGA.2Z.A9J5.01", "TCGA.2Z.A9J6.01", "TCGA.2Z.A9J7.01", 
"TCGA.2Z.A9J8.01", "TCGA.2Z.A9J9.01", "TCGA.2Z.A9JD.01", "TCGA.2Z.A9JE.01", 
"TCGA.2Z.A9JI.01", "TCGA.2Z.A9JJ.01", "TCGA.2Z.A9JL.01", "TCGA.2Z.A9JO.01", 
"TCGA.2Z.A9JP.01", "TCGA.2Z.A9JQ.01", "TCGA.2Z.A9JT.01", "TCGA.4A.A93W.01", 
"TCGA.4A.A93X.01", "TCGA.4A.A93Y.01"), c("group1a", "group1b", 
"group1c", "group2a", "group2b", "group2c")))

ADD REPLY • link 10 months ago by melissachua90 ▴ 60

0

Entering edit mode

No, the updated code is no more correct than your original code.

Before you undertake any differential expression analysis, you need to have clearly in mind

what sort of expression data you are inputing to the analysis (where is it from, how has it been normalized) and
what statistical hypothesis you want to test

There is still no clarity about either of these things in your post. If you were clear about these fundamental questions, then a DE analysis would be quick and simple.

ADD REPLY • link 10 months ago by Gordon Smyth ★ 7.0k

0

Entering edit mode

My expression data is raw RNA-seq counts
I'm testing the differential expression across all the groups

I added my dput

ADD REPLY • link 10 months ago by melissachua90 ▴ 60

0

Entering edit mode

The code should be:

library(edgeR)
group<- factor(type)
design <- model.matrix(~group)
mrna.dge <- DGEList(counts=mat, group=group)
keep <- filterByExpr(mrna.dge, design)
mrna.dge <- mrna.dge[keep,,keep.lib.sizes=FALSE]
mrna.dge <- calcNormFactors(mrna.dge)
logCPM <- cpm(mrna.dge, log=TRUE)
fit <- lmFit(logCPM, design)
efit <- eBayes(fit, trend=TRUE)
top.ranked.mrna <- topTable(efit, number=Inf)

It is similar to but simpler than your original code.

ADD REPLY • link 10 months ago by Gordon Smyth ★ 7.0k

0

Entering edit mode

Thank you so much, Gordon!

ADD REPLY • link 10 months ago by melissachua90 ▴ 60

0

Entering edit mode

I'm still confused, unfortunately. I clustered my samples into 6 arbitrary subtypes and now want to identify the differential expression across the subtypes. To "reduce the dimensionality", I retained only the highly-correlated samples and removed outliers before running Limma. However, my code still did not return any differential expression with <0.05 adjusted P-value. Even those that have <0.05 p-value did not show up properly in my heatmap (plotted after z-scale normalization).

May I know whether I made any mistake in my code?

################
# Remove outliers #
################
rm_outlier <- \(x) {
  Q <- quantile(x, probs=c(.25, .75), na.rm=FALSE)
  iqr <- IQR(x)
  up <-  Q[2] + 1.5*iqr # Upper Range  
  low <- Q[1] - 1.5*iqr # Lower Range
  x[x <= (Q[1] - 1.5*iqr) | x >= (Q[2] + 1.5*iqr)] <- NA_real_
  x
}

mat.subset.1a <- t(apply(X=mat.1a, MARGIN=1, FUN=rm_outlier))
mat.subset.1a <- na.omit(mat.subset.1a)

mat.subset.1b <- t(apply(X=mat.1b, MARGIN=1, FUN=rm_outlier))
mat.subset.1b <- na.omit(mat.subset.1b)

mat.subset.1c <- t(apply(X=mat.1c, MARGIN=1, FUN=rm_outlier))
mat.subset.1c <- na.omit(mat.subset.1c)

mat.subset.2a <- t(apply(X=mat.2a, MARGIN=1, FUN=rm_outlier))
mat.subset.2a <- na.omit(mat.subset.2a)

mat.subset.2b <- t(apply(X=mat.2b, MARGIN=1, FUN=rm_outlier))
mat.subset.2b <- na.omit(mat.subset.2b)

mat.subset.2c <- t(apply(X=mat.2c, MARGIN=1, FUN=rm_outlier))
mat.subset.2c <- na.omit(mat.subset.2c)

merge.rows <- c(rownames(mat.subset.1a), rownames(mat.subset.1b), rownames(mat.subset.1c), rownames(mat.subset.2a), rownames(mat.subset.2b), rownames(mat.subset.2c))
merge.rows <- merge.rows[!duplicated(merge.rows)]
mat.subset <- mat[rownames(mat) %in% merge.rows,]

###########################
# Find highly correlated samples #
###########################
mat.cor.1a <- findCorrelation(cor(mat.subset.1a), cutoff=0.9)
mat.cor.1a <- mat.subset.1a[,mat.cor.1a]
mat.subset.1a <- mat.subset.1a[,order(colnames(mat.cor.1a))]
mat.cor.1b <- findCorrelation(cor(mat.subset.1b), cutoff=0.9)
mat.cor.1b <- mat.subset.1b[,mat.cor.1b]
mat.subset.1b <- mat.subset.1b[,order(colnames(mat.cor.1b))]
mat.cor.1c <- findCorrelation(cor(mat.subset.1c), cutoff=0.9)
mat.cor.1c <- mat.subset.1c[,mat.cor.1c]
mat.subset.1c <- mat.subset.1c[,order(colnames(mat.cor.1c))]
mat.cor.2a <- findCorrelation(cor(mat.subset.2a), cutoff=0.9)
mat.cor.2a <- mat.subset.2a[,mat.cor.2a]
mat.subset.2a <- mat.subset.2a[,order(colnames(mat.cor.2a))]
mat.cor.2b <- findCorrelation(cor(mat.subset.2b), cutoff=0.9)
mat.cor.2b <- mat.subset.2b[,mat.cor.2b]
mat.subset.2b <- mat.subset.2b[,order(colnames(mat.cor.2b))]
mat.cor.2c <- findCorrelation(cor(mat.subset.2c), cutoff=0.9)
mat.cor.2c <- mat.subset.2c[,mat.cor.2c]
mat.subset.2c <- mat.subset.2c[,order(colnames(mat.cor.2c))]

merge.cols <- c(colnames(mat.subset.1a), colnames(mat.subset.1b), colnames(mat.subset.1c), colnames(mat.subset.2a), colnames(mat.subset.2b), colnames(mat.subset.2c))
merge.cols <- merge.cols[!duplicated(merge.cols)]
mat.subset <- mat.subset[, colnames(mat.subset) %in% merge.cols]

clin.info.subset <- clin.info[rownames(clin.info) %in% colnames(mat.subset),]
mat.subset <- mat.subset[,colnames(mat.subset) %in% rownames(clin.info)]
mat.subset <- mat.subset[,order(match(colnames(mat.subset),colnames(mat)))]

########
# Limma #
########

    clin.subset <- clin.info[rownames(clin.info) %in% colnames(mat.subset),]
    group <- factor(clin.subset$subtype, levels=c("1a", "1b", "1c", "2a", "2b", "2c"))
    #survival <- factor(clin.subset$survival, levels=c("lts", "non-lts"))
    #design <- model.matrix(~0+group+survival)
    design <- model.matrix(~0+group)
    rownames(design) <- rownames(clin.subset)
    #colnames(design) <- c("KIRP1a", "KIRP1b", "KIRP1c", "KIRP2a", "KIRP2b", "KIRP2c", "survival")
    colnames(design) <- c("KIRP1a", "KIRP1b", "KIRP1c", "KIRP2a", "KIRP2b", "KIRP2c")

    mrna.dge <- DGEList(counts=mat.subset, group=group)
    keep <- filterByExpr(mrna.dge, design)
    mrna.dge <- mrna.dge[keep,,keep.lib.sizes=FALSE]
    mrna.dge <- calcNormFactors(mrna.dge)

    v <- voom(mrna.dge, design, plot=TRUE, normalize.method="cyclicloess")
    fit <- lmFit(v, design)
    contrasts.matrix <- makeContrasts("KIRP1a-KIRP1b", "KIRP1a-KIRP1c", "KIRP1a-KIRP2a", "KIRP1a-KIRP2b", "KIRP1a-KIRP2c",
                                      "KIRP1b-KIRP1c", "KIRP1b-KIRP2a", "KIRP1b-KIRP2b", "KIRP1b-KIRP2c",
                                      "KIRP1c-KIRP2a", "KIRP1c-KIRP2b", "KIRP1c-KIRP2c",
                                      "KIRP2a-KIRP2b", "KIRP2a-KIRP2c",
                                      "KIRP2b-KIRP2c",
                                      levels=design)
    fit2 <- contrasts.fit(fit, contrasts.matrix)
    efit <- eBayes(fit2, trend=TRUE)
    top.ranked <- topTable(efit, coef=ncol(design), number=Inf)
    top.ranked <- top.ranked[top.ranked$P.Value < 0.05, ]

# Upregulated DEGs
deg.up <- top.ranked[top.ranked$P.Value < 0.05 & top.ranked$logFC > 0, ]
deg.up <- deg.up[order(deg.up$P.Value),]

# Downregulated DEGs
deg.down <- top.ranked[top.ranked$P.Value < 0.05 & top.ranked$logFC < 0, ]
deg.down <- deg.down[order(deg.down$P.Value),]
all.deg <- rbind(deg.up, deg.down)

# Top 30 DEGs
top.deg <- rbind(deg.up[1:15,], deg.down[1:15,])
top.deg <- v$E[rownames(top.deg),]
top.deg <- top.deg[order(match(rownames(top.deg),rownames(all.deg))),]

ADD REPLY • link 9 months ago by melissachua90 ▴ 60

0

Entering edit mode

I am out here. You are ignoring the recommendations given to you so it's a waste of time. Read my lenghty comment and the answers again, or not, and apply it, or not.