Question

Converting a data frame to boolean values in R

0

Entering edit mode

3.6 years ago

zizigolu ★ 4.3k

I have a list of samples with their mutation status for signalling pathways, like below: 1 if mutated and 0 if not mutated.

  > dput(pathway)
structure(list(sample = c("LP6005334.DNA_H01", "LP6005334.DNA_H01", 
"LP6005334.DNA_H01", "LP6005334.DNA_H01", "LP6005334.DNA_H01", 
"LP6005334.DNA_H01", "LP6005334.DNA_H01", "LP6005334.DNA_H01", 
"LP6005334.DNA_H01", "LP6005334.DNA_H01", "LP6005334.DNA_H01", 
"LP6005334.DNA_H01", "LP6005500.DNA_D03", "LP6005500.DNA_D03", 
"LP6005500.DNA_D03", "LP6005500.DNA_D03", "LP6005500.DNA_D03", 
"LP6005500.DNA_D03", "LP6005500.DNA_D03", "LP6005500.DNA_D03", 
"LP6005500.DNA_D03", "LP6005500.DNA_D03", "LP6005500.DNA_D03", 
"LP6005500.DNA_D03", "LP6005500.DNA_D03", "LP6007600", "LP6007600", 
"LP6007600", "LP6007600", "LP6007600", "LP6007600", "LP6007600", 
"LP6007600", "LP6007600", "LP6007600", "LP6007600", "LP6007600", 
"LP6008202.DNA_B03", "LP6008202.DNA_B03", "LP6008202.DNA_B03", 
"LP6008202.DNA_B03", "LP6008202.DNA_B03", "LP6008202.DNA_B03", 
"LP6008202.DNA_B03", "LP6008202.DNA_B03", "LP6008202.DNA_B03", 
"LP6008202.DNA_B03", "LP6008202.DNA_B03", "LP6008202.DNA_B03", 
"LP6008334.DNA_A03", "LP6008334.DNA_A03", "LP6008334.DNA_A03", 
"LP6008334.DNA_A03", "LP6008334.DNA_A03", "LP6008334.DNA_A03", 
"LP6008334.DNA_A03", "LP6008334.DNA_A03", "LP6008334.DNA_A03", 
"LP6008334.DNA_A03", "LP6008334.DNA_A03", "LP6008334.DNA_A03", 
"LP6008334.DNA_A04", "LP6008334.DNA_A04", "LP6008334.DNA_A04", 
"LP6008334.DNA_A04", "LP6008334.DNA_A04", "LP6008334.DNA_A04", 
"LP6008334.DNA_A04", "LP6008334.DNA_A04", "LP6008334.DNA_A04", 
"LP6008334.DNA_A04", "LP6008334.DNA_A04", "LP6008334.DNA_A04", 
"LP6008334.DNA_B02", "LP6008334.DNA_B02", "LP6008334.DNA_B02", 
"LP6008334.DNA_B02", "LP6008334.DNA_B02", "LP6008334.DNA_B02", 
"LP6008334.DNA_B02", "LP6008334.DNA_B02", "LP6008334.DNA_B02", 
"LP6008334.DNA_B02", "LP6008334.DNA_B02", "LP6008334.DNA_B02", 
"LP6008334.DNA_C02", "LP6008334.DNA_C02", "LP6008334.DNA_C02", 
"LP6008334.DNA_C02", "LP6008334.DNA_C02", "LP6008334.DNA_C02", 
"LP6008334.DNA_C02", "LP6008334.DNA_C02", "LP6008334.DNA_C02", 
"LP6008334.DNA_C02", "LP6008334.DNA_C02", "LP6008334.DNA_C02", 
"LP6008334.DNA_D02", "LP6008334.DNA_D02", "LP6008334.DNA_D02", 
"LP6008334.DNA_D02", "LP6008334.DNA_D02", "LP6008334.DNA_D02", 
"LP6008334.DNA_D02", "LP6008334.DNA_D02", "LP6008334.DNA_D02", 
"LP6008334.DNA_D02", "LP6008334.DNA_D02", "LP6008334.DNA_D02", 
"LP6008336.DNA_F02", "LP6008336.DNA_F02", "LP6008336.DNA_F02", 
"LP6008336.DNA_F02", "LP6008336.DNA_F02", "LP6008336.DNA_F02", 
"LP6008336.DNA_F02", "LP6008336.DNA_F02", "LP6008336.DNA_F02", 
"LP6008336.DNA_F02", "LP6008336.DNA_F02", "LP6008336.DNA_F02", 
"LP6008336.DNA_G01", "LP6008336.DNA_G01", "LP6008336.DNA_G01", 
"LP6008336.DNA_G01", "LP6008336.DNA_G01", "LP6008336.DNA_G01", 
"LP6008336.DNA_G01", "LP6008336.DNA_G01", "LP6008336.DNA_G01", 
"LP6008336.DNA_G01", "LP6008336.DNA_G01", "LP6008336.DNA_G01", 
"LP6008336.DNA_H01", "LP6008336.DNA_H01", "LP6008336.DNA_H01", 
"LP6008336.DNA_H01", "LP6008336.DNA_H01", "LP6008336.DNA_H01", 
"LP6008336.DNA_H01", "LP6008336.DNA_H01", "LP6008336.DNA_H01", 
"LP6008336.DNA_H01", "LP6008336.DNA_H01", "LP6008336.DNA_H01", 
"LP6008337.DNA_A07", "LP6008337.DNA_A07", "LP6008337.DNA_A07", 
"LP6008337.DNA_A07", "LP6008337.DNA_A07", "LP6008337.DNA_A07", 
"LP6008337.DNA_A07", "LP6008337.DNA_A07", "LP6008337.DNA_A07", 
"LP6008337.DNA_A07", "LP6008337.DNA_A07", "LP6008337.DNA_A07", 
"LP6008337.DNA_H06", "LP6008337.DNA_H06", "LP6008337.DNA_H06", 
"LP6008337.DNA_H06", "LP6008337.DNA_H06", "LP6008337.DNA_H06", 
"LP6008337.DNA_H06", "LP6008337.DNA_H06", "LP6008337.DNA_H06", 
"LP6008337.DNA_H06", "LP6008337.DNA_H06", "LP6008337.DNA_H06", 
"LP6008460.DNA_A04", "LP6008460.DNA_A04", "LP6008460.DNA_A04", 
"LP6008460.DNA_A04", "LP6008460.DNA_A04", "LP6008460.DNA_A04", 
"LP6008460.DNA_A04", "LP6008460.DNA_A04", "LP6008460.DNA_A04", 
"LP6008460.DNA_A04", "LP6008460.DNA_A04", "LP6008460.DNA_A04", 
"LP6008460.DNA_D01", "LP6008460.DNA_D01", "LP6008460.DNA_D01", 
"LP6008460.DNA_D01", "LP6008460.DNA_D01", "LP6008460.DNA_D01", 
"LP6008460.DNA_D01", "LP6008460.DNA_D01", "LP6008460.DNA_D01", 
"LP6008460.DNA_D01", "LP6008460.DNA_D01", "LP6008460.DNA_D01", 
"LP6008460.DNA_F02", "LP6008460.DNA_F02", "LP6008460.DNA_F02", 
"LP6008460.DNA_F02", "LP6008460.DNA_F02", "LP6008460.DNA_F02", 
"LP6008460.DNA_F02", "LP6008460.DNA_F02", "LP6008460.DNA_F02", 
"LP6008460.DNA_F02", "LP6008460.DNA_F02", "LP6008460.DNA_F02", 
"LP6008460.DNA_G03", "LP6008460.DNA_G03", "LP6008460.DNA_G03", 
"LP6008460.DNA_G03", "LP6008460.DNA_G03", "LP6008460.DNA_G03", 
"LP6008460.DNA_G03", "LP6008460.DNA_G03", "LP6008460.DNA_G03", 
"LP6008460.DNA_G03", "LP6008460.DNA_G03", "LP6008460.DNA_G03", 
"s15", "s15", "s15", "s15", "s15", "s15", "s15", "s15", "s15", 
"s15", "s15", "s15", "s15", "s15", "s18", "s18", "s18", "s18", 
"s18", "s18", "s18", "s18", "s18", "s18", "s18", "s18", "s18", 
"s18", "s18", "s18", "s24", "s24", "s24", "s24", "s24", "s24", 
"s24", "s24", "s24", "s24", "s24", "s24", "s24", "s24", "s24", 
"s30", "s30", "s30", "s30", "s30", "s30", "s30", "s30", "s30", 
"s30", "s30", "s30", "s30", "s30", "s59", "s59", "s59", "s59", 
"s59", "s59", "s59", "s59", "s59", "s59", "s59", "s59", "s67", 
"s67", "s67", "s67", "s67", "s67", "s67", "s67", "s67", "s67", 
"s67", "s67", "s67", "s67", "s80", "s80", "s80", "s80", "s80", 
"s80", "s80", "s80", "s80", "s80", "s80", "s80", "s80", "s80", 
"s80", "s80", "s86", "s86", "s86", "s86", "s86", "s86", "s86", 
"s86", "s86", "s86", "s86", "s86", "s86", "s86", "s94", "s94", 
"s94", "s94", "s94", "s94", "s94", "s94", "s94", "s94", "s94", 
"s94", "s94", "s94"), Pathway = c("PI3K", "Cell_Cycle", "RTK-RAS", 
"WNT", "TGF-Beta", "CR", "CF", "TP53", "NOTCH", "Hippo", "MYC", 
"NRF2", "PI3K", "Cell_Cycle", "WNT", "TGF-Beta", "RTK-RAS", "WNT", 
"CR", "CF", "TP53", "NOTCH", "Hippo", "MYC", "NRF2", "TGF-Beta", 
"PI3K", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH", 
"Hippo", "MYC", "NRF2", "PI3K", "TGF-Beta", "RTK-RAS", "WNT", 
"CR", "CF", "TP53", "Cell_Cycle", "NOTCH", "Hippo", "MYC", "NRF2", 
"Cell_Cycle", "TGF-Beta", "PI3K", "RTK-RAS", "WNT", "CR", "CF", 
"TP53", "NOTCH", "Hippo", "MYC", "NRF2", "RTK-RAS", "TGF-Beta", 
"PI3K", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH", "Hippo", 
"MYC", "NRF2", "Cell_Cycle", "TGF-Beta", "PI3K", "RTK-RAS", "WNT", 
"CR", "CF", "TP53", "NOTCH", "Hippo", "MYC", "NRF2", "PI3K", 
"Cell_Cycle", "RTK-RAS", "TGF-Beta", "WNT", "CR", "CF", "TP53", 
"NOTCH", "Hippo", "MYC", "NRF2", "PI3K", "Cell_Cycle", "TGF-Beta", 
"RTK-RAS", "WNT", "CR", "CF", "TP53", "NOTCH", "Hippo", "MYC", 
"NRF2", "Cell_Cycle", "TGF-Beta", "PI3K", "RTK-RAS", "WNT", "CR", 
"CF", "TP53", "NOTCH", "Hippo", "MYC", "NRF2", "TGF-Beta", "PI3K", 
"RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH", 
"Hippo", "MYC", "NRF2", "PI3K", "Cell_Cycle", "WNT", "TGF-Beta", 
"RTK-RAS", "CR", "CF", "TP53", "NOTCH", "Hippo", "MYC", "NRF2", 
"PI3K", "TGF-Beta", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", 
"NOTCH", "Hippo", "MYC", "NRF2", "Cell_Cycle", "RTK-RAS", "TGF-Beta", 
"PI3K", "WNT", "CR", "CF", "TP53", "NOTCH", "Hippo", "MYC", "NRF2", 
"Cell_Cycle", "TGF-Beta", "PI3K", "RTK-RAS", "WNT", "CR", "CF", 
"TP53", "NOTCH", "Hippo", "MYC", "NRF2", "Cell_Cycle", "CF", 
"CR", "Hippo", "MYC", "NOTCH", "NRF2", "PI3K", "RTK-RAS", "TGF-Beta", 
"TP53", "WNT", "Cell_Cycle", "RTK-RAS", "TGF-Beta", "PI3K", "WNT", 
"CR", "CF", "TP53", "NOTCH", "Hippo", "MYC", "NRF2", "RTK-RAS", 
"TGF-Beta", "PI3K", "WNT", "CR", "CF", "TP53", "Cell_Cycle", 
"NOTCH", "Hippo", "MYC", "NRF2", "Cell_Cycle", "RTK-RAS", "TGF-Beta", 
"PI3K", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH", 
"Hippo", "MYC", "NRF2", "PI3K", "Cell_Cycle", "RTK-RAS", "WNT", 
"TGF-Beta", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", 
"NOTCH", "Hippo", "MYC", "NRF2", "PI3K", "PI3K", "Cell_Cycle", 
"RTK-RAS", "TGF-Beta", "RTK-RAS", "WNT", "CR", "CF", "TP53", 
"Cell_Cycle", "NOTCH", "Hippo", "MYC", "NRF2", "PI3K", "Cell_Cycle", 
"WNT", "TGF-Beta", "PI3K", "RTK-RAS", "WNT", "CR", "CF", "TP53", 
"Cell_Cycle", "NOTCH", "Hippo", "MYC", "NRF2", "TGF-Beta", "PI3K", 
"RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH", 
"Hippo", "MYC", "NRF2", "Cell_Cycle", "WNT", "TGF-Beta", "PI3K", 
"RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH", 
"Hippo", "MYC", "NRF2", "PI3K", "Cell_Cycle", "RTK-RAS", "WNT", 
"TGF-Beta", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", 
"NOTCH", "Hippo", "MYC", "NRF2", "PI3K", "Cell_Cycle", "WNT", 
"TGF-Beta", "PI3K", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", 
"NOTCH", "Hippo", "MYC", "NRF2", "Cell_Cycle", "RTK-RAS", "TGF-Beta", 
"PI3K", "RTK-RAS", "WNT", "CR", "CF", "TP53", "Cell_Cycle", "NOTCH", 
"Hippo", "MYC", "NRF2"), value = c(1L, 1L, 1L, 1L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L)), class = "data.frame", row.names = c(NA, -346L))

I want to convert this to a boolean matrix (wide format) in which pathways are in columns and samples are in rows

A sample gets 1 if it is mutated for a pathway and 0 if it is not mutated for that pathway, like this:

> head(p)
             sample value Cell_Cycle Hippo MYC NOTCH NRF2 PI3K TGF-Beta RTK-RAS TP53
1 LP6008334.DNA_C02     0          0     0   0     0    0    0        0       0    0
2 LP6008334.DNA_A03     0          0     0   0     0    0    0        0       0    0
3 LP6005334.DNA_H01     0          0     0   0     0    0    0        0       0    0
4 LP6008337.DNA_H06     0          0     0   0     0    0    0        0       0    0
5               s15     0          0     0   0     0    0    0        0       0    0
6 LP6008460.DNA_D01     0          0     0   0     0    0    0        0       0    0
  WNT CF CR
1   0  0  0
2   0  0  0
3   0  0  0
4   0  0  0
5   0  0  0
6   0  0  0
>

I have tried this, but all I get is zeros:

# For every distinct value of p$Pathway that is not yet a column name of `p`,
# append one new column to `p` and name it after that pathway.
# NOTE: the new column is created with array(0, nrow(p)), i.e. all zeros, and
# nothing in this loop ever writes a 1 into it, so every added column stays 0.
# NOTE: the loop variable is called `pathway`; if this runs at top level it
# rebinds any existing object named `pathway` to a single pathway name.
for (pathway in setdiff(unique(p$Pathway), colnames(p))) {
  p <- cbind(p,array(0,nrow(p)))
  colnames(p)[ncol(p)] <- pathway
}

I also have been trying

# Base-R attempt: ask for a wide table with one row per `sample` and one
# column per distinct `Pathway` value.
reshape(pathway, idvar = "sample",   timevar = "Pathway",  direction = "wide")

But this gives wrong values of 0 and 1.

Please help me

dplyr reshape r software error • 1.3k views

ADD COMMENT • link updated 3.6 years ago by rpolicastro 13k • written 3.6 years ago by zizigolu ★ 4.3k

0

Entering edit mode

Hi,

In your data you have the same sample with the same pathway, but with different values. Does this make sense to you?

              sample                Pathway value
           15 LP6005500.DNA_D03     WNT     1
           18 LP6005500.DNA_D03     WNT     0

This cannot be converted into a wider format.

António

ADD REPLY • link 3.6 years ago by antonioggsousa 3.2k

0

Entering edit mode

Thank you

The main data comes from here

First, I have a matrix of boolean values giving the mutation status of each gene in each sample, like the one below, to which I added the corresponding pathway of each gene as one column:

> dput(head(d1))
structure(list(Gene = c("ACVR2A", "AKT3", "ALK", "APC", "ARHGAP35", 
"ARID1A"), LP6008334.DNA_C02 = c(0L, 1L, 0L, 0L, 0L, 0L), LP6008334.DNA_A03 = c(0L, 
0L, 0L, 0L, 0L, 0L), LP6005334.DNA_H01 = c(0L, 1L, 0L, 0L, 0L, 
0L), LP6008337.DNA_H06 = c(0L, 0L, 0L, 0L, 0L, 0L), s15 = c(0L, 
0L, 0L, 0L, 0L, 0L), LP6008460.DNA_D01 = c(0L, 0L, 0L, 0L, 0L, 
0L), s86 = c(0L, 0L, 0L, 0L, 0L, 0L), s30 = c(0L, 0L, 0L, 0L, 
0L, 0L), LP6008460.DNA_A04 = c(0L, 0L, 0L, 0L, 0L, 0L), s24 = c(0L, 
1L, 0L, 0L, 0L, 0L), LP6008334.DNA_A04 = c(0L, 0L, 0L, 0L, 0L, 
0L), LP6008336.DNA_H01 = c(0L, 1L, 0L, 0L, 0L, 0L), LP6005500.DNA_D03 = c(0L, 
1L, 0L, 0L, 0L, 0L), LP6008334.DNA_D02 = c(0L, 1L, 0L, 0L, 0L, 
0L), LP6008334.DNA_B02 = c(0L, 0L, 0L, 0L, 0L, 0L), LP6008337.DNA_A07 = c(0L, 
1L, 0L, 0L, 0L, 0L), LP6008336.DNA_G01 = c(0L, 0L, 0L, 0L, 0L, 
0L), LP6007600 = c(0L, 0L, 0L, 0L, 0L, 0L), s94 = c(0L, 0L, 0L, 
0L, 0L, 0L), LP6008460.DNA_F02 = c(0L, 0L, 0L, 0L, 0L, 0L), LP6008336.DNA_F02 = c(0L, 
0L, 0L, 0L, 0L, 0L), s67 = c(0L, 0L, 0L, 0L, 0L, 0L), s18 = c(0L, 
1L, 0L, 0L, 0L, 0L), s80 = c(0L, 1L, 0L, 0L, 0L, 0L), LP6008460.DNA_G03 = c(0L, 
0L, 0L, 0L, 0L, 0L), LP6008202.DNA_B03 = c(0L, 1L, 0L, 0L, 0L, 
0L), s59 = c(0L, 0L, 0L, 0L, 0L, 0L), Pathway = c("TGF-Beta", 
"PI3K", "RTK-RAS", "WNT", "RTK-RAS", "CR")), row.names = c(NA, 
6L), class = "data.frame")
>

I then melt that, like this:

 > melt(d1)
    Using Gene, Pathway as id variables
        Gene  Pathway          variable value
    1   ACVR2A TGF-Beta LP6008334.DNA_C02     0
    2     AKT3     PI3K LP6008334.DNA_C02     1
    3      ALK  RTK-RAS LP6008334.DNA_C02     0
    4      APC      WNT LP6008334.DNA_C02     0
    5 ARHGAP35  RTK-RAS LP6008334.DNA_C02     0
    6   ARID1A       CR LP6008334.DNA_C02     0

Do you think I can build the boolean values per pathway directly from the first boolean matrix of mutational status, instead of feeding the melt results into the tidyverse or whatever?

ADD REPLY • link 3.6 years ago by zizigolu ★ 4.3k

1

Entering edit mode

I think that I identified your problem. The thing is that you have mutations per sample/variable per gene per pathway. So, the same pathway in the same sample/variable can have both a 0 and a 1, because it can have 2 genes belonging to the same pathway. So, you cannot just drop your genes and have a table of samples by pathways.

So, depending on what you'll do next with this wider format, you can merge the gene names with the pathway or sample names, in order to make unique combinations of pathway-gene or sample-gene. This would allow you to keep all the information, instead of randomly choosing which genes represent each pathway in the wider format (which would exclude the other genes' information from the table).

So, I don't think that you need to melt. So, if you think that my suggestion makes sense, regarding the combination of pathway-gene names into one label, in order to create unique labels, you just need to do (assume that your unmelted data frame object d1 is data):

# Label every row "Pathway_Gene" (any separator other than "_" works too).
# Data frames reject duplicated row names, so each Pathway/Gene pair must
# occur only once in `data`; wrap the labels in make.unique() if repeated
# pairs are expected.
row_labels <- paste0(data[["Pathway"]], "_", data[["Gene"]])
rownames(data) <- row_labels
# Exclude the "Gene" and "Pathway" columns with a logical mask. Unlike
# -which(...), the mask keeps all columns when neither name is present, and
# drop = FALSE keeps a data frame even when only one sample column remains.
is_annotation <- colnames(data) %in% c("Gene", "Pathway")
data <- data[, !is_annotation, drop = FALSE]
data_final <- t(data) # transpose: samples become rows

I hope this is what you're looking for,

António

ADD REPLY • link 3.6 years ago by antonioggsousa 3.2k

0

Entering edit mode

Sorry, when I tried your code with my complete d1, rather than the head of it pasted here, I got this error:

> data=d1
> d1=as.data.frame(d1)
> rownames(data) <- paste0(data[,"Pathway"], "_", data[,"Gene"]) # you might use other 
Error in `.rowNamesDF<-`(x, value = value) : 
  duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique values when setting 'row.names': ‘CF_ATM’, ‘CF_CHEK2’, ‘CR_SMARCC2’ 
>

ADD REPLY • link 3.6 years ago by zizigolu ★ 4.3k

0

Entering edit mode

You have certain samples where you have the Pathway both mutated and not mutated. You need to resolve this before pivoting.

> df %>% group_by(sample, Pathway) %>% filter(n() > 1) %>%  arrange(sample, Pathway)
# A tibble: 44 x 3
# Groups:   sample, Pathway [22]
   sample            Pathway    value
   <chr>             <chr>      <int>
 1 LP6005500.DNA_D03 WNT            1
 2 LP6005500.DNA_D03 WNT            0
 3 s15               Cell_Cycle     1
 4 s15               Cell_Cycle     0
 5 s15               RTK-RAS        1
 6 s15               RTK-RAS        0
 7 s18               Cell_Cycle     1
 8 s18               Cell_Cycle     0
 9 s18               PI3K           1
10 s18               PI3K           0
# … with 34 more rows

ADD REPLY • link 3.6 years ago by rpolicastro 13k

0

Entering edit mode

Sorry, the problem is that I don't know how to resolve this. Is there any code to identify such samples and remove them before widening? Doing that manually is tedious and error-prone.

ADD REPLY • link 3.6 years ago by zizigolu ★ 4.3k

score 1 · Answer 1 · 2020-09-04

Here is a tidyverse solution using your original data from the comments. You have multiple genes in your original data that you didn't take into consideration. Here I append them to pathway name so all values are either 0 or 1.

Data from comment.

df <- structure(list(Gene = c("ACVR2A", "AKT3", "ALK", "APC", "ARHGAP35", 
"ARID1A"), LP6008334.DNA_C02 = c(0L, 1L, 0L, 0L, 0L, 0L), LP6008334.DNA_A03 = c(0L, 
0L, 0L, 0L, 0L, 0L), LP6005334.DNA_H01 = c(0L, 1L, 0L, 0L, 0L, 
0L), LP6008337.DNA_H06 = c(0L, 0L, 0L, 0L, 0L, 0L), s15 = c(0L, 
0L, 0L, 0L, 0L, 0L), LP6008460.DNA_D01 = c(0L, 0L, 0L, 0L, 0L, 
0L), s86 = c(0L, 0L, 0L, 0L, 0L, 0L), s30 = c(0L, 0L, 0L, 0L, 
0L, 0L), LP6008460.DNA_A04 = c(0L, 0L, 0L, 0L, 0L, 0L), s24 = c(0L, 
1L, 0L, 0L, 0L, 0L), LP6008334.DNA_A04 = c(0L, 0L, 0L, 0L, 0L, 
0L), LP6008336.DNA_H01 = c(0L, 1L, 0L, 0L, 0L, 0L), LP6005500.DNA_D03 = c(0L, 
1L, 0L, 0L, 0L, 0L), LP6008334.DNA_D02 = c(0L, 1L, 0L, 0L, 0L, 
0L), LP6008334.DNA_B02 = c(0L, 0L, 0L, 0L, 0L, 0L), LP6008337.DNA_A07 = c(0L, 
1L, 0L, 0L, 0L, 0L), LP6008336.DNA_G01 = c(0L, 0L, 0L, 0L, 0L, 
0L), LP6007600 = c(0L, 0L, 0L, 0L, 0L, 0L), s94 = c(0L, 0L, 0L, 
0L, 0L, 0L), LP6008460.DNA_F02 = c(0L, 0L, 0L, 0L, 0L, 0L), LP6008336.DNA_F02 = c(0L, 
0L, 0L, 0L, 0L, 0L), s67 = c(0L, 0L, 0L, 0L, 0L, 0L), s18 = c(0L, 
1L, 0L, 0L, 0L, 0L), s80 = c(0L, 1L, 0L, 0L, 0L, 0L), LP6008460.DNA_G03 = c(0L, 
0L, 0L, 0L, 0L, 0L), LP6008202.DNA_B03 = c(0L, 1L, 0L, 0L, 0L, 
0L), s59 = c(0L, 0L, 0L, 0L, 0L, 0L), Pathway = c("TGF-Beta", 
"PI3K", "RTK-RAS", "WNT", "RTK-RAS", "CR")), row.names = c(NA, 
6L), class = "data.frame")

Solution.

library("tidyverse")

# Reshape the gene-by-sample table so that each sample is one row and each
# Gene/Pathway pair is one 0/1 column.
df <- df %>%
  # Long format: one row per Gene/Pathway/sample combination.
  pivot_longer(!c(Gene, Pathway), names_to = "sample", values_to = "val") %>%
  # Wide format: column names are built as "Gene_Pathway". If a Gene/Pathway
  # pair occurs in several rows, max() collapses them to 1 when any of those
  # rows is mutated, instead of producing list-columns.
  pivot_wider(
    names_from = c("Gene", "Pathway"),
    values_from = "val",
    values_fn = list(val = max)
  )

# A tibble: 27 x 7
   sample `ACVR2A_TGF-Bet… AKT3_PI3K `ALK_RTK-RAS` APC_WNT `ARHGAP35_RTK-R…
   <chr>             <int>     <int>         <int>   <int>            <int>
 1 LP600…                0         1             0       0                0
 2 LP600…                0         0             0       0                0
 3 LP600…                0         1             0       0                0
 4 LP600…                0         0             0       0                0
 5 s15                   0         0             0       0                0
 6 LP600…                0         0             0       0                0
 7 s86                   0         0             0       0                0
 8 s30                   0         0             0       0                0
 9 LP600…                0         0             0       0                0
10 s24                   0         1             0       0                0
# … with 17 more rows, and 1 more variable: ARID1A_CR <int>