Question: How can I separate column names in to different columns to perform downstream analysis?
1
gravatar for Wox
2.4 years ago by
Wox340
HUJI
Wox340 wrote:

I have a big data matrix and each column has named with multiple information and separated by an underscore. e.g.: Genotype, Tissue, Time, Treatment, and Replication (e.g.WT_Shoot_0t_NTrt_1)

A sample of my data frame;

structure(list(Proteins = c("SnrK", "MAPKK", "PP2C"), WT_Shoot_0t_NTrt_1 = c(0.580784899, 1.210078166, 1.505880218), WT_Shoot_0t_NTrt_2 = c(0.957816536, 1.42644091, 0.943047498), WT_Shoot_0t_NTrt_3 = c(0.559338535, 1.481513748, 1.114371918), WT_Shoot_1t_Trt_1 = c(0.831382253, 1.478551276, 0.837832395), WT_Shoot_1t_Trt_2 = c(1.180515054, 1.445100969, 1.18151722), WT_Shoot_1t_Trt_3 = c(1.332735497, 1.515484415, 0.99774335), WT_root_0t_NTrt_1 = c(1.717008073, 2.048229681, 1.448233358), WT_root_0t_NTrt_2 = c(1.431501693, 1.850835296, 1.128499829), WT_root_0t_NTrt_3 = c(1.752086402, 2.047380811, 1.190984777), WT_root_1t_Trt_1 = c(1.368684187, 1.507348975, 1.531142731), WT_root_1t_Trt_2 = c(1.204974777, 1.440904968, 1.103257306), WT_root_1t_Trt_3 = c(0.996016342, 1.630774074, 1.141581901), mut1_Shoot_0t_NTrt_1 = c(1.05451186, 1.916352545, 1.030983014), mut1_Shoot_0t_NTrt_2 = c(1.54792871, 1.676837161, 1.244400719), mut1_Shoot_0t_NTrt_3 = c(1.318611728, 1.613611, 1.28740667), mut1_Shoot_1t_Trt_1 = c(1.551790106, 1.619609895, 1.097308351), mut1_Shoot_1t_Trt_2 = c(1.638951097, 1.437759761, 1.139143972), mut1_Shoot_1t_Trt_3 = c(1.18670455, 1.530006726, 1.583110853), mut1_root_0t_NTrt_1 = c(0.981436287, 0.5156177, 0.799418798), mut1_root_0t_NTrt_2 = c(1.143837649, 0.772921721, 1.098218628), mut1_root_0t_NTrt_3 = c(1.163352788, 1.371823855, 1.278531528), mut1_root_1t_Trt_1 = c(1.13334394, 0.768721169, 1.155071974), mut1_root_1t_Trt_2 = c(1.015317761, 0.838696502, 0.9622491), mut1_root_1t_Trt_3 = c(1.961461109, 0.697184247, 0.926734427)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(Proteins = structure(list(), class = c("collector_character", "collector")), WT_Shoot_0t_NTrt_1 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_0t_NTrt_2 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_0t_NTrt_3 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_1t_Trt_1 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_1t_Trt_2 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_1t_Trt_3 = structure(list(), class = c("collector_double", "collector")), WT_root_0t_NTrt_1 = structure(list(), class = c("collector_double", "collector")), WT_root_0t_NTrt_2 = structure(list(), class = c("collector_double", "collector")), WT_root_0t_NTrt_3 = structure(list(), class = c("collector_double", "collector")), WT_root_1t_Trt_1 = structure(list(), class = c("collector_double", "collector")), WT_root_1t_Trt_2 = structure(list(), class = c("collector_double", "collector")), WT_root_1t_Trt_3 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_0t_NTrt_1 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_0t_NTrt_2 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_0t_NTrt_3 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_1t_Trt_1 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_1t_Trt_2 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_1t_Trt_3 = structure(list(), class = c("collector_double", "collector")), mut1_root_0t_NTrt_1 = structure(list(), class = c("collector_double", "collector")), mut1_root_0t_NTrt_2 = structure(list(), class = c("collector_double", "collector")), mut1_root_0t_NTrt_3 = structure(list(), class = c("collector_double", "collector")), mut1_root_1t_Trt_1 = structure(list(), class = c("collector_double", "collector")), mut1_root_1t_Trt_2 = structure(list(), class = c("collector_double", "collector")), mut1_root_1t_Trt_3 = structure(list(), class = c("collector_double", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))

How can I make a table to like below to process downstream statistical analysis (i.e. ANOVA, Tukey) Sample Table

rna-seq R gene • 608 views
ADD COMMENTlink modified 2.4 years ago by tokeemdtareq40 • written 2.4 years ago by Wox340
6
gravatar for Chirag Parsania
2.4 years ago by
Chirag Parsania1.9k
University of Macau
Chirag Parsania1.9k wrote:

Convert from wide to long format, then separate delimited strings to new columns.

library(tidyverse)
dd <- structure(list(Proteins = c("SnrK", "MAPKK", "PP2C"), WT_Shoot_0t_NTrt_1 = c(0.580784899, 1.210078166, 1.505880218), WT_Shoot_0t_NTrt_2 = c(0.957816536, 1.42644091, 0.943047498), WT_Shoot_0t_NTrt_3 = c(0.559338535, 1.481513748, 1.114371918), WT_Shoot_1t_Trt_1 = c(0.831382253, 1.478551276, 0.837832395), WT_Shoot_1t_Trt_2 = c(1.180515054, 1.445100969, 1.18151722), WT_Shoot_1t_Trt_3 = c(1.332735497, 1.515484415, 0.99774335), WT_root_0t_NTrt_1 = c(1.717008073, 2.048229681, 1.448233358), WT_root_0t_NTrt_2 = c(1.431501693, 1.850835296, 1.128499829), WT_root_0t_NTrt_3 = c(1.752086402, 2.047380811, 1.190984777), WT_root_1t_Trt_1 = c(1.368684187, 1.507348975, 1.531142731), WT_root_1t_Trt_2 = c(1.204974777, 1.440904968, 1.103257306), WT_root_1t_Trt_3 = c(0.996016342, 1.630774074, 1.141581901), mut1_Shoot_0t_NTrt_1 = c(1.05451186, 1.916352545, 1.030983014), mut1_Shoot_0t_NTrt_2 = c(1.54792871, 1.676837161, 1.244400719), mut1_Shoot_0t_NTrt_3 = c(1.318611728, 1.613611, 1.28740667), mut1_Shoot_1t_Trt_1 = c(1.551790106, 1.619609895, 1.097308351), mut1_Shoot_1t_Trt_2 = c(1.638951097, 1.437759761, 1.139143972), mut1_Shoot_1t_Trt_3 = c(1.18670455, 1.530006726, 1.583110853), mut1_root_0t_NTrt_1 = c(0.981436287, 0.5156177, 0.799418798), mut1_root_0t_NTrt_2 = c(1.143837649, 0.772921721, 1.098218628), mut1_root_0t_NTrt_3 = c(1.163352788, 1.371823855, 1.278531528), mut1_root_1t_Trt_1 = c(1.13334394, 0.768721169, 1.155071974), mut1_root_1t_Trt_2 = c(1.015317761, 0.838696502, 0.9622491), mut1_root_1t_Trt_3 = c(1.961461109, 0.697184247, 0.926734427)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(Proteins = structure(list(), class = c("collector_character", "collector")), WT_Shoot_0t_NTrt_1 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_0t_NTrt_2 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_0t_NTrt_3 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_1t_Trt_1 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_1t_Trt_2 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_1t_Trt_3 = structure(list(), class = c("collector_double", "collector")), WT_root_0t_NTrt_1 = structure(list(), class = c("collector_double", "collector")), WT_root_0t_NTrt_2 = structure(list(), class = c("collector_double", "collector")), WT_root_0t_NTrt_3 = structure(list(), class = c("collector_double", "collector")), WT_root_1t_Trt_1 = structure(list(), class = c("collector_double", "collector")), WT_root_1t_Trt_2 = structure(list(), class = c("collector_double", "collector")), WT_root_1t_Trt_3 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_0t_NTrt_1 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_0t_NTrt_2 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_0t_NTrt_3 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_1t_Trt_1 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_1t_Trt_2 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_1t_Trt_3 = structure(list(), class = c("collector_double", "collector")), mut1_root_0t_NTrt_1 = structure(list(), class = c("collector_double", "collector")), mut1_root_0t_NTrt_2 = structure(list(), class = c("collector_double", "collector")), mut1_root_0t_NTrt_3 = structure(list(), class = c("collector_double", "collector")), mut1_root_1t_Trt_1 = structure(list(), class = c("collector_double", "collector")), mut1_root_1t_Trt_2 = structure(list(), class = c("collector_double", "collector")), mut1_root_1t_Trt_3 = structure(list(), class = c("collector_double", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
==========
dd %>% 
  gather(var, response, WT_Shoot_0t_NTrt_1:mut1_root_1t_Trt_3) %>% 
  separate(var, c("Genotype", "Tissue", "Time", "Trtment", "Replication"), sep = "_") %>%
  arrange(desc(Proteins))


# A tibble: 72 x 7
   Proteins Genotype Tissue Time  Trtment Replication response
   <chr>    <chr>    <chr>  <chr> <chr>   <chr>          <dbl>
 1 SnrK     WT       Shoot  0t    NTrt    1              0.581
 2 SnrK     WT       Shoot  0t    NTrt    2              0.958
 4 SnrK     WT       Shoot  1t    Trt     1              0.831
 5 SnrK     WT       Shoot  1t    Trt     2              1.18

UPDATE

with tidyr 1.0.0 no need to use separate

dd %>% tidyr::pivot_longer(cols = WT_Shoot_0t_NTrt_1:mut1_root_1t_Trt_3 ,
                    names_to = c("Genotype", "Tissue", "Time", "Trtment", "Replication"),
                    values_to = "response",names_sep = "_")
ADD COMMENTlink modified 11 months ago • written 2.4 years ago by Chirag Parsania1.9k
1

The script looks tempting. But can the transformed tibble be converted into a data.frame in r?

ADD REPLYlink written 2.4 years ago by tokeemdtareq40

Yes, using as.data.frame()

ADD REPLYlink written 2.4 years ago by Chirag Parsania1.9k

Thank you, Chirag! This is what I exactly looking for. Thank a lot :)

ADD REPLYlink written 2.4 years ago by Wox340
2
gravatar for tokeemdtareq
2.4 years ago by
tokeemdtareq40
tokeemdtareq40 wrote:

I am relatively new in data analysis, maybe there is another faster way. But as a fellow biologist, I thought it would be good idea to represent a customized answer to the problem. I loaded your data as a data.frame in R, naming dat1.

dat1 = as.data.frame(structure(list(Proteins = c("SnrK", "MAPKK", "PP2C"), WT_Shoot_0t_NTrt_1 = c(0.580784899, 1.210078166, 1.505880218), WT_Shoot_0t_NTrt_2 = c(0.957816536, 1.42644091, 0.943047498), WT_Shoot_0t_NTrt_3 = c(0.559338535, 1.481513748, 1.114371918), WT_Shoot_1t_Trt_1 = c(0.831382253, 1.478551276, 0.837832395), WT_Shoot_1t_Trt_2 = c(1.180515054, 1.445100969, 1.18151722), WT_Shoot_1t_Trt_3 = c(1.332735497, 1.515484415, 0.99774335), WT_root_0t_NTrt_1 = c(1.717008073, 2.048229681, 1.448233358), WT_root_0t_NTrt_2 = c(1.431501693, 1.850835296, 1.128499829), WT_root_0t_NTrt_3 = c(1.752086402, 2.047380811, 1.190984777), WT_root_1t_Trt_1 = c(1.368684187, 1.507348975, 1.531142731), WT_root_1t_Trt_2 = c(1.204974777, 1.440904968, 1.103257306), WT_root_1t_Trt_3 = c(0.996016342, 1.630774074, 1.141581901), mut1_Shoot_0t_NTrt_1 = c(1.05451186, 1.916352545, 1.030983014), mut1_Shoot_0t_NTrt_2 = c(1.54792871, 1.676837161, 1.244400719), mut1_Shoot_0t_NTrt_3 = c(1.318611728, 1.613611, 1.28740667), mut1_Shoot_1t_Trt_1 = c(1.551790106, 1.619609895, 1.097308351), mut1_Shoot_1t_Trt_2 = c(1.638951097, 1.437759761, 1.139143972), mut1_Shoot_1t_Trt_3 = c(1.18670455, 1.530006726, 1.583110853), mut1_root_0t_NTrt_1 = c(0.981436287, 0.5156177, 0.799418798), mut1_root_0t_NTrt_2 = c(1.143837649, 0.772921721, 1.098218628), mut1_root_0t_NTrt_3 = c(1.163352788, 1.371823855, 1.278531528), mut1_root_1t_Trt_1 = c(1.13334394, 0.768721169, 1.155071974), mut1_root_1t_Trt_2 = c(1.015317761, 0.838696502, 0.9622491), mut1_root_1t_Trt_3 = c(1.961461109, 0.697184247, 0.926734427)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(Proteins = structure(list(), class = c("collector_character", "collector")), WT_Shoot_0t_NTrt_1 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_0t_NTrt_2 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_0t_NTrt_3 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_1t_Trt_1 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_1t_Trt_2 = structure(list(), class = c("collector_double", "collector")), WT_Shoot_1t_Trt_3 = structure(list(), class = c("collector_double", "collector")), WT_root_0t_NTrt_1 = structure(list(), class = c("collector_double", "collector")), WT_root_0t_NTrt_2 = structure(list(), class = c("collector_double", "collector")), WT_root_0t_NTrt_3 = structure(list(), class = c("collector_double", "collector")), WT_root_1t_Trt_1 = structure(list(), class = c("collector_double", "collector")), WT_root_1t_Trt_2 = structure(list(), class = c("collector_double", "collector")), WT_root_1t_Trt_3 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_0t_NTrt_1 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_0t_NTrt_2 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_0t_NTrt_3 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_1t_Trt_1 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_1t_Trt_2 = structure(list(), class = c("collector_double", "collector")), mut1_Shoot_1t_Trt_3 = structure(list(), class = c("collector_double", "collector")), mut1_root_0t_NTrt_1 = structure(list(), class = c("collector_double", "collector")), mut1_root_0t_NTrt_2 = structure(list(), class = c("collector_double", "collector")), mut1_root_0t_NTrt_3 = structure(list(), class = c("collector_double", "collector")), mut1_root_1t_Trt_1 = structure(list(), class = c("collector_double", "collector")), mut1_root_1t_Trt_2 = structure(list(), class = c("collector_double", "collector")), mut1_root_1t_Trt_3 = structure(list(), class = c("collector_double", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec")))

Here goes my code, your desired output is given as the dat4

library(reshape2)
dat2 = melt(dat1, value.name = "measurement")
dat2$sl = c(1:72)
dat3 = data.frame(do.call('rbind', strsplit(as.character(dat2$variable), '_', fixed = F)))
colnames(dat3) = c("genotype", "tissue", "time", "treatment", "replication")
dat3$sl = c(1:72)
dat4 = subset(merge(dat2, dat3, by = "sl"), select = c("Proteins", "genotype", "tissue", "time", "treatment", "replication", "measurement"))
ADD COMMENTlink modified 2.4 years ago • written 2.4 years ago by tokeemdtareq40

Thank you for the help!

ADD REPLYlink written 2.4 years ago by Wox340
Please log in to add an answer.

Help
Access

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
Powered by Biostar version 2.3.0
Traffic: 1895 users visited in the last hour