Question

problem with tidyverse for gene expression profile

0

Entering edit mode

18 months ago

wanaga3166 ▴ 10

Hi all,

I have these data.frame:

module_df
structure(list(gene_id = c("A1BG", "A1CF", "A2LD1", "A2M", "A2ML1", 
"A4GALT", "A4GNT", "AAA1", "AAAS", "AACS", "AADAC", "AADACL2", 
"AADACL3", "AADACL4", "AADAT"), colors = c("turquoise", "blue", 
"turquoise", "grey60", "turquoise", "blue", "turquoise", "turquoise", 
"blue", "yellow", "magenta", "turquoise", "grey", "turquoise", 
"red")), row.names = c("A1BG", "A1CF", "A2LD1", "A2M", "A2ML1", 
"A4GALT", "A4GNT", "AAA1", "AAAS", "AACS", "AADAC", "AADACL2", 
"AADACL3", "AADACL4", "AADAT"), class = "data.frame")

submod
structure(list(gene_id = c("A2M", "AASS", "ABCD2", "ABCG1", "ABHD14A", 
"ABHD15", "ABHD5", "ACAA2", "ACACA", "ACACB", "ACAD11", "ACAD8", 
"ACADM", "ACADS", "ACADSB"), colors = c("grey60", "brown", "brown", 
"grey60", "brown", "brown", "brown", "brown", "brown", "brown", 
"brown", "brown", "brown", "brown", "brown")), row.names = c(4L, 
25L, 65L, 72L, 83L, 85L, 89L, 109L, 110L, 111L, 113L, 114L, 117L, 
118L, 119L), class = "data.frame")

subexpr
structure(list(GeneSymbol = c("A2M", "AASS", "ABCD2", "ABCG1", 
"ABHD14A", "ABHD15", "ABHD5", "ACAA2", "ACACA", "ACACB"), R012.EGM2 = c(50.2764951996428, 
101.665447606096, 5.14281915226989, 12.1175440994071, 127.400290446813, 
50.6721806650688, 187.444869718312, 125.028697858068, 124.640594250484, 
14.559615204624), R016.EGM2 = c(46.6399755559676, 73.1954332331196, 
5.99306742010033, 11.9097840149146, 123.360595739224, 55.5747760958692, 
195.751905602931, 130.587362354726, 162.118704445471, 15.098902880636
), V06.EGM2 = c(51.8486405122139, 70.9836654034483, 5.36299847633948, 
11.8503634313783, 112.439818697326, 51.4168701523525, 280.399638597663, 
120.828626173478, 129.803136030716, 12.9402241789224), V07.EGM2 = c(117.955683199945, 
86.7501578979337, 5.0894935767762, 12.1221347970042, 114.720275503257, 
50.1008498711308, 194.799263873044, 116.159360477615, 149.444200673643, 
14.6041181366271), V08.EGM2 = c(94.9274274162506, 70.5808603502644, 
5.49133134976474, 12.2734830659801, 105.231609450716, 44.7581009650209, 
204.585401390909, 104.92616313392, 100.125365909659, 13.4964153291074
), N179 = c(192.745422746882, 74.3515057108856, 29.365366693371, 
10.4388992737693, 104.897752588442, 57.5620930848904, 206.669342475002, 
137.362576209483, 70.0328556923676, 128.151538222808), N181 = c(215.662725276862, 
58.1513722826225, 25.2175610232832, 14.9148176681094, 98.5597745615199, 
50.3461555048319, 382.111329968073, 138.065978007562, 52.5064295370523, 
130.508317256036), N184 = c(101.949872053494, 98.4591638219055, 
28.2773164342233, 11.5115144425951, 109.929603150963, 39.3030585235286, 
176.159896274476, 167.023033236092, 120.143842776412, 132.777016208134
), V181 = c(174.216915831257, 108.294596352281, 25.9726969260387, 
13.1836455007548, 121.667169254219, 61.5397177077925, 206.185640237808, 
116.408716711222, 185.906463762304, 129.926726255828)), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"))

I would like to obtain a data.frame like that:

GeneSymbol | name | value | module

AASS | ASC2.jo | 54.23 | grey60

AAS2 | MSC | 24.12 | brown

However, when I execute my code, the column module is empty (full of NA). Also, I don't want gene_id column. How to correct that? Thank you for your help.

#-------------------------------------------------------------------
# Examine Expression Profiles
#-------------------------------------------------------------------

# pick out a few modules of interest here
modules_of_interest = c("grey60", "brown")

# Pull out list of genes in that module
submod = module_df %>%
  subset(colors %in% modules_of_interest)

row.names(module_df) = module_df$gene_id

subexpr = data_BE[which(data_BE$GeneSymbol%in%submod$gene_id),]

subexpr = subset(subexpr, select = -c(V179))

submod_df = data.frame(subexpr) %>%
  mutate(gene_id = row.names(.)) %>%
  pivot_longer(c(-GeneSymbol, -gene_id)) %>%
  mutate(module = module_df[gene_id,]$colors)

R tidyverse • 625 views

ADD COMMENT • link 18 months ago by wanaga3166 ▴ 10

0

Entering edit mode

My quick guess is that your error is coming from these lines:

subexpr = data_BE[which(data_BE$GeneSymbol%in%submod$gene_id),]
subexpr = subset(subexpr, select = -c(V179))

Have you run your code line-by-line and confirmed that each step is doing what you think?

ADD REPLY • link 18 months ago by Trivas ★ 1.7k

score 2 · Accepted Answer · 2022-10-25

left_join may be more effective, and you do not need to create gene_id if you do not need it finally (in addition there is a confusion between gene_id in module_df, so it is better to not create it) :

submod_df = subexpr %>%
  pivot_longer(!GeneSymbol) %>%
  left_join(module_df,by=c("GeneSymbol"="gene_id"))

# A tibble: 90 × 4
   GeneSymbol name      value colors
   <chr>      <chr>     <dbl> <chr> 
 1 A2M        R012.EGM2  50.3 grey60
 2 A2M        R016.EGM2  46.6 grey60
 3 A2M        V06.EGM2   51.8 grey60
 4 A2M        V07.EGM2  118.  grey60
 5 A2M        V08.EGM2   94.9 grey60
 6 A2M        N179      193.  grey60
 7 A2M        N181      216.  grey60
 8 A2M        N184      102.  grey60
 9 A2M        V181      174.  grey60
10 AASS       R012.EGM2 102.  NA    
# … with 80 more rows
# ℹ Use `print(n = ...)` to see more rows