Automatically Mutate list of list

46 Views Asked by At

How can I add one logical column per module to a data frame of 10 genes, where each column contains TRUE if the gene is inside that module and FALSE if it is not?

# Toy expression table: ten genes, four samples of random counts drawn
# without replacement from 0:10 (no seed is set, so values differ per run).
gene_express <- data.frame(
  gene = paste0("gene", 1:10),
  sample1 = sample(0:10, 10),
  sample2 = sample(0:10, 10),
  sample3 = sample(0:10, 10),
  sample4 = sample(0:10, 10)
)

# Gene membership of each module. A gene may appear in several modules and
# may even be listed twice inside one module (see "gene5" in module4).
module1 <- c("gene1", "gene2", "gene10", "gene8")
module2 <- c("gene2", "gene9", "gene6", "gene5", "gene10")
module3 <- c("gene4", "gene10", "gene1", "gene8")
module4 <- c("gene5", "gene8", "gene2", "gene7", "gene6", "gene5", "gene10")
module5 <- c("gene2", "gene9", "gene6", "gene5", "gene10")
module6 <- c("gene4", "gene10", "gene1", "gene8")

# Named list of character vectors: list names are the module names.
Module_list <- list(
  module1 = module1,
  module2 = module2,
  module3 = module3,
  module4 = module4,
  module5 = module5,
  module6 = module6
)

In reality, I have hundreds of these modules, stored in a named list of character vectors, just like my example 'Module_list'. How can I mutate the 'gene_express' data frame so that each module name becomes a new column containing TRUE if the gene is inside that module and FALSE if not?

The manual way is to specify the module components in the mutate function, as I have here

Current Code

library(dplyr)

# Manual approach: one hand-written mutate() entry per module.
# Each column is TRUE where the row's gene is a member of that module.
# Membership is looked up in Module_list instead of re-typing the gene
# names, because re-typed literals drift from the definitions (the literal
# vectors previously used for module4-module6 did not match Module_list).
# `%in%` never returns NA, so a missing gene yields FALSE.
gene_express %>% mutate(
  module1 = gene %in% Module_list$module1,
  module2 = gene %in% Module_list$module2,
  module3 = gene %in% Module_list$module3,
  module4 = gene %in% Module_list$module4,
  module5 = gene %in% Module_list$module5,
  module6 = gene %in% Module_list$module6
)

What I want is to avoid manually specifying the module in mutate.

2

There are 2 best solutions below

2
Jon Spring On BEST ANSWER

Maybe something like this? Here, I put the list of genes by module into a data frame, then we can join to the original data and fill in the non-joined elements with FALSEs.

library(tidyverse)

# Long table with one row per (module, gene) pair. distinct() removes genes
# listed twice within a module (e.g. "gene5" in module4).
Module_df <- Module_list |>
  map_dfr(as.data.frame, .id = "module") |>            # function from purrr
  rename(gene = 2) |>
  distinct()

# Widen the membership table *before* joining. Joining first leaves genes
# that belong to no module (e.g. "gene3") with module = NA, and pivot_wider()
# then creates a spurious column named `NA`.
Module_wide <- Module_df |>
  mutate(val = TRUE) |>
  pivot_wider(names_from = module, values_from = val,  # function from tidyr
              values_fill = FALSE)

# Genes absent from every module have no row in Module_wide, so the join
# leaves NA in the module columns; those are turned into FALSE.
# any_of() tolerates modules that are empty and therefore have no column.
gene_express |>
  as_tibble() |>
  left_join(Module_wide, by = "gene") |>
  mutate(across(any_of(names(Module_list)), \(x) replace_na(x, FALSE)))

Result

# A tibble: 10 × 12
   gene   sample1 sample2 sample3 sample4 module1 module3 module6 module2 module4 module5 `NA` 
   <chr>    <int>   <int>   <int>   <int> <lgl>   <lgl>   <lgl>   <lgl>   <lgl>   <lgl>   <lgl>
 1 gene1       10       0       3       4 TRUE    TRUE    TRUE    FALSE   FALSE   FALSE   FALSE
 2 gene2        5       8       5       5 TRUE    FALSE   FALSE   TRUE    TRUE    TRUE    FALSE
 3 gene3        8       9       7       2 FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   NA   
 4 gene4        1       5       9       0 FALSE   TRUE    TRUE    FALSE   FALSE   FALSE   FALSE
 5 gene5        4       4       8       3 FALSE   FALSE   FALSE   TRUE    TRUE    TRUE    FALSE
 6 gene6        6      10       0       9 FALSE   FALSE   FALSE   TRUE    TRUE    TRUE    FALSE
 7 gene7        3       1       1       7 FALSE   FALSE   FALSE   FALSE   TRUE    FALSE   FALSE
 8 gene8        2       3       6       6 TRUE    TRUE    TRUE    FALSE   TRUE    FALSE   FALSE
 9 gene9        0       2       4       1 FALSE   FALSE   FALSE   TRUE    FALSE   TRUE    FALSE
10 gene10       7       6       2      10 TRUE    TRUE    TRUE    TRUE    TRUE    TRUE    FALSE
0
Onyambu On
library(tidyverse)

# stack() turns the named list into a long (values, ind) data frame;
# table(...) > 0 gives a gene-by-module logical matrix whose row names are
# the gene names, which rownames_to_column() moves into a `gene` column.
# Genes that appear in no module have no row in that matrix, so the join
# leaves NA for them; these are replaced by FALSE.
# Columns are selected by the actual module names rather than a "module"
# prefix, so this also works when modules are named differently.
gene_express %>%
  left_join(
    as.data.frame.matrix(table(stack(Module_list)) > 0) %>%
      rownames_to_column("gene"),
    by = "gene"
  ) %>%
  mutate(across(all_of(names(Module_list)), ~ replace_na(.x, FALSE)))

     gene sample1 sample2 sample3 sample4 module1 module2 module3 module4 module5 module6
1   gene1       2       8       4       4    TRUE   FALSE    TRUE   FALSE   FALSE    TRUE
2   gene2       0      10       3       6    TRUE    TRUE   FALSE    TRUE    TRUE   FALSE
3   gene3       6       6       9       0   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE
4   gene4       7       9       1       9   FALSE   FALSE    TRUE   FALSE   FALSE    TRUE
5   gene5      10       2       6       8   FALSE    TRUE   FALSE    TRUE    TRUE   FALSE
6   gene6       5       5       7       1   FALSE    TRUE   FALSE    TRUE    TRUE   FALSE
7   gene7       9       7       2      10   FALSE   FALSE   FALSE    TRUE   FALSE   FALSE
8   gene8       4       0       8       7    TRUE   FALSE    TRUE    TRUE   FALSE    TRUE
9   gene9       1       1       5       2   FALSE    TRUE   FALSE   FALSE    TRUE   FALSE
10 gene10       8       4      10       3    TRUE    TRUE    TRUE    TRUE    TRUE    TRUE

Base R:

# Long (values, ind) table of gene/module pairs.
membership <- stack(Module_list)

# Use the genes of gene_express as factor levels so the contingency table
# has one row per gene, in the same order as the data frame rows (genes in
# no module get an all-FALSE row).
membership$values <- factor(membership$values, levels = gene_express$gene)

# Counts > 0 become TRUE/FALSE membership flags, one column per module.
in_module <- table(membership) > 0
cbind(gene_express, as.data.frame.matrix(in_module))
             
         gene sample1 sample2 sample3 sample4 module1 module2 module3 module4 module5 module6
gene1   gene1       2       8       4       4    TRUE   FALSE    TRUE   FALSE   FALSE    TRUE
gene2   gene2       0      10       3       6    TRUE    TRUE   FALSE    TRUE    TRUE   FALSE
gene3   gene3       6       6       9       0   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE
gene4   gene4       7       9       1       9   FALSE   FALSE    TRUE   FALSE   FALSE    TRUE
gene5   gene5      10       2       6       8   FALSE    TRUE   FALSE    TRUE    TRUE   FALSE
gene6   gene6       5       5       7       1   FALSE    TRUE   FALSE    TRUE    TRUE   FALSE
gene7   gene7       9       7       2      10   FALSE   FALSE   FALSE    TRUE   FALSE   FALSE
gene8   gene8       4       0       8       7    TRUE   FALSE    TRUE    TRUE   FALSE    TRUE
gene9   gene9       1       1       5       2   FALSE    TRUE   FALSE   FALSE    TRUE   FALSE
gene10 gene10       8       4      10       3    TRUE    TRUE    TRUE    TRUE    TRUE    TRUE

or even:

# For each module, tabulate its genes against the full gene list (used as
# factor levels, so every gene gets a count) and flag counts above zero.
# sapply() binds the per-module flags into a gene-by-module logical matrix.
gene_levels <- gene_express$gene
in_module <- sapply(
  Module_list,
  function(members) table(factor(members, gene_levels)) > 0
)
cbind(gene_express, in_module)

        gene sample1 sample2 sample3 sample4 module1 module2 module3 module4 module5 module6
gene1   gene1       2       8       4       4    TRUE   FALSE    TRUE   FALSE   FALSE    TRUE
gene2   gene2       0      10       3       6    TRUE    TRUE   FALSE    TRUE    TRUE   FALSE
gene3   gene3       6       6       9       0   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE
gene4   gene4       7       9       1       9   FALSE   FALSE    TRUE   FALSE   FALSE    TRUE
gene5   gene5      10       2       6       8   FALSE    TRUE   FALSE    TRUE    TRUE   FALSE
gene6   gene6       5       5       7       1   FALSE    TRUE   FALSE    TRUE    TRUE   FALSE
gene7   gene7       9       7       2      10   FALSE   FALSE   FALSE    TRUE   FALSE   FALSE
gene8   gene8       4       0       8       7    TRUE   FALSE    TRUE    TRUE   FALSE    TRUE
gene9   gene9       1       1       5       2   FALSE    TRUE   FALSE   FALSE    TRUE   FALSE
gene10 gene10       8       4      10       3    TRUE    TRUE    TRUE    TRUE    TRUE    TRUE