Replace specific levels of a row with NA, based on conditions across ALL factor columns of a data frame

66 Views Asked by At

I have a large data frame, df_f, with different levels across multiple column (5 columns in this example):

c1 <- as.factor(c("NA", "1.1_mif", "NA", "NA", "1.1_mif", "1.1_mif", "NA", "1.3_sef"))
c2 <- as.factor(c("NA", "NA", "1.1_mif", "NA", "NA", "1.2_mof", "NA", "NA"))
c3 <- as.factor(c("NA", "NA", "1.2_mof", "1.2_mof", "NA", "1.3_sef", "NA", "NA"))
c4 <- as.factor(c("NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"))
c5 <- as.factor(c("NA", "1.3_sef", "NA", "1.3_sef", "NA", "NA", "NA", "NA"))
df_f <- data.frame(c1, c2, c3, c4, c5)
       c1      c2      c3     c4      c5
1      NA      NA      NA     NA      NA
2 1.1_mif      NA      NA     NA 1.3_sef
3      NA 1.1_mif 1.2_mof     NA      NA
4      NA      NA 1.2_mof     NA 1.3_sef
5 1.1_mif      NA      NA     NA      NA
6 1.1_mif 1.2_mof 1.3_sef     NA      NA
7      NA      NA      NA     NA      NA
8 1.3_sef      NA      NA     NA      NA

I am particularly interested in three levels: "1.1_mif", "1.2_mof" and "1.3_sef".

I would like to create a function that would allow me to keep the level with the highest index among the three levels above, if several of these levels are present on the same row (across ALL the columns) - and to replace the other one(s) with NA.

Expected outcome:

       c1    c2      c3      c4      c5
1      NA    NA      NA      NA      NA
2      NA    NA      NA      NA 1.3_sef
3      NA    NA 1.2_mof      NA      NA
4      NA    NA      NA      NA 1.3_sef
5 1.1_mif    NA      NA      NA      NA
6      NA    NA 1.3_sef      NA      NA
7      NA    NA      NA      NA      NA
8 1.3_sef    NA      NA      NA      NA

I'm struggling to find what would be a very efficient way to do this in R (across ALL the columns :)

2

There are 2 best solutions below

0
Onyambu On

in Base R:

fn <- function(x){
    a <- do.call(pmax, c(type.convert(df_f, as.is = TRUE), na.rm = TRUE))
    replace(df_f, df_f != a, 'NA')
 }
 fn(df_f)

       c1 c2      c3 c4      c5
1      NA NA      NA NA      NA
2      NA NA      NA NA 1.3_sef
3      NA NA 1.2_mof NA      NA
4      NA NA      NA NA 1.3_sef
5 1.1_mif NA      NA NA      NA
6      NA NA 1.3_sef NA      NA
7      NA NA      NA NA      NA
8 1.3_sef NA      NA NA      NA
2
Ben On

Assuming you have ordered factors for each column:

df_f[] <- lapply(df_f, factor, levels = c("NA", "1.1_mif", "1.2_mof", "1.3_sef"), ordered = TRUE)

You can use the following in tidyverse. Determine the max level for each row (for rows that are not all NA), and then replace values less than that for each column.

library(tidyverse)

df_f |>
  rowwise() |>
  mutate(max = max(c_across(everything()))) |>
  transmute(across(!max, ~ replace(., . < max, "NA"))) 

Note - if your data.frame has NA instead of character "NA", then you could do something similar with NA as a factor level:

df_f[df_f == "NA"] <- NA
l <- c(NA, "1.1_mif", "1.2_mof", "1.3_sef")
df_f[] <- lapply(df_f, factor, levels = l, exclude = NULL, ordered = TRUE)

df_f |>
  rowwise() |>
  mutate(max = max(c_across(everything()))) |>
  mutate(across(everything(), ~ factor(., levels = l))) |>
  transmute(across(!max, ~ replace(., . < max, NA))) 

Output

  c1      c2    c3      c4    c5     
  <ord>   <ord> <ord>   <ord> <ord>  
1 NA      NA    NA      NA    NA     
2 NA      NA    NA      NA    1.3_sef
3 NA      NA    1.2_mof NA    NA     
4 NA      NA    NA      NA    1.3_sef
5 1.1_mif NA    NA      NA    NA     
6 NA      NA    1.3_sef NA    NA     
7 NA      NA    NA      NA    NA     
8 1.3_sef NA    NA      NA    NA