Decoding multiple hot-encoded columns efficiently in R

127 Views Asked by At

I have the following data frame:

id = c(1,2,3)

where_home = c(1, 0, NA)
where_work = c(0, 1, NA)

with_alone = c(0,0,0)
with_parents = c(0,1,1)
with_colleagues = c(1,1,0)

gender_male = c(1,0,1)
gender_female = c(0,1,0)

p_affect = c(10,14,20)
n_affect = c(20,30,10)


df = data.frame(id, where_home, where_work,
                with_alone, with_parents, with_colleagues,
                gender_male, gender_female, p_affect, n_affect)

Where there are 3 IDs, and multiple hot-encoded columns (where, with, gender) along with not hot-encoded columns (p_affect, n_affect).

What I would like is to convert the hot-encoded columns while keeping the not hot-encoded ones as they are.

I did the following:

library(dplyr)

df_transformed <- df %>%
  rowwise() %>%
  mutate(Gender = case_when(
    gender_male == 1 ~ "Male",
    gender_female == 1 ~ "Female",
    TRUE ~ NA_character_
  ),
  Context = paste(
    ifelse(with_alone == 1, "Alone", ""),
    ifelse(with_parents == 1, "Parents", ""),
    ifelse(with_colleagues == 1, "Colleagues", ""),
    collapse = " and "
  ),
  Location = trimws(ifelse(
    where_home == 1 & where_work == 1, 
    'Home and Work', 
    paste(
      ifelse(where_home == 1, 'Home', ''),
      ifelse(where_work == 1, 'Work', '')
    )
  ))) %>%
  select(-starts_with("gender_"), -starts_with("with_"))

df_transformed <- df_transformed %>%
  select(id, Gender, Context, Location, p_affect, n_affect)

And the result:

     id Gender Context               Location p_affect n_affect
  <dbl> <chr>  <chr>                 <chr>       <dbl>    <dbl>
1     1 Male   "  Colleagues"        Home           10       20
2     2 Female " Parents Colleagues" Work           14       30
3     3 Male   " Parents "           NA             20       10

This seems to work, but there are a few issues:

  • some of the spacing seems strange in the 'Context' column. I would prefer a cleaner format without any spaces separated by 'and' (e.g. "Parents and Colleagues" instead of " Parents Colleagues"
  • in this approach, I need to define each column and each case separately, which is tedious, as the original data frame is massive with many columns and possible options. I would like something like:
pseudocode:

vector_of_columns_that_are_hot_encoded = c('where', 'with', 'gender')
for column in vector_of_columns:
 # modify the hot-encoded columns and make a new data frame while keeping the columns that are not in the vector_of_columns_that_are_hot_encoded as they are
# mind that some hot-encoded columns are binary (gender), while others have multiple values. If multiple values are present, put them in the data frame using "Value 1 and Value 2 and ..."

I think there has to be a simple way of doing this. As I am a beginner with dplyr, if possible to explain the code and keep it simple.

5

There are 5 best solutions below

8
jay.sf On BEST ANSWER

If the names are that nicely formatted as shown, you could strsplit at '_' and exploit pre and suffixes in by.

> cols <- 2:8
> by(names(df)[cols], sapply(strsplit(names(df)[cols], '_'), `[`, 1), \(x) {
+   sb <- sub('.*_', '', x)
+   apply(df[, x] == 1, 1, \(x) toString(unique(sb[x])))
+ }) |> do.call(what='cbind') |> cbind(df[-(cols)])
  gender where                with id p_affect n_affect
1   male  home          colleagues  1       10       20
2 female  work parents, colleagues  2       14       30
3   male    NA             parents  3       20       10

Explanation: The strsplit splits the selected variables at the "_" and the sapply around it selects the 1st element, so the (complete) variable names are split by their prefixes. The sub only leaves the suffix, so the comparison with 1 in apply selects the correct endings, a little nice with a comma using toString. Next we cbind first the created selections and finally everything to the rest of df.


Data:

> dput(df)
structure(list(id = c(1, 2, 3), where_home = c(1, 0, NA), where_work = c(0, 
1, NA), with_alone = c(0, 0, 0), with_parents = c(0, 1, 1), with_colleagues = c(1, 
1, 0), gender_male = c(1, 0, 1), gender_female = c(0, 1, 0), 
    p_affect = c(10, 14, 20), n_affect = c(20, 30, 10)), class = "data.frame", row.names = c(NA, 
-3L))
0
dufei On

With your existing code, you could apply some post-processing to adjust the formatting:

df_transformed |> 
  mutate(
    Context = str_trim(Context),
    Context = str_replace_all(Context, " ", " and ")
  )
#> # A tibble: 3 × 6
#> # Rowwise: 
#>      id Gender Context                Location p_affect n_affect
#>   <dbl> <chr>  <chr>                  <chr>       <dbl>    <dbl>
#> 1     1 Male   Colleagues             Home           10       20
#> 2     2 Female Parents and Colleagues Work           14       30
#> 3     3 Male   Parents                <NA>           20       10
2
Md Ahsanul Himel On

Edit: several lines change

id = c(1,2,3, 4)

where_home = c(1, 0, NA, 1)
where_work = c(0, 1, NA, 0)

with_alone = c(0,0,0, 1)
with_parents = c(0,1,1, 1)
with_colleagues = c(1,0,0, 1)
with_friends_and_family = c(0,1,0, 1)

gender_male = c(1,0,1, 0)
gender_female = c(0,1,0, 1)

p_affect = c(10,14,20, 12)
n_affect = c(20,30,10, 16)


df = data.frame(id, where_home, where_work,
                with_alone, with_parents, with_colleagues, with_friends_and_family, 
                gender_male, gender_female, p_affect, n_affect)

library(tidyverse)
    
# Edit the following two lines to add new variables or remove
columns_are_hot_encoded  <- c("where", "with", "gender")
columns_are_not_hot_encoded <- c("p_affect", "n_affect")
  
hot_encoded <- paste0("^(", paste(columns_are_hot_encoded, collapse = "|"), ")")
    
# got this part from dufei
df_long <- df %>%  
  pivot_longer(
    # starts_with("where") | starts_with("with") | starts_with("gender"),
    matches(hot_encoded),
    names_to = c("VariableName", "Values"),
    names_pattern = "(.*?)_(.*)"
  ) %>% 
  filter(is.na(value) | value == 1) %>% 
  mutate(Values = ifelse(is.na(value), NA, Values)) %>% 
  select(-value)

df_splitted <- split.data.frame(df_long, f = as.factor(df_long$VariableName))


for(i in names(df_splitted)){
  print(i)
  print(colnames(df_splitted[[i]]))
  colnames(df_splitted[[i]])[colnames(df_splitted[[i]]) == "Values"] <- i
}


result <- df_splitted %>% reduce(inner_join, by='id', relationship = "many-to-many") %>% 
  select(id, columns_are_hot_encoded, columns_are_not_hot_encoded) %>% 
  mutate(across(is.character, ~ str_to_title(sub("_and_|_", " ", .x))))

The original data was -

> df
  id where_home where_work with_alone with_parents with_colleagues
1  1          1          0          0            0               1
2  2          0          1          0            1               0
3  3         NA         NA          0            1               0
4  4          1          0          1            1               1
  with_friends_and_family gender_male gender_female p_affect n_affect
1                       0           1             0       10       20
2                       1           0             1       14       30
3                       0           1             0       20       10
4                       1           0             1       12       16

The transformed data -

> result
# A tibble: 9 × 6
     id where with           gender p_affect n_affect
  <dbl> <chr> <chr>          <chr>     <dbl>    <dbl>
1     1 Home  Colleagues     Male         10       20
2     2 Work  Parents        Female       14       30
3     2 Work  Friends Family Female       14       30
4     3 NA    Parents        Male         20       10
5     3 NA    Parents        Male         20       10
6     4 Home  Alone          Female       12       16
7     4 Home  Parents        Female       12       16
8     4 Home  Colleagues     Female       12       16
9     4 Home  Friends Family Female       12       16

To add new variables you just need to edit the vector columns_are_hot_encoded and columns_are_not_hot_encoded

Thank you!

6
dufei On

Here's another solution using a single call to pivot_longer() that should be generalizable to more variables:

library(tidyverse)

# reshape longer
df_long <- df |> 
  pivot_longer(
    starts_with("where") | starts_with("with") | starts_with("gender"),
    names_to = c(".value", "value"),
    names_pattern = "(.*)_(.*)"
  )

# summarize
df_sum <- df_long |>
  summarize(across(c(where, with, gender),
                 \(x) str_flatten(value[which(x == 1)], collapse = " and ")),
          .by = id) |> 
  mutate(across(!id, \(x) na_if(x, "")))

# merge back non-hot-encoded variables
df_sum |> 
  left_join(
    select(df, c(id, ends_with("affect"))),
    join_by(id)
  )
#> # A tibble: 3 × 6
#>      id where with                   gender p_affect n_affect
#>   <dbl> <chr> <chr>                  <chr>     <dbl>    <dbl>
#> 1     1 home  colleagues             male         10       20
#> 2     2 work  parents and colleagues female       14       30
#> 3     3 <NA>  parents                male         20       10

Created on 2023-12-01 with reprex v2.0.2

0
lotus On

You should be able to do this in a pipeline by round-trip pivoting the data with a little cleaning inbetween:

library(tidyr)
library(dplyr)
library(stringr)

df |>
  pivot_longer(cols = -c(id, ends_with("affect")),
               names_sep = "_",
               names_to = c("var", "val"),
               names_transform = str_to_sentence) |>
  filter(value == 1) |>
  summarise(val = toString(val), .by = -c(val, value)) |>
  pivot_wider(names_from = var, values_from = val)

# A tibble: 3 × 6
     id p_affect n_affect Where With                Gender
  <dbl>    <dbl>    <dbl> <chr> <chr>               <chr> 
1     1       10       20 Home  Colleagues          Male  
2     2       14       30 Work  Parents, Colleagues Female
3     3       20       10 NA    Parents             Male