pivot longer two groups of columns with the same set of suffixes, mostly an issue of correct regex

38 Views Asked by At

I have a data frame with something like:

borg_dx, borg_sx, borg_dominant, borg_nondominant

which contains the integer values of the borg scale for the right, left, dominant and non dominant side.

Then i have another set of columns called:

borg_category_dx, borg_category_sx, borg_category_dominant, borg_category_nondominant,

which has character values for the category of borg (no effort, some effort, ecc.)

df1 <- data.frame( borg_dx = c(0,1), borg_sx = c(3,1), borg_dominant = c(3,1), borg_nondominant = c(0,8), borg_category_dx = c("no effort", "no effort"), borg_category_sx = c("light effort", "no effort"), borg_category_dominant = c("light effort", "no effort"), borg_category_nondominant = c("no effort", "high effort"))

I want to pivot the data frame into a long format where one column is the side, one column is the borg value and one column is the borg category, like this:

df_long <- data.frame(side = c("dx", "sx", "dom", "nondom", "dx", "sx", "dom", "nondom"), borg_value = c(0,3,3,0,1,1,1,8), borg_category = c("no effort", "no effort", "light effort", "no effort", "light effort", "no effort", "no effort", "high effort"))

if i just wanted to pivot borg values or categories, i could just do:

pivot_longer(
      cols = starts_with("borg"),
      names_to = "side",
      names_prefix = "borg_",
      values_to = "borg_value")

but having to send all suffixes (dx, sx, dominant, nondominant) to one column called side, and the values to two columns, i found from other answers that i should use something like:

 pivot_longer(
  cols = starts_with("borg"),
  names_to = c(".value", "side"),
  names_prefix = "borg",
  names_pattern = ?????? 

except i have no idea how to phrase the regex bit. i don't know if i should just rename the columns to use just one underscore and a different separator.

1

There are 1 best solutions below

3
Till On BEST ANSWER

From your description it sounds like your data looks something like below.

dput() output for replication:

df0 <- structure(list(borg_dx = c( 0L, 2L, 2L, 7L, 5L, 6L, 0L, 2L, 10L, 7L, 5L, 7L, 3L, 4L, 3L, 9L, 5L, 1L, 4L, 5L, 9L, 10L, 3L, 6L, 0L, 3L, 7L, 5L, 0L, 10L, 7L, 2L, 4L, 8L, 0L, 8L, 3L, 2L, 5L, 7L, 5L, 2L, 1L, 4L, 5L, 6L, 9L, 1L, 9L, 10L ), 
                       borg_sx = c( 9L, 8L, 0L, 0L, 4L, 9L, 10L, 2L, 3L, 4L, 10L, 10L, 10L, 0L, 3L, 6L, 2L, 10L, 4L, 8L, 9L, 6L, 1L, 6L, 9L, 3L, 5L, 1L, 5L, 5L, 7L, 3L, 2L, 6L, 4L, 0L, 2L, 0L, 0L, 4L, 3L, 0L, 7L, 10L, 5L, 7L, 9L, 0L, 0L, 9L ), 
                       borg_dominant = c( 0L, 10L, 10L, 5L, 1L, 6L, 0L, 8L, 9L, 4L, 7L, 0L, 1L, 6L, 3L, 10L, 5L, 0L, 4L, 4L, 6L, 2L, 3L, 9L, 4L, 1L, 5L, 10L, 9L, 8L, 4L, 3L, 10L, 7L, 6L, 4L, 10L, 1L, 7L, 8L, 2L, 0L, 8L, 9L, 1L, 8L, 9L, 9L, 6L, 5L ), 
                       borg_nondominant = c( 8L, 3L, 6L, 4L, 1L, 6L, 4L, 3L, 1L, 6L, 4L, 9L, 6L, 1L, 2L, 8L, 8L, 6L, 1L, 0L, 5L, 5L, 0L, 8L, 6L, 8L, 10L, 8L, 4L, 2L, 6L, 1L, 10L, 7L, 2L, 4L, 5L, 7L, 5L, 8L, 5L, 7L, 5L, 8L, 6L, 7L, 2L, 10L, 1L, 3L ), 
                       borg_category_dx = c( "Rest", "Really Easy", "Really Easy", "Really Hard", "Hard", "Really Hard", "Rest", "Really Easy", "Maximal: Just like my hardest race", "Really Hard", "Hard", "Really Hard", "Moderate", "Sort of Hard", "Moderate", "Really, Really, Hard", "Hard", "Rest", "Sort of Hard", "Hard", "Really, Really, Hard", "Maximal: Just like my hardest race", "Moderate", "Really Hard", "Rest", "Moderate", "Really Hard", "Hard", "Rest", "Maximal: Just like my hardest race", "Really Hard", "Really Easy", "Sort of Hard", "Really, Really, Hard", "Rest", "Really, Really, Hard", "Moderate", "Really Easy", "Hard", "Really Hard", "Hard", "Really Easy", "Rest", "Sort of Hard", "Hard", "Really Hard", "Really, Really, Hard", "Rest", "Really, Really, Hard", "Maximal: Just like my hardest race" ), 
                       borg_category_sx = c( "Really, Really, Hard", "Really, Really, Hard", "Rest", "Rest", "Sort of Hard", "Really, Really, Hard", "Maximal: Just like my hardest race", "Really Easy", "Moderate", "Sort of Hard", "Maximal: Just like my hardest race", "Maximal: Just like my hardest race", "Maximal: Just like my hardest race", "Rest", "Moderate", "Really Hard", "Really Easy", "Maximal: Just like my hardest race", "Sort of Hard", "Really, Really, Hard", "Really, Really, Hard", "Really Hard", "Rest", "Really Hard", "Really, Really, Hard", "Moderate", "Hard", "Rest", "Hard", "Hard", "Really Hard", "Moderate", "Really Easy", "Really Hard", "Sort of Hard", "Rest", "Really Easy", "Rest", "Rest", "Sort of Hard", "Moderate", "Rest", "Really Hard", "Maximal: Just like my hardest race", "Hard", "Really Hard", "Really, Really, Hard", "Rest", "Rest", "Really, Really, Hard" ), 
                       borg_category_dominant = c( "Rest", "Maximal: Just like my hardest race", "Maximal: Just like my hardest race", "Hard", "Rest", "Really Hard", "Rest", "Really, Really, Hard", "Really, Really, Hard", "Sort of Hard", "Really Hard", "Rest", "Rest", "Really Hard", "Moderate", "Maximal: Just like my hardest race", "Hard", "Rest", "Sort of Hard", "Sort of Hard", "Really Hard", "Really Easy", "Moderate", "Really, Really, Hard", "Sort of Hard", "Rest", "Hard", "Maximal: Just like my hardest race", "Really, Really, Hard", "Really, Really, Hard", "Sort of Hard", "Moderate", "Maximal: Just like my hardest race", "Really Hard", "Really Hard", "Sort of Hard", "Maximal: Just like my hardest race", "Rest", "Really Hard", "Really, Really, Hard", "Really Easy", "Rest", "Really, Really, Hard", "Really, Really, Hard", "Rest", "Really, Really, Hard", "Really, Really, Hard", "Really, Really, Hard", "Really Hard", "Hard" ), 
                       borg_category_nondominant = c( "Really, Really, Hard", "Moderate", "Really Hard", "Sort of Hard", "Rest", "Really Hard", "Sort of Hard", "Moderate", "Rest", "Really Hard", "Sort of Hard", "Really, Really, Hard", "Really Hard", "Rest", "Really Easy", "Really, Really, Hard", "Really, Really, Hard", "Really Hard", "Rest", "Rest", "Hard", "Hard", "Rest", "Really, Really, Hard", "Really Hard", "Really, Really, Hard", "Maximal: Just like my hardest race", "Really, Really, Hard", "Sort of Hard", "Really Easy", "Really Hard", "Rest", "Maximal: Just like my hardest race", "Really Hard", "Really Easy", "Sort of Hard", "Hard", "Really Hard", "Hard", "Really, Really, Hard", "Hard", "Really Hard", "Hard", "Really, Really, Hard", "Really Hard", "Really Hard", "Really Easy", "Maximal: Just like my hardest race", "Rest", "Moderate" ) ), 
                  class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,-50L))

As a tibble:

dplyr::as_tibble(df0)
#> # A tibble: 50 × 8
#>    borg_dx borg_sx borg_dominant borg_nondominant borg_category_dx              
#>      <int>   <int>         <int>            <int> <chr>                         
#>  1       0       9             0                8 Rest                          
#>  2       2       8            10                3 Really Easy                   
#>  3       2       0            10                6 Really Easy                   
#>  4       7       0             5                4 Really Hard                   
#>  5       5       4             1                1 Hard                          
#>  6       6       9             6                6 Really Hard                   
#>  7       0      10             0                4 Rest                          
#>  8       2       2             8                3 Really Easy                   
#>  9      10       3             9                1 Maximal: Just like my hardest…
#> 10       7       4             4                6 Really Hard                   
#> # ℹ 40 more rows
#> # ℹ 3 more variables: borg_category_sx <chr>, borg_category_dominant <chr>,
#> #   borg_category_nondominant <chr>

The name_prefix argument of pivot_longer() removes the prefix before the name_pattern is applied. In this case the prefix is useful to tell the borg values and borg categories apart. In name_pattern we use parenthesis to define two extraction groups; the first one “(.*)” captures everything up until the second underscore, leaving everything after the underscore for the second group.

library(tidyr)

df0 |>
  pivot_longer(
    cols = starts_with("borg"),
    names_to = c(".value", "side"),
    names_pattern = "(.*)_(.*)"
  )
#> # A tibble: 200 × 3
#>    side         borg borg_category                     
#>    <chr>       <int> <chr>                             
#>  1 dx              0 Rest                              
#>  2 sx              9 Really, Really, Hard              
#>  3 dominant        0 Rest                              
#>  4 nondominant     8 Really, Really, Hard              
#>  5 dx              2 Really Easy                       
#>  6 sx              8 Really, Really, Hard              
#>  7 dominant       10 Maximal: Just like my hardest race
#>  8 nondominant     3 Moderate                          
#>  9 dx              2 Really Easy                       
#> 10 sx              0 Rest                              
#> # ℹ 190 more rows