Row sums by groups through index in columns

89 Views Asked by At

I am running R in Rstudio.

I have a data frame that has the following columns: y0, y1, y2, y3, .... yn and yb0, yb1, yb2, yb3... ybn where n could be as large as 1000.

I want to do row sums such that y0=y0+yb0, y1=y1+yb1, y2=y2+yb2...., yn=yn+ybn

So, it is like updating y0, y1, y2.... yn.

I tried using rowwise() with mutate(across()), but it is not working. tmpA is my data with the y's and the yb's.

# Attempt: for each row, add the matching yb<k> column to every y<k> column
# and write the result to a new column named yc<k>.
tmpB <- tmpA %>%
  rowwise() %>%
  mutate(
    # NOTE(review): starts_with("y") also selects the yb* columns, and base
    # substr() has no default for `stop`, so substr(cur_column(), 2) raises
    # an error; presumably only the numeric suffix of y<k> was intended.
    across(starts_with("y"), ~ . + get(paste0("yb", substr(cur_column(), 2))), .names = "yc{substr(cur_column(), 2)}")
  ) %>%
  ungroup()

I don't know if the code makes sense, but I get errors and am not sure how to fix them. I want to keep all variables in the original dataset but drop the yb columns in the output. Thank you in advance.

3

There are 3 best solutions below

2
Colombo On

While `for` loops are generally frowned upon in R, here a loop adds to the simplicity and readability of the solution.

Since you didn't provide sample data, we don't know if n is known beforehand or if y1, y2, ..., are necessarily sequential. So I will go with a simple solution and assume they are and n is known. This means that the numbers will go strictly from 1 to n

n <- 1000 # assume `n` is known beforehand
# The question's columns are numbered y0..yn, so the index starts at 0;
# seq_len(n) would begin at 1 and leave the y0/yb0 pair unsummed.
for (i in 0:n) {
  y_name <- paste0("y", i)
  # `[[` extracts each column as a plain vector, so this is an ordinary
  # element-wise addition written back into the y column.
  df[[y_name]] <- df[[y_name]] + df[[paste0("yb", i)]]
}

The df will be updated in place.

0
jay.sf On

You seem to have something like this,

> df
  y0 y1 y2 y3 y4 y5 y6 y7 y8 y9 y10 yb0 yb1 yb2 yb3 yb4 yb5 yb6 yb7 yb8 yb9 yb10
1  1  2  3  4  5  6  7  8  9 10  11  12  13  14  15  16  17  18  19  20  21   22
2  1  2  3  4  5  6  7  8  9 10  11  12  13  14  15  16  17  18  19  20  21   22
3  1  2  3  4  5  6  7  8  9 10  11  12  13  14  15  16  17  18  19  20  21   22
4  1  2  3  4  5  6  7  8  9 10  11  12  13  14  15  16  17  18  19  20  21   22

Since R is vectorized, which means it can handle arithmetic operations on data structures of different dimensions, from simple vectors to complex matrices and arrays, you can simply add the two sub-dataframes. Identify the respective columns using regular expressions in grep().

> # The ^...$ anchors restrict the match to columns named exactly y<k> / yb<k>
> df[grep('^y\\d+$', names(df))] + df[grep('^yb\\d+$', names(df))]
  y0 y1 y2 y3 y4 y5 y6 y7 y8 y9 y10
1 13 15 17 19 21 23 25 27 29 31  33
2 13 15 17 19 21 23 25 27 29 31  33
3 13 15 17 19 21 23 25 27 29 31  33
4 13 15 17 19 21 23 25 27 29 31  33

Note that this requires the y and yb columns to appear in the same order. If you're not sure, you can check this using

# Fail early unless the y<k> and yb<k> columns carry the same numeric
# suffixes in the same order, which the positional addition relies on.
# The ^...$ anchors keep unrelated columns (e.g. "day1") out of the check.
stopifnot(identical(
  gsub('\\D', '', grep('^y\\d+$', names(df), value = TRUE)),
  gsub('\\D', '', grep('^yb\\d+$', names(df), value = TRUE))
))

Data:

> dput(df)
structure(list(y0 = c(1L, 1L, 1L, 1L), y1 = c(2L, 2L, 2L, 2L), 
    y2 = c(3L, 3L, 3L, 3L), y3 = c(4L, 4L, 4L, 4L), y4 = c(5L, 
    5L, 5L, 5L), y5 = c(6L, 6L, 6L, 6L), y6 = c(7L, 7L, 7L, 7L
    ), y7 = c(8L, 8L, 8L, 8L), y8 = c(9L, 9L, 9L, 9L), y9 = c(10L, 
    10L, 10L, 10L), y10 = c(11L, 11L, 11L, 11L), yb0 = c(12L, 
    12L, 12L, 12L), yb1 = c(13L, 13L, 13L, 13L), yb2 = c(14L, 
    14L, 14L, 14L), yb3 = c(15L, 15L, 15L, 15L), yb4 = c(16L, 
    16L, 16L, 16L), yb5 = c(17L, 17L, 17L, 17L), yb6 = c(18L, 
    18L, 18L, 18L), yb7 = c(19L, 19L, 19L, 19L), yb8 = c(20L, 
    20L, 20L, 20L), yb9 = c(21L, 21L, 21L, 21L), yb10 = c(22L, 
    22L, 22L, 22L)), class = "data.frame", row.names = c(NA, 
-4L))
0
Yuriy Saraykin On

tidyverse

# Sample data: four identical rows in which y0..y10 hold 1..11 and
# yb0..yb10 hold 12..22, all stored as integers.
df <- as.data.frame(
  setNames(
    lapply(1:22, rep, times = 4L),
    c(paste0("y", 0:10), paste0("yb", 0:10))
  )
)

library(tidyverse)
library(glue)

# Distinct numeric suffixes found in the column names ("0", "1", ...).
nmb <- unique(str_extract(names(df), "\\d+"))

# For each suffix, add up every column whose name ends in a lowercase letter
# followed by that suffix (here y<k> and yb<k>), then label the sums y<k>.
set_names(
  map_dfc(
    nmb,
    function(suffix) {
      has_suffix <- str_detect(names(df), glue("[a-z]{suffix}$"))
      rowSums(df[has_suffix])
    }
  ),
  nm = str_c("y", nmb)
)

#> # A tibble: 4 x 11
#>      y0    y1    y2    y3    y4    y5    y6    y7    y8    y9   y10
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1    13    15    17    19    21    23    25    27    29    31    33
#> 2    13    15    17    19    21    23    25    27    29    31    33
#> 3    13    15    17    19    21    23    25    27    29    31    33
#> 4    13    15    17    19    21    23    25    27    29    31    33

or

df %>%
  mutate(idrows = row_number()) %>%
  pivot_longer(
    cols = -idrows,
    names_to = c(".value", "set"),
    names_pattern = "([a-zA-Z]+)(\\d+)"
  ) %>%
  mutate(res = rowSums(pick(starts_with("y")))) %>%
  pivot_wider(
    id_cols = idrows,
    names_from = set,
    values_from = res,
    names_prefix = "y"