Optimize cumsum with large data

97 Views Asked by At

I'm trying to write a code that get every 7th cumulative sum value from a vector, but it's running very slowly using a loop structure, because my data is so large.

Here is the code that i write. The results are the expected but, as i said, it is running very slowly:

for (i in 1:50000) {
  B[i] <- cumsum(A[i:50000])[7]
}

Example: A = 4 1 2 2 6 2 6 6 1 10 5 8 6 9 1

The results must be:

B[1] = (4+1+2+2+6+2+6) = 23

B[2] = (1+2+2+6+2+6+6) = 25

and so...

Thank you.

3

There are 3 best solutions below

2
Onyambu On

Base R:

use filter:

c(na.omit(filter(A, rep(1,7)))) # use stats::filter if tidyverse is loaded
[1] 23 25 25 33 36 38 42 45 40

use convolve

n <- 7
l <- length(A)
p <- l - n + 1
convolve(A, rep(1:0,c(n, p - 1)))[1:p]
[1] 23 25 25 33 36 38 42 45 40

use embed:

rowSums(embed(A, 7))
[1] 23 25 25 33 36 38 42 45 40

use sapply:

i <- 1:7 - 1
sapply(1:p, \(x)sum(A[x+i]))
[1] 23 25 25 33 36 38 42 45 40

Other packages:

zoo::rollsum(A, 7)
[1] 23 25 25 33 36 38 42 45 40
0
jblood94 On

All you need is cumsum:

cs <- c(0, cumsum(A))
cs[8:length(cs)] - cs[1:(length(cs) - 7)]
#> [1] 23 25 25 33 36 38 42 45 40

diff with cumsum also works:

diff(c(0, cumsum(A)), 7)
#> [1] 23 25 25 33 36 38 42 45 40

Benchmarking

Functions:

cumsumrollsum <- function(x, window) {
  cs <- c(vector(mode(x), 1), cumsum(x))
  cs[(window + 1):length(cs)] - cs[1:(length(cs) - window)]
}

filterrollsum <- function(x, window) c(na.omit(filter(x, rep(1, window))))

convolverollsum <- function(x, window) {
  p <- length(x) - window + 1
  convolve(x, rep(1:0, c(window, p - 1)))[1:p]
}

embedrollsum <- function(x, window) rowSums(embed(x, window))

sapplyrollsum <- function(x, window) {
  i <- 1:window - 1
  sapply(1:(length(x) - window + 1), \(y) sum(x[y + i]))
}

reducerollsum <- function(x, window) {
  Reduce(`+`, tail(x, -window) - head(x, -window), sum(head(x, window)),
         accumulate = TRUE)
}

library(zoo) # for `rollsum`
library(slider) # for `slide_dbl`

Data:

A <- runif(1e5)

Timings:

bench::mark(
  cumsumrollsum = cumsumrollsum(A, 7),
  diffcumsum = diff(c(0, cumsum(A)), 7),      
  filterrollsum = filterrollsum(A, 7),
  convolverollsum = convolverollsum(A, 7),
  embedrollsum = embedrollsum(A, 7),
  sapplyrollsum = sapplyrollsum(A, 7),
  reducerollsum = reducerollsum(A, 7),
  rollsum = rollsum(A, 7),
  slide_dbl = c(na.omit(slide_dbl(A, sum, .after = 6, .complete = TRUE)))
)
#> # A tibble: 9 × 6
#>   expression           min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>      <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 cumsumrollsum     2.27ms   2.65ms    281.      3.82MB    25.9 
#> 2 diffcumsum        2.08ms   3.48ms    212.      5.34MB    29.7 
#> 3 filterrollsum    12.59ms  17.18ms     48.4    10.87MB    19.4 
#> 4 convolverollsum  16.59ms  22.94ms     37.2     6.89MB     5.88
#> 5 embedrollsum     11.34ms  16.24ms     56.9      9.2MB    15.7 
#> 6 sapplyrollsum   180.82ms 191.53ms      4.55    3.29MB    16.7 
#> 7 reducerollsum    84.01ms  98.58ms     10.3     4.61MB    15.5 
#> 8 rollsum         181.46ms 190.58ms      4.88   33.76MB     3.26
#> 9 slide_dbl        60.52ms  64.13ms     11.1     4.28MB    14.8
0
ThomasIsCoding On

Given A <- c(4, 1, 2, 2, 6, 2, 6, 6, 1, 10, 5, 8, 6, 9, 1), you can try Reduce like below

> Reduce(`+`, tail(A, -7) - head(A, -7), sum(head(A, 7)), accumulate = TRUE)
[1] 23 25 25 33 36 38 42 45 40

or a similar logic but more speedy

> cumsum(c(sum(head(A, 7)),tail(A, -7) - head(A, -7)))
[1] 23 25 25 33 36 38 42 45 40

If you want to be even faster, you can try Rcpp like below

library(Rcpp)
sourceCpp(code = "
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector csumCpp(NumericVector x, int window) {
  int n = x.size();
  NumericVector res(n-window+1);

  double init = sum(x[Range(0, window - 1)]);
  res[0] = init;

  for (int i = 1; i < res.size(); i++) {
    res[i] = res[i - 1] + x[i + window - 1] - x[i - 1];
  }

  return res;
}")

Benchmarking

cumsumrollsum <- function(x, window) {
    cs <- c(vector(mode(x), 1), cumsum(x))
    cs[(window + 1):length(cs)] - cs[1:(length(cs) - window)]
}

csum <- function(x, window) {
    cumsum(c(sum(x[1:window]), x[-(1:window)] - x[1:(length(x) - window)]))
}

library(Rcpp)
sourceCpp(code = "
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector csumCpp(NumericVector x, int window) {
  int n = x.size();
  NumericVector res(n-window+1);

  double init = sum(x[Range(0, window - 1)]);
  res[0] = init;

  for (int i = 1; i < res.size(); i++) {
    res[i] = res[i - 1] + x[i + window - 1] - x[i - 1];
  }

  return res;
}")

set.seed(0)
A <- runif(1e6)
microbenchmark(
    cumsumrollsum = cumsumrollsum(A, 7),
    csum = csum(A, 7),
    csumCpp = csumCpp(A, 7),
    unit = "relative"
)

shows

Unit: relative
          expr      min       lq     mean   median       uq      max neval
 cumsumrollsum 3.092228 3.632468 3.873697 4.130823 4.524083 5.112725   100
          csum 3.434531 4.226501 4.436328 4.865234 4.751307 6.085239   100
       csumCpp 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000   100