R function to calculate moving average with varying number of measurements for a given time point

125 Views Asked by At

For each time point in my data frame, I have any where from 2 to 4 measurements. I want to calculate the moving average, so that for a given time point I have one value that is the average of all measurements for that time point + the time point before and the time point after.

cellcounts <-c(80, 188, 206, 162, 106,  90,  85, 109,  87,  94,  86, 196, 132, 135,  84, 122,  67,  88,  81, 121,   9,  93, 117, 91, 108, 103, 119, 100,  18,  98,  93, 119, 140, 160, 101,  82, 111, 103,  28,  72, 144,  85,   1)
time <-c(-2.7, -2.8, -2.9, -3.0, -3.1, -3.2, -3.3, -3.4, -3.5, -3.6, -2.7, -2.8, -2.9, -3.0, -3.1, -3.2, -3.3, -3.4, -3.5, -3.6, -3.9, -3.0, -3.1, -3.2, -3.3, -3.4, -3.5, -3.7, -2.5, -2.6, -2.9, -3.0, -3.2, -3.3, -3.4, -3.5, -3.7, -3.8, -2.5, -2.6, -3.7, -3.8, -3.9)
df <- data.frame(cellcounts, time)
df <- df[order(df$time),]
df
zoo::rollapply(df, width = 3, FUN = mean, align = "center", fill = NA)
5

There are 5 best solutions below

2
jblood94 On BEST ANSWER

In base R:

with(
  rle(df$time),
  {
    cs_x <- c(NA, 1L, cs_x <- cumsum(lengths) + 1L, NA)
    cs_y <- c(0, cumsum(df$cellcounts))
    data.frame(
      time = values,
      av_cellcounts = (cs_y[cs_x[-(1:3)]] - cs_y[cs_x[1:(length(cs_x) - 3L)]])/
        (cs_x[-(1:3)] - cs_x[1:(length(cs_x) - 3L)])
    )
  }
)
#>    time av_cellcounts
#> 1  -3.9            NA
#> 2  -3.8      79.00000
#> 3  -3.7     108.28571
#> 4  -3.6     104.33333
#> 5  -3.5      98.50000
#> 6  -3.4      99.16667
#> 7  -3.3     105.33333
#> 8  -3.2     106.36364
#> 9  -3.1     114.45455
#> 10 -3.0     124.70000
#> 11 -2.9     147.11111
#> 12 -2.8     140.14286
#> 13 -2.7     120.00000
#> 14 -2.6      63.66667
#> 15 -2.5            NA

Or, averaging only 2 time steps at the endpoints:

with(
  rle(df$time),
  {
    cs_x <- c(1L, 1L, cs_x <- cumsum(lengths) + 1L, cs_x[length(cs_x)])
    cs_y <- c(0, cumsum(df$cellcounts))
    data.frame(
      time = values,
      av_cellcounts = (cs_y[cs_x[-(1:3)]] - cs_y[cs_x[1:(length(cs_x) - 3L)]])/
        (cs_x[-(1:3)] - cs_x[1:(length(cs_x) - 3L)])
    )
  }
)
#>    time av_cellcounts
#> 1  -3.9      49.50000
#> 2  -3.8      79.00000
#> 3  -3.7     108.28571
#> 4  -3.6     104.33333
#> 5  -3.5      98.50000
#> 6  -3.4      99.16667
#> 7  -3.3     105.33333
#> 8  -3.2     106.36364
#> 9  -3.1     114.45455
#> 10 -3.0     124.70000
#> 11 -2.9     147.11111
#> 12 -2.8     140.14286
#> 13 -2.7     120.00000
#> 14 -2.6      63.66667
#> 15 -2.5      54.00000
1
Rui Barradas On

Something like this?

Base R

Use tapplyto compute the means by time, then compute the rolling means.

x <- with(df, tapply(cellcounts, time, mean)) |>
  zoo::rollapply(width = 3, FUN = mean, align = "center", fill = NA)

## make a data.frame from the result above
df1 <- data.frame(time = as.numeric(names(x)), cellmeans = x)
df1
#>      time cellmeans
#> -3.9 -3.9        NA
#> -3.8 -3.8  72.44444
#> -3.7 -3.7 106.61111
#> -3.6 -3.6 106.02778
#> -3.5 -3.5 100.00000
#> -3.4 -3.4  99.16667
#> -3.3 -3.3 105.33333
#> -3.2 -3.2 106.02778
#> -3.1 -3.1 113.44444
#> -3   -3.0 124.41667
#> -2.9 -2.9 154.30556
#> -2.8 -2.8 139.55556
#> -2.7 -2.7 120.00000
#> -2.6 -2.6  63.66667
#> -2.5 -2.5        NA

# another way of creating a results df
df2 <- x |> as.data.frame()
df2$time <- x |> names() |> as.numeric()
df2
#>              x time
#> -3.9        NA -3.9
#> -3.8  72.44444 -3.8
#> -3.7 106.61111 -3.7
#> -3.6 106.02778 -3.6
#> -3.5 100.00000 -3.5
#> -3.4  99.16667 -3.4
#> -3.3 105.33333 -3.3
#> -3.2 106.02778 -3.2
#> -3.1 113.44444 -3.1
#> -3   124.41667 -3.0
#> -2.9 154.30556 -2.9
#> -2.8 139.55556 -2.8
#> -2.7 120.00000 -2.7
#> -2.6  63.66667 -2.6
#> -2.5        NA -2.5

Created on 2023-09-20 with reprex v2.0.2

And with aggregate.

df3 <- aggregate(cellcounts ~ time, df, mean)
df3$cellcounts <- zoo::rollapply(df3$cellcounts, width = 3, FUN = mean, align = "center", fill = NA)
df3
#>    time cellcounts
#> 1  -3.9         NA
#> 2  -3.8   72.44444
#> 3  -3.7  106.61111
#> 4  -3.6  106.02778
#> 5  -3.5  100.00000
#> 6  -3.4   99.16667
#> 7  -3.3  105.33333
#> 8  -3.2  106.02778
#> 9  -3.1  113.44444
#> 10 -3.0  124.41667
#> 11 -2.9  154.30556
#> 12 -2.8  139.55556
#> 13 -2.7  120.00000
#> 14 -2.6   63.66667
#> 15 -2.5         NA

Created on 2023-09-20 with reprex v2.0.2


Package dplyr

Use summariseto compute the means grouped by time, then compute the rolling means.

suppressPackageStartupMessages(
  library(dplyr)
)

df %>%
  group_by(time) %>%
  summarise(cellmeans = mean(cellcounts, .groups = "drop")) %>%
  mutate(cellmeans = zoo::rollapply(cellmeans, width = 3, FUN = mean, align = "center", fill = NA))
#> # A tibble: 15 × 2
#>     time cellmeans
#>    <dbl>     <dbl>
#>  1  -3.9      NA  
#>  2  -3.8      72.4
#>  3  -3.7     107. 
#>  4  -3.6     106. 
#>  5  -3.5     100  
#>  6  -3.4      99.2
#>  7  -3.3     105. 
#>  8  -3.2     106. 
#>  9  -3.1     113. 
#> 10  -3       124. 
#> 11  -2.9     154. 
#> 12  -2.8     140. 
#> 13  -2.7     120  
#> 14  -2.6      63.7
#> 15  -2.5      NA

Created on 2023-09-20 with reprex v2.0.2

1
Till On

The zoo-FAQ recomends de-duplicating a time-series with multiple measures per time.

Here is an example that aggregate()s the duplicated measures with mean() and applies the rolling average afterward.

library(zoo)

z <- zoo(cellcounts, time)
#> Warning in zoo(cellcounts, time): some methods for "zoo" objects do not work if
#> the index entries in 'order.by' are not unique
z <- aggregate(z, identity, mean)

rollapply(
  z,
  width = 3,
  FUN = mean,
  align = "center",
  # partial = TRUE, # uncomment to include means for first and last entries
  fill = NA
)
#>      -3.9      -3.8      -3.7      -3.6      -3.5      -3.4      -3.3      -3.2 
#>        NA  72.44444 106.61111 106.02778 100.00000  99.16667 105.33333 106.02778 
#>      -3.1        -3      -2.9      -2.8      -2.7      -2.6      -2.5 
#> 113.44444 124.41667 154.30556 139.55556 120.00000  63.66667        NA
2
G. Grothendieck On

The question did not specify what the result should be so we will assume the desired result is a rolling mean vector with the same number of elements as rows in the input df. (2) and (3) compress and decompress the data so if a compressed version is wanted then just omit the rep.int (i.e. last) step in either of those two.

Note that we can't simply compress the data and use rollapply since the different runs are of different lengths. These solutions take that into account. They all use rollapply but take different approaches to using it.

(2) seems particularly short in terms of lines of code.

1) list of offsets The width argument to rollapply can be a list of vectors representing offsets to use so create that list, offsets, and then run rollapply.

We assume that the prior time is current time -0.1 and the next time is current time + 0.1 .

We convert to integer times to avoid floating point imprecision. before is the number of positions back to start and after is the number forward to end at.

library(zoo)

itime <- as.integer(10 * df$time + 1e-5) # integer time
seqno <- seq_along(itime)
before <- match(itime - 1L, itime, nomatch = seqno) - seqno
after <- findInterval(itime + 1L, itime) - seqno
offsets <- Map(seq, before, after)

means <- rollapply(df$cellcounts, offsets, mean)

2) compress/decompress Take sums and lengths over same time runs, perform rollapply on each and divide. Then decompress using rep.int .

roll <- function(x) rollapply(x, 3, sum, partial = TRUE)

means2 <- df |>
  aggregate(cbind(cellcounts, lens = 1) ~ time, data = _, sum) |>
  with((roll(cellcounts) / roll(lens)) |> rep.int(lens))

3) weighted.mean Similar to (2) but we combine the means rather than the sums by using weighted.mean.

wm <- function(x, m = rbind(x)) weighted.mean(m[, 1], m[, 2])

means3 <- df |>
 with(data.frame(means = tapply(cellcounts, time, mean),
                 lens = tapply(cellcounts, time, length))) |>
 transform(means = rollapply(cbind(means, lens), 3, wm, partial = TRUE,
   by.column = FALSE)) |>
 with(rep.int(means, lens))

4) wide form This adds sequence numbers, seqno, to each consecutive equal time in a run (1 for first row in a run, 2 for the next and so on) and then uses read.zoo to convert to wide form. We then use rollapply on that producing a condensed form of the answer and as above use rep.int to expand that out to the final result.

means4 <- df |>
  transform(seqno = ave(time, time, FUN = seq_along)) |>
  read.zoo(index = "time", split = "seqno") |>
  rollapply(3, mean, na.rm = TRUE, partial = TRUE, by.column = FALSE) |>
  rep.int(rle(df$time)$lengths)

Checking

identical(means, means2)
## [1] TRUE

identical(means, means3)
## [1] TRUE

identical(means, means4)
## [1] TRUE

Thus all four give the same answer which is

means
##  [1]  49.50000  49.50000  79.00000  79.00000 108.28571 108.28571 108.28571
##  [8] 104.33333 104.33333  98.50000  98.50000  98.50000  98.50000  99.16667
## [15]  99.16667  99.16667  99.16667 105.33333 105.33333 105.33333 105.33333
## [22] 106.36364 106.36364 106.36364 106.36364 114.45455 114.45455 114.45455
## [29] 124.70000 124.70000 124.70000 124.70000 147.11111 147.11111 147.11111
## [36] 140.14286 140.14286 120.00000 120.00000  63.66667  63.66667  54.00000
## [43]  54.00000
2
NicChr On

Using dplyr and slider

We use consecutive_id to create an increasing integer ID, ensuring that adjacent time points are always separated by a distance of 1.

We then pass that ID to slide_index_mean which takes duplicate time points into account by calculating means for the shared time intervals.

library(dplyr)
library(slider)
df %>%
  mutate(time_id = consecutive_id(time)) %>%
  mutate(mean = slide_index_mean(cellcounts, i = time_id, before = 1, after = 1)) %>%
  distinct(time_id, .keep_all = TRUE) %>%
  select(time, mean)

   time    mean
1  -3.9  49.500
2  -3.8  79.000
3  -3.7 108.286
4  -3.6 104.333
5  -3.5  98.500
6  -3.4  99.167
7  -3.3 105.333
8  -3.2 106.364
9  -3.1 114.455
10 -3.0 124.700
11 -2.9 147.111
12 -2.8 140.143
13 -2.7 120.000
14 -2.6  63.667
15 -2.5  54.000