How to select for rows in a dataframe that have a time within 30 minutes of another dataframe

75 Views Asked by At

I have 2 datasets that looks like the following (with about 600 total entries for both) Dataframe1

name A_time measurement_A
A 6/10/22 7:22 3.4
A 6/10/22 16:15 4.4
A 6/11/22 6:15 5.5
A 6/11/22 15:44 6.6
B 5/20/22 6:30 7.6
B 5/20/22 13:21 1.2
B 5/21/22 8:43 2.4
B 5/21/22 14:35 4.4

Dataframe2

name B_time measurement_B
A 6/10/22 7:24 50
A 6/10/22 16:55 50.1
A 6/11/22 6:13 52
A 6/11/22 15:21 61
B 5/20/22 6:30 54
B 5/20/22 13:22 53
B 5/21/22 8:00 54.6
B 5/21/22 14:12 76.1

I want to combine these two dataframes by name and date, selecting for rows only where Dataframe1 and Dataframe2 are within 30 minutes of each other.

I'm hoping to get something like this: Dataframe3

name A_time measurement_A measurement_B
A 6/10/22 7:22 3.4 50
A 6/11/22 6:15 5.5 52
A 6/11/22 15:44 6.6 61
B 5/20/22 6:30 7.6 54
B 5/20/22 13:21 1.2 53
B 5/21/22 14:35 4.4 76.1

The code I am using:

library(dplyr)

dataframe3 <- dataframe1 %>%
  inner_join(dataframe2, by = "name") %>%
  filter(abs(A_time - B_time) <= as.difftime(30, units = "mins"))

But I am getting an error because the names in dataframe1 correlated to multiple rows in dataframe2. How can I get the Dataframe3 that I want?

4

There are 4 best solutions below

0
user2974951 On

If your system can handle an almost cross join, lubridate for dates

library(lubridate)

df1$A_time <- as_datetime(df1$A_time, format="%m/%d/%y %H:%M")
df2$B_time <- as_datetime(df2$B_time, format="%m/%d/%y %H:%M")

df3 <- merge(df1, df2, by="name")
df3[abs(difftime(df3$A_time, df3$B_time, units="mins")) <= 30, ]   
#    name              A_time measurement_A              B_time measurement_B
# 1     A 2022-06-10 07:22:00           3.4 2022-06-10 07:24:00          50.0
# 11    A 2022-06-11 06:15:00           5.5 2022-06-11 06:13:00          52.0
# 16    A 2022-06-11 15:44:00           6.6 2022-06-11 15:21:00          61.0
# 17    B 2022-05-20 06:30:00           7.6 2022-05-20 06:30:00          54.0
# 22    B 2022-05-20 13:21:00           1.2 2022-05-20 13:22:00          53.0
# 32    B 2022-05-21 14:35:00           4.4 2022-05-21 14:12:00          76.1
0
jay.sf On

merge, then subset, conveniently in a list.

Reduce(\(...) merge(..., all=TRUE), list(d1, d2)) |>
  subset(abs(as.integer(A_time - B_time)/60L) <= 30L, -B_time)
#    name              A_time measurement_A measurement_B
# 7     A 2022-06-11 06:15:00           5.5          52.0
# 12    A 2022-06-11 15:44:00           6.6          61.0
# 13    A 2022-06-10 07:22:00           3.4          50.0
# 18    B 2022-05-20 06:30:00           7.6          54.0
# 21    B 2022-05-20 13:21:00           1.2          53.0
# 32    B 2022-05-21 14:35:00           4.4          76.1

Data (assorted rows!):

d1 <- structure(list(name = c("A", "A", "A", "A", "B", "B", "B", "B"
), A_time = c("6/10/22 16:15", "6/11/22 6:15", "6/11/22 15:44", 
"6/10/22 7:22", "5/20/22 6:30", "5/20/22 13:21", "5/21/22 8:43", 
"5/21/22 14:35"), measurement_A = c(4.4, 5.5, 6.6, 3.4, 7.6, 
1.2, 2.4, 4.4)), class = "data.frame", row.names = c(NA, -8L))

d2 <- structure(list(name = c("A", "A", "A", "A", "B", "B", "B", "B"
), B_time = c("6/10/22 7:24", "6/10/22 16:55", "6/11/22 6:13", 
"6/11/22 15:21", "5/20/22 13:22", "5/20/22 6:30", "5/21/22 8:00", 
"5/21/22 14:12"), measurement_B = c(50, 50.1, 52, 61, 53, 54, 
54.6, 76.1)), class = "data.frame", row.names = c(NA, -8L))

d1$A_time <- strptime(d1$A_time, '%m/%d/%y %H:%M')
d2$B_time <- strptime(d2$B_time, '%m/%d/%y %H:%M')
0
Darren Tsai On

You can set relationship = "many-to-many" in inner_join() to silence the warning.

df1 %>%
  mutate(A_time = strptime(A_time, '%m/%d/%y %H:%M')) %>%
  inner_join(df2 %>% mutate(B_time = strptime(B_time, '%m/%d/%y %H:%M')),
             by = "name", relationship = "many-to-many") %>%
  filter(abs(difftime(A_time, B_time, units = "mins")) <= 30) %>%
  select(-B_time)

Another dplyr solution with overlap joins:

library(dplyr)

df1 %>%
  mutate(A_time = strptime(A_time, '%m/%d/%y %H:%M'),
         A_lower = A_time - 30*60,
         A_upper = A_time + 30*60) %>%
  inner_join(df2 %>% mutate(B_time = strptime(B_time, '%m/%d/%y %H:%M')),
             by = join_by(name, between(y$B_time, x$A_lower, x$A_upper))) %>%
  select(-c(A_lower, A_upper, B_time))

# # A tibble: 6 × 4
#   name  A_time              measurement_A measurement_B
#   <chr> <dttm>                      <dbl>         <dbl>
# 1 A     2022-06-10 07:22:00           3.4          50  
# 2 A     2022-06-11 06:15:00           5.5          52  
# 3 A     2022-06-11 15:44:00           6.6          61  
# 4 B     2022-05-20 06:30:00           7.6          54  
# 5 B     2022-05-20 13:21:00           1.2          53  
# 6 B     2022-05-21 14:35:00           4.4          76.1

Data
df1 <- structure(list(name = c("A", "A", "A", "A", "B", "B", "B", "B"
), A_time = c("6/10/22 7:22", "6/10/22 16:15", "6/11/22 6:15", 
"6/11/22 15:44", "5/20/22 6:30", "5/20/22 13:21", "5/21/22 8:43", 
"5/21/22 14:35"), measurement_A = c(3.4, 4.4, 5.5, 6.6, 7.6, 
1.2, 2.4, 4.4)), class = "data.frame", row.names = c(NA, -8L))

df2 <- structure(list(name = c("A", "A", "A", "A", "B", "B", "B", "B"
), B_time = c("6/10/22 7:24", "6/10/22 16:55", "6/11/22 6:13", 
"6/11/22 15:21", "5/20/22 6:30", "5/20/22 13:22", "5/21/22 8:00", 
"5/21/22 14:12"), measurement_B = c(50, 50.1, 52, 61, 54, 53, 
54.6, 76.1)), class = "data.frame", row.names = c(NA, -8L))
0
ThomasIsCoding On

You can use merge with all = TRUE + subset

subset(
    merge(df1, df2, all = TRUE),
    abs(difftime(
        strptime(A_time, "%m/%d/%y %H:%M"),
        strptime(B_time, "%m/%d/%y %H:%M"),
        units = "mins"
    )) <= 30,
    select = -B_time
)

which gives

   name        A_time measurement_A measurement_B
7     A  6/11/22 6:15           5.5          52.0
12    A 6/11/22 15:44           6.6          61.0
13    A  6/10/22 7:22           3.4          50.0
18    B  5/20/22 6:30           7.6          54.0
21    B 5/20/22 13:21           1.2          53.0
32    B 5/21/22 14:35           4.4          76.1