Remove 'duplicate' rows based on combinations in two columns

89 Views Asked by At

I have this example data.frame:

df1 <- data.frame(v1 = c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'),
                  v2 = c('B', 'A', 'D', 'C', 'F', 'E', 'H', 'G'),
                  value = c(1.12, 1.12, 12.52, 12.52, 3.19, 3.19, 12.52, 12.52))
> df1
  v1 v2 value
1  A  B  1.12
2  B  A  1.12
3  C  D 12.52
4  D  C 12.52
5  E  F  3.19
6  F  E  3.19
7  G  H 12.52
8  H  G 12.52

Combinations such as A and B in row 1 are the same to me as combinations such as B and A, where values in column value are also the same. How can I remove rows which for my purpose are duplicates?

Expected result:

df2 <- data.frame(v1 = c('A', 'C', 'E', 'G'),
                  v2 = c('B', 'D', 'F', 'H'),
                  value = c(1.12, 12.52, 3.19, 12.52))

> df2
  v1 v2 value
1  A  B  1.12
2  C  D 12.52
3  E  F  3.19
4  G  H 12.52
3

There are 3 best solutions below

1
Ali On BEST ANSWER

The idea is to consider v1 and v2 interchangeable.

df1 <- data.frame(v1 = c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'),
                  v2 = c('B', 'A', 'D', 'C', 'F', 'E', 'H', 'G'),
                  value = c(1.12, 1.12, 12.52, 12.52, 3.19, 3.19, 12.52, 12.52))

### with tidyverse:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)

df2 <- df1 %>%
        mutate(combination = pmap_chr(list(v1, v2), ~ paste(sort(c(..1, ..2)), collapse = ","))) %>%
        filter(!duplicated(combination)) %>%
        select(-combination)

df2
#>   v1 v2 value
#> 1  A  B  1.12
#> 2  C  D 12.52
#> 3  E  F  3.19
#> 4  G  H 12.52

### Base R:
df2 <- df1[!duplicated(t(apply(df1[, c("v1", "v2")], 1, sort))), ]

df2
#>   v1 v2 value
#> 1  A  B  1.12
#> 3  C  D 12.52
#> 5  E  F  3.19
#> 7  G  H 12.52

Created on 2023-12-24 with reprex v2.0.2

0
jay.sf On

You can concatenate element-wise min and max of v1 and v2, and see if they are duplicated.

> df1 <- structure(list(v1 = c("A", "B", "C", "D", "E", "F", "G", "H"), 
+                       v2 = c("B", "A", "D", "C", "F", "E", "H", "G"), value = c(1.12, 
+                                                                                 1.12, 12.52, 12.52, 3.19, 3.19, 12.52, 12.52)), class = "data.frame", row.names = c(NA, 
+                                                                                                                                                                     -8L))
> df1[!with(df1, mapply(\(...) c(min(...), max(...)), v1, v2)) |> t() |> duplicated(), ]
  v1 v2 value
1  A  B  1.12
3  C  D 12.52
5  E  F  3.19
7  G  H 12.52

Benchmark

It's pretty fast.

> dfB <- df1[sample.int(nrow(df1), 1e4, replace=TRUE), ]
> library(dplyr); library(purrr)
> microbenchmark::microbenchmark(
+   mapply=dfB[!with(dfB, mapply(\(...) c(min(...), max(...)), v1, v2)) |> t() |> duplicated(), ],
+   apply=dfB[!duplicated(t(apply(dfB[, c("v1", "v2")], 1, sort))), ],
+   tidy=dfB %>%
+     mutate(combination = pmap_chr(list(v1, v2), ~ paste(sort(c(..1, ..2)), collapse = ","))) %>%
+     filter(!duplicated(combination)) %>%
+     select(-combination),
+   check='equivalent'
+ )

$ Rscript --vanilla foo.R
Unit: milliseconds
   expr       min        lq      mean    median        uq      max neval cld
 mapply  69.62856  76.20623  83.45243  81.29934  89.74701 110.8039   100 a  
  apply 514.42432 559.80302 593.04269 594.35724 620.06086 728.9266   100  b 
   tidy 339.91324 368.55064 389.65171 386.97757 410.54651 485.9414   100   c

Data:

> dput(df1)
structure(list(v1 = c("A", "B", "C", "D", "E", "F", "G", "H"), 
    v2 = c("B", "A", "D", "C", "F", "E", "H", "G"), value = c(1.12, 
    1.12, 12.52, 12.52, 3.19, 3.19, 12.52, 12.52)), class = "data.frame", row.names = c(NA, 
-8L))
1
TarJae On

A dplyr only solution using pmax and pmin:

library(dplyr)

df1 %>% 
  distinct(pmin(v1, v2), pmax(v1, v2), value, .keep_all = TRUE) %>% 
  select(1:3)
 v1 v2 value
1  A  B  1.12
2  C  D 12.52
3  E  F  3.19
4  G  H 12.52