Remove 'duplicate' rows based on combinations in two columns

Question

Remove 'duplicate' rows based on combinations in two columns

89 Views Asked by Sylvia Rodriguez At 23 December 2023 at 21:34

I have this example data.frame:

df1 <- data.frame(v1 = c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'),
                  v2 = c('B', 'A', 'D', 'C', 'F', 'E', 'H', 'G'),
                  value = c(1.12, 1.12, 12.52, 12.52, 3.19, 3.19, 12.52, 12.52))
> df1
  v1 v2 value
1  A  B  1.12
2  B  A  1.12
3  C  D 12.52
4  D  C 12.52
5  E  F  3.19
6  F  E  3.19
7  G  H 12.52
8  H  G 12.52

Combinations such as A and B in row 1 are the same to me as combinations such as B and A, where values in column value are also the same. How can I remove rows which for my purpose are duplicates?

Expected result:

df2 <- data.frame(v1 = c('A', 'C', 'E', 'G'),
                  v2 = c('B', 'D', 'F', 'H'),
                  value = c(1.12, 12.52, 3.19, 12.52))

> df2
  v1 v2 value
1  A  B  1.12
2  C  D 12.52
3  E  F  3.19
4  G  H 12.52

Original Q&A

There are 3 best solutions below

jay.sf On 24 December 2023 at 05:03

You can concatenate element-wise min and max of v1 and v2, and see if they are duplicated.

> df1 <- structure(list(v1 = c("A", "B", "C", "D", "E", "F", "G", "H"), 
+                       v2 = c("B", "A", "D", "C", "F", "E", "H", "G"), value = c(1.12, 
+                                                                                 1.12, 12.52, 12.52, 3.19, 3.19, 12.52, 12.52)), class = "data.frame", row.names = c(NA, 
+                                                                                                                                                                     -8L))
> df1[!with(df1, mapply(\(...) c(min(...), max(...)), v1, v2)) |> t() |> duplicated(), ]
  v1 v2 value
1  A  B  1.12
3  C  D 12.52
5  E  F  3.19
7  G  H 12.52

Benchmark

It's pretty fast.

> dfB <- df1[sample.int(nrow(df1), 1e4, replace=TRUE), ]
> library(dplyr); library(purrr)
> microbenchmark::microbenchmark(
+   mapply=dfB[!with(dfB, mapply(\(...) c(min(...), max(...)), v1, v2)) |> t() |> duplicated(), ],
+   apply=dfB[!duplicated(t(apply(dfB[, c("v1", "v2")], 1, sort))), ],
+   tidy=dfB %>%
+     mutate(combination = pmap_chr(list(v1, v2), ~ paste(sort(c(..1, ..2)), collapse = ","))) %>%
+     filter(!duplicated(combination)) %>%
+     select(-combination),
+   check='equivalent'
+ )

$ Rscript --vanilla foo.R
Unit: milliseconds
   expr       min        lq      mean    median        uq      max neval cld
 mapply  69.62856  76.20623  83.45243  81.29934  89.74701 110.8039   100 a  
  apply 514.42432 559.80302 593.04269 594.35724 620.06086 728.9266   100  b 
   tidy 339.91324 368.55064 389.65171 386.97757 410.54651 485.9414   100   c

Data:

> dput(df1)
structure(list(v1 = c("A", "B", "C", "D", "E", "F", "G", "H"), 
    v2 = c("B", "A", "D", "C", "F", "E", "H", "G"), value = c(1.12, 
    1.12, 12.52, 12.52, 3.19, 3.19, 12.52, 12.52)), class = "data.frame", row.names = c(NA, 
-8L))

TarJae On 24 December 2023 at 08:56

A dplyr only solution using pmax and pmin:

library(dplyr)

df1 %>% 
  distinct(pmin(v1, v2), pmax(v1, v2), value, .keep_all = TRUE) %>% 
  select(1:3)

 v1 v2 value
1  A  B  1.12
2  C  D 12.52
3  E  F  3.19
4  G  H 12.52

**Ali** · Accepted Answer · 2023-12-23T21:49:09.027000

The idea is to consider v1 and v2 interchangeable.

df1 <- data.frame(v1 = c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'),
                  v2 = c('B', 'A', 'D', 'C', 'F', 'E', 'H', 'G'),
                  value = c(1.12, 1.12, 12.52, 12.52, 3.19, 3.19, 12.52, 12.52))

### with tidyverse:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)

df2 <- df1 %>%
        mutate(combination = pmap_chr(list(v1, v2), ~ paste(sort(c(..1, ..2)), collapse = ","))) %>%
        filter(!duplicated(combination)) %>%
        select(-combination)

df2
#>   v1 v2 value
#> 1  A  B  1.12
#> 2  C  D 12.52
#> 3  E  F  3.19
#> 4  G  H 12.52

### Base R:
df2 <- df1[!duplicated(t(apply(df1[, c("v1", "v2")], 1, sort))), ]

df2
#>   v1 v2 value
#> 1  A  B  1.12
#> 3  C  D 12.52
#> 5  E  F  3.19
#> 7  G  H 12.52

^{Created on 2023-12-24 with reprex v2.0.2}

Remove 'duplicate' rows based on combinations in two columns

There are 3 best solutions below

Benchmark

Related Questions in R

Related Questions in DUPLICATES

Related Questions in ROW

Related Questions in MULTIPLE-COLUMNS

Related Questions in DROP-DUPLICATES

Trending Questions

Popular # Hahtags

Popular Questions