Efficiently enumerate all subsets with difference constraints in R


I have a vector V of consecutive integers of length l, e.g., 1, 2, 3, 4, 5, 6, 7. I want to find all subsets of size k such that the difference between any two numbers in the subset is no less than m, e.g., 2. Using the example above with l = 7, k = 3, and m = 2, the subsets are

1, 3, 5
1, 3, 6
1, 3, 7
1, 4, 6
1, 4, 7
1, 5, 7
2, 4, 6
2, 4, 7
2, 5, 7
3, 5, 7

One approach is to enumerate all possible subsets of size k and discard any that fail to meet the m constraint, but the number of candidate subsets explodes even when the number of valid solutions is small.
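
For concreteness, that filter-everything approach might look like the following minimal sketch (the helper name naive is purely illustrative): enumerate every size-k subset with combn() and keep only those whose smallest gap between consecutive sorted values is at least m.

naive <- function(v, k, m) {
    # every subset of size k, one per column; this is the part that explodes
    all_subs <- combn(v, k)
    # keep columns whose consecutive (sorted) values are at least m apart
    keep <- apply(all_subs, 2, function(s) min(diff(sort(s))) >= m)
    all_subs[, keep, drop = FALSE]
}

naive(v = 1:7, k = 3, m = 2)  # 10 columns, matching the listing above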

My current approach is a brute-force algorithm: I start from the subset with the lowest possible entries and increase the last entry by 1 until its upper limit is reached; I then increment the previous entry and reset the last entry to the lowest value allowed by that increment. That is, I start with 1, 3, 5, increase the last digit by one to get 1, 3, 6 and 1, 3, 7, and since the upper limit has then been reached I increment the middle entry by 1 (to 4) and set the last entry to the smallest value it can take given that entry (6) to get 1, 4, 6, and carry on in that fashion. This ends up being quite slow in R for large l, and I'm wondering if there is a clever vectorized solution that can generate all the combinations at once, possibly by capitalizing on the sequential nature of the entries. Implementing this algorithm in Rcpp speeds it up a bit, but I'm still hoping for a more elegant solution if one is available.
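
In base R, the increment-and-carry procedure above looks roughly like this (with V taken to be simply 1:l; the function name gap_subsets is just a placeholder), written to match the prose rather than for speed:

gap_subsets <- function(l, k, m) {
    idx <- seq(1, by = m, length.out = k)  # smallest valid subset, e.g. 1, 3, 5
    if (idx[k] > l) return(list())         # no valid subset exists
    res <- list(idx)
    repeat {
        # find the right-most entry that can still be increased
        i <- k
        while (i >= 1 && idx[i] >= l - (k - i) * m) i <- i - 1
        if (i < 1) break                   # every entry is at its maximum
        # increment it and reset everything to its right to the minimum allowed
        idx[i:k] <- seq(idx[i] + 1, by = m, length.out = k - i + 1)
        res[[length(res) + 1]] <- idx
    }
    res
}

gap_subsets(l = 7, k = 3, m = 2)  # reproduces the ten subsets listed above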

3 Answers

ThomasIsCoding (Best Answer)

Here are several base R options

  • Recursion

We can define a recursive function like the one below

f0 <- function(v, k, m) {
    # base case: every element of v on its own is a valid subset of size 1
    if (k == 1) {
        return(matrix(v))
    }
    # recursively build all valid subsets of size k - 1
    d <- Recall(v, k - 1, m)
    # distinct last values of those subsets, and the values of v exactly m
    # larger than one of them (candidates for the new last column)
    u <- unique(d[, ncol(d)])
    uu <- (u + m)[(u + m) %in% v]
    lst <- list()
    for (i in u) {
        # extend every row ending in i with each candidate at distance >= m
        dd <- d[d[, ncol(d)] == i, , drop = FALSE]
        p <- uu[uu - i >= m]
        if (length(p) > 0) {
            lst <- append(
                lst,
                list(cbind(dd[rep(1:nrow(dd), each = length(p)), , drop = FALSE], p))
            )
        }
    }
    unname(do.call(rbind, lst))
}

and we can obtain

> f0(v = 1:7, k = 3, m = 2)
      [,1] [,2] [,3]
 [1,]    1    3    5
 [2,]    1    3    6
 [3,]    1    3    7
 [4,]    1    4    6
 [5,]    1    4    7
 [6,]    2    4    6
 [7,]    2    4    7
 [8,]    1    5    7
 [9,]    2    5    7
[10,]    3    5    7

> f0(v = 1:10, k = 3, m = 2)
      [,1] [,2] [,3]
 [1,]    1    3    5
 [2,]    1    3    6
 [3,]    1    3    7
 [4,]    1    3    8
 [5,]    1    3    9
 [6,]    1    3   10
 [7,]    1    4    6
 [8,]    1    4    7
 [9,]    1    4    8
[10,]    1    4    9
[11,]    1    4   10
[12,]    2    4    6
[13,]    2    4    7
[14,]    2    4    8
[15,]    2    4    9
[16,]    2    4   10
[17,]    1    5    7
[18,]    1    5    8
[19,]    1    5    9
[20,]    1    5   10
[21,]    2    5    7
[22,]    2    5    8
[23,]    2    5    9
[24,]    2    5   10
[25,]    3    5    7
[26,]    3    5    8
[27,]    3    5    9
[28,]    3    5   10
[29,]    1    6    8
[30,]    1    6    9
[31,]    1    6   10
[32,]    2    6    8
[33,]    2    6    9
[34,]    2    6   10
[35,]    3    6    8
[36,]    3    6    9
[37,]    3    6   10
[38,]    4    6    8
[39,]    4    6    9
[40,]    4    6   10
[41,]    1    7    9
[42,]    1    7   10
[43,]    2    7    9
[44,]    2    7   10
[45,]    3    7    9
[46,]    3    7   10
[47,]    4    7    9
[48,]    4    7   10
[49,]    5    7    9
[50,]    5    7   10
[51,]    1    8   10
[52,]    2    8   10
[53,]    3    8   10
[54,]    4    8   10
[55,]    5    8   10
[56,]    6    8   10
  • For-loop approach

You can simply use nested for loops like below

f1 <- function(v, k, m) {
    res <- matrix(v)
    p <- v
    for (i in 1:(k - 1)) {
        q <- (p + m)[(p + m) %in% v]
        lst <- list()
        for (j in 1:nrow(res)) {
            s <- q[q - res[j, i] >= m]
            if (length(s) > 0) {
                lst <- append(
                    lst,
                    list(cbind(res[rep(j, each = length(s)), , drop = FALSE], s))
                )
            }
        }
        res <- unname(do.call(rbind, lst))
        p <- q
    }
    res
}

and you will obtain

> f1(v = 1:7, k = 3, m = 2)
      [,1] [,2] [,3]
 [1,]    1    3    5
 [2,]    1    3    6
 [3,]    1    3    7
 [4,]    1    4    6
 [5,]    1    4    7
 [6,]    2    4    6
 [7,]    2    4    7
 [8,]    1    5    7
 [9,]    2    5    7
[10,]    3    5    7

> f1(v = 1:10, k = 3, m = 2)
      [,1] [,2] [,3]
 [1,]    1    3    5
 [2,]    1    3    6
 [3,]    1    3    7
 [4,]    1    3    8
 [5,]    1    3    9
 [6,]    1    3   10
 [7,]    1    4    6
 [8,]    1    4    7
 [9,]    1    4    8
[10,]    1    4    9
[11,]    1    4   10
[12,]    2    4    6
[13,]    2    4    7
[14,]    2    4    8
[15,]    2    4    9
[16,]    2    4   10
[17,]    1    5    7
[18,]    1    5    8
[19,]    1    5    9
[20,]    1    5   10
[21,]    2    5    7
[22,]    2    5    8
[23,]    2    5    9
[24,]    2    5   10
[25,]    3    5    7
[26,]    3    5    8
[27,]    3    5    9
[28,]    3    5   10
[29,]    1    6    8
[30,]    1    6    9
[31,]    1    6   10
[32,]    2    6    8
[33,]    2    6    9
[34,]    2    6   10
[35,]    3    6    8
[36,]    3    6    9
[37,]    3    6   10
[38,]    4    6    8
[39,]    4    6    9
[40,]    4    6   10
[41,]    1    7    9
[42,]    1    7   10
[43,]    2    7    9
[44,]    2    7   10
[45,]    3    7    9
[46,]    3    7   10
[47,]    4    7    9
[48,]    4    7   10
[49,]    5    7    9
[50,]    5    7   10
[51,]    1    8   10
[52,]    2    8   10
[53,]    3    8   10
[54,]    4    8   10
[55,]    5    8   10
[56,]    6    8   10
  • Reduce approach

Another option is using Reduce, which iteratively increases the dimension of the resulting data.frame by adding eligible elements from v as a new column.

f2 <- function(v, k, m) {
    helper <- function(df, v) {
        u <- unique(df[[length(df)]])
        v <- (u + m)[(u + m) %in% v]
        grp <- split(df, df[length(df)])
        lst <- lapply(
            grp,
            \(x) {
                p <- v[v - x[[length(x)]][1] >= m]  # use the gap m, not a hard-coded 2
                if (length(p) > 0) {
                    cbind(x[rep(1:nrow(x), each = length(p)), ], p)
                }
            }
        )
        as.data.frame(`row.names<-`(unname(do.call(rbind, lst)), NULL))
    }
    Reduce(helper, rep(list(v), k - 1), init = as.data.frame(v))
}

and you will obtain

> f2(v = 1:7, k = 3, m = 2)
        
1  1 3 5
2  1 3 6
3  1 3 7
4  1 4 6
5  1 4 7
6  2 4 6
7  2 4 7
8  1 5 7
9  2 5 7
10 3 5 7

> f2(v = 1:10, k = 3, m = 2)

1  1 3  5
2  1 3  6
3  1 3  7
4  1 3  8
5  1 3  9
6  1 3 10
7  1 4  6
8  1 4  7
9  1 4  8
10 1 4  9
11 1 4 10
12 2 4  6
13 2 4  7
14 2 4  8
15 2 4  9
16 2 4 10
17 1 5  7
18 1 5  8
19 1 5  9
20 1 5 10
21 2 5  7
22 2 5  8
23 2 5  9
24 2 5 10
25 3 5  7
26 3 5  8
27 3 5  9
28 3 5 10
29 1 6  8
30 1 6  9
31 1 6 10
32 2 6  8
33 2 6  9
34 2 6 10
35 3 6  8
36 3 6  9
37 3 6 10
38 4 6  8
39 4 6  9
40 4 6 10
41 1 7  9
42 1 7 10
43 2 7  9
44 2 7 10
45 3 7  9
46 3 7 10
47 4 7  9
48 4 7 10
49 5 7  9
50 5 7 10
51 1 8 10
52 2 8 10
53 3 8 10
54 4 8 10
55 5 8 10
56 6 8 10

Benchmarking

The benchmark includes all existing answers to this question

library(microbenchmark)

v <- 1:20
k <- 4
m <- 2
microbenchmark(
    f_AC = f(v, k, m),    # Allan Cameron's solution
    f_TIC0 = f0(v, k, m), # ThomasIsCoding's solution 0
    f_TIC1 = f1(v, k, m), # ThomasIsCoding's solution 1
    f_TIC2 = f2(v, k, m), # ThomasIsCoding's solution 2
    times = 20L,
    unit = "relative"
)

and we see

Unit: relative
   expr       min       lq     mean    median        uq      max neval
   f_AC 82.677137 69.78411 43.01843 83.497758 81.922518 6.791794    20
 f_TIC0  1.000000  1.00000  1.00000  1.000000  1.000000 1.000000    20
 f_TIC1  7.731772  7.74679  4.75455  8.012004  7.592652 1.316215    20
 f_TIC2 19.764325 16.68923 11.65485 17.963988 24.911318 2.359821    20

For a pressure test with v <- 1:100, k <- 5, and m <- 2, the performance of the recursive solutions f (by @Allan Cameron) and f0 (by @ThomasIsCoding) is shown below

> system.time(
+     res <- f0(v = 1:100, k = 5, m = 2)
+ )
   user  system elapsed 
   3.33    1.99    5.39

> system.time(
+     res <- f(v = 1:100, k = 5, m = 2)
+ )
   user  system elapsed 
 146.70    4.17  157.02
Allan Cameron

A reasonable approach might be to build the vectors recursively with the given constraints:

f <- function(v, k, m, existing = numeric()) {
  new_vals <- min(v):(max(v) - ((k - 1) * m))
  updated <- lapply(new_vals, function(x) c(existing, x))
  if(k == 1) return(updated)
  purrr::list_flatten(lapply(updated, function(x) {
    if((max(x) + m) == max(v)) return(c(x, max(v)))
    if(max(x) + m > max(v)) return(x)
    f(v[v >= (max(x) + m)], k - 1, m, x)
  }))
}

This gives us the same list for the given start parameters:

f(v = 1:7, k = 3, m = 2)
#> [[1]]
#> [1] 1 3 5
#> 
#> [[2]]
#> [1] 1 3 6
#> 
#> [[3]]
#> [1] 1 3 7
#> 
#> [[4]]
#> [1] 1 4 6
#> 
#> [[5]]
#> [1] 1 4 7
#> 
#> [[6]]
#> [1] 1 5 7
#> 
#> [[7]]
#> [1] 2 4 6
#> 
#> [[8]]
#> [1] 2 4 7
#> 
#> [[9]]
#> [1] 2 5 7
#> 
#> [[10]]
#> [1] 3 5 7

And it correctly finds the only set with k = 4 and m = 3 in the vector 1:10:

f(v = 1:10, k = 4, m = 3)
#> [[1]]
#> [1]  1  4  7 10

Or the larger set with k = 3 and m = 2 in 1:10:

f(v = 1:10, k = 3, m = 2)
#> [[1]]
#> [1] 1 3 5
#> 
#> [[2]]
#> [1] 1 3 6
#> 
#> [[3]]
#> [1] 1 3 7
#> 
#> [[4]]
#> [1] 1 3 8
#> 
#> [[5]]
#> [1] 1 3 9
#> 
#> [[6]]
#> [1]  1  3 10
#> 
#> [[7]]
#> [1] 1 4 6
#> 
#> [[8]]
#> [1] 1 4 7
#> 
#> [[9]]
#> [1] 1 4 8
#> 
#> [[10]]
#> [1] 1 4 9
#> 
#> [[11]]
#> [1]  1  4 10
#> 
#> [[12]]
#> [1] 1 5 7
#> 
#> [[13]]
#> [1] 1 5 8
#> 
#> [[14]]
#> [1] 1 5 9
#> 
#> [[15]]
#> [1]  1  5 10
#> 
#> [[16]]
#> [1] 1 6 8
#> 
#> [[17]]
#> [1] 1 6 9
#> 
#> [[18]]
#> [1]  1  6 10
#> 
#> [[19]]
#> [1] 1 7 9
#> 
#> [[20]]
#> [1]  1  7 10
#> 
#> [[21]]
#> [1]  1  8 10
#> 
#> [[22]]
#> [1] 2 4 6
#> 
#> [[23]]
#> [1] 2 4 7
#> 
#> [[24]]
#> [1] 2 4 8
#> 
#> [[25]]
#> [1] 2 4 9
#> 
#> [[26]]
#> [1]  2  4 10
#> 
#> [[27]]
#> [1] 2 5 7
#> 
#> [[28]]
#> [1] 2 5 8
#> 
#> [[29]]
#> [1] 2 5 9
#> 
#> [[30]]
#> [1]  2  5 10
#> 
#> [[31]]
#> [1] 2 6 8
#> 
#> [[32]]
#> [1] 2 6 9
#> 
#> [[33]]
#> [1]  2  6 10
#> 
#> [[34]]
#> [1] 2 7 9
#> 
#> [[35]]
#> [1]  2  7 10
#> 
#> [[36]]
#> [1]  2  8 10
#> 
#> [[37]]
#> [1] 3 5 7
#> 
#> [[38]]
#> [1] 3 5 8
#> 
#> [[39]]
#> [1] 3 5 9
#> 
#> [[40]]
#> [1]  3  5 10
#> 
#> [[41]]
#> [1] 3 6 8
#> 
#> [[42]]
#> [1] 3 6 9
#> 
#> [[43]]
#> [1]  3  6 10
#> 
#> [[44]]
#> [1] 3 7 9
#> 
#> [[45]]
#> [1]  3  7 10
#> 
#> [[46]]
#> [1]  3  8 10
#> 
#> [[47]]
#> [1] 4 6 8
#> 
#> [[48]]
#> [1] 4 6 9
#> 
#> [[49]]
#> [1]  4  6 10
#> 
#> [[50]]
#> [1] 4 7 9
#> 
#> [[51]]
#> [1]  4  7 10
#> 
#> [[52]]
#> [1]  4  8 10
#> 
#> [[53]]
#> [1] 5 7 9
#> 
#> [[54]]
#> [1]  5  7 10
#> 
#> [[55]]
#> [1]  5  8 10
#> 
#> [[56]]
#> [1]  6  8 10

You will still run up against combinatorial explosion if you try to select large values of k in a large vector, but this solution is surprisingly fast: it can generate all ~152,000 length-3 vectors with m = 2 drawn from 1:100 in less than half a second. With k = 4 there are almost 3.5 million possible solutions, so it takes a few seconds.
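
For reference, these counts can be checked directly: subsets of 1:l with pairwise gaps of at least m correspond one-to-one to ordinary k-subsets of 1:(l - (k - 1) * (m - 1)) (subtract (i - 1) * (m - 1) from the i-th smallest element), so the totals are plain binomial coefficients:

choose(100 - (3 - 1) * (2 - 1), 3)  # 152096, the ~152,000 figure above
choose(100 - (4 - 1) * (2 - 1), 4)  # 3464840, the "almost 3.5 million"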

I wouldn't attempt k = 5 for a vector that large...

Created on 2023-08-24 with reprex v2.0.2

Joseph Wood

To add to the excellent answers from @AllanCameron and @ThomasIsCoding, here is a simple algorithm for obtaining the next lexicographic constrained combination:

next_combo <- function(idx, n, k, m, last, check) {

    # once the first index reaches its maximum, the final combination has
    # already been emitted, so signal that we are done
    if (idx[1L] == last) {
        return(NULL)
    }

    if (idx[k] < n) {
        # the last index still has room: just bump it by one
        idx[k] <- idx[k] + 1L
    } else {
        # otherwise find the right-most index below its maximum
        # (check + (i - 1) * m), increment it, and reset everything to its
        # right to the smallest values allowed by the gap m
        for (i in (k - 1L):1) {
            if (idx[i] != (check + (i - 1L) * m)) {
                idx[i:k] <- seq(
                    idx[i] + 1L, length.out = (k - i + 1L), by = m
                )

                break
            }
        }
    }

    return(idx)
}

And calling it, we have:

f3 <- function(v, k, m) {

    k <- as.integer(k)
    m <- as.integer(m)
    n <- length(v)

    count <- 1L
    res   <- list()
    idx   <- seq(1L, by = m, length.out = k)
    last  <- n - (m * (k - 1L))
    check <- n - ((k - 1L) * m)

    while (!is.null(idx)) {
        res[[count]] <- v[idx]
        count <- count + 1L
        idx <- next_combo(idx, n, k, m, last, check)
    }

    return(res)
}

f3(1:7, 3, 2)
#> [[1]]
#> [1] 1 3 5
#> 
#> [[2]]
#> [1] 1 3 6
#> 
#> [[3]]
#> [1] 1 3 7
#> 
#> [[4]]
#> [1] 1 4 6
#> 
#> [[5]]
#> [1] 1 4 7
#> 
#> [[6]]
#> [1] 1 5 7
#> 
#> [[7]]
#> [1] 2 4 6
#> 
#> [[8]]
#> [1] 2 4 7
#> 
#> [[9]]
#> [1] 2 5 7
#> 
#> [[10]]
#> [1] 3 5 7

It is decently efficient and could easily be translated to compiled code for massive speed gains:

library(microbenchmark)

v <- 1:20
k <- 4
m <- 2
microbenchmark(
    f_AC = f(v, k, m),    # Allan Cameron's solution
    f_TIC0 = f0(v, k, m), # ThomasIsCoding's solution 0
    f_TIC1 = f1(v, k, m), # ThomasIsCoding's solution 1
    f_TIC2 = f2(v, k, m), # ThomasIsCoding's solution 2
    f_JW   = f3(v, k, m), # Joseph Wood's solution
    times = 20L,
    unit = "relative"
)
#> Unit: relative
#>    expr       min        lq      mean   median        uq      max neval  cld
#>    f_AC 56.664749 60.773351 33.370015 58.73102 53.695930 4.228782    20 a
#>  f_TIC0  1.000000  1.000000  1.000000  1.00000  1.000000 1.000000    20  b
#>  f_TIC1  7.535737  7.888895  4.737836  7.70793  7.187051 1.215956    20   c
#>  f_TIC2 15.642373 14.742394  9.383656 14.71615 13.245660 2.253755    20    d
#>    f_JW 15.105741 16.196579  8.831067 15.68912 14.210744 1.066013    20    d