Assitance optimizing a nested loop?

76 Views Asked by At

I am just starting out learning to code, and I am working with the R programming language. I am working with a data frame that looks like this:

> head(dat)
  Chromosome Start_Position End_Position Match_Position Match_End_Position
1          1              1           25          11386              11410
2          1             26           50          11411              11435
3          1             51           75          10218              10242
4          1             51           75          10827              10851
5          1             51           75          11436              11460
6          1             76          100            686                710

I need to condense the contiguous matches (disregarding any singular 25 bp match). The matches must be contiguous according to the End_Position and Match_End_Position. Given the data shown above, the example output would look like this:

   Chromosome Start_Position End_Position Match_Position Match_End_Position
1           1              1           75          11386              11460

I have managed to perform this condensing using a nested loop:

res <- as.data.frame(matrix(NA, 0, 5))
colnames(res) <- colnames(dat)
curset <- c()

processing <- TRUE

i <- 1

while(processing){
  curset <- i
  hit <- which(dat$Start_Position == (dat$End_Position[i] + 1))
  hit <- hit[which(dat$Match_Position[hit] == dat$Match_End_Position[i] + 1)]

  if(length(hit) > 0){
    curset <- c(curset, hit)
    searching <- TRUE

    while(searching){
      cur.row <- hit
      hit <- which(dat$Start_Position == (dat$End_Position[cur.row] + 1))
      hit <- hit[which(dat$Match_Position[hit] == dat$Match_End_Position[cur.row] + 1)]
  
      if(length(hit) > 0){
        curset <- c(curset, hit)
      } else {

        searching <- FALSE
      }
    }
 }

  if(length(curset) > 1){
     res[nrow(res) + 1, ] <- c(dat$Chromosome[curset[1]], 
                              dat$Start_Position[curset[1]],
                              dat$End_Position[curset[length(curset)]],
                              dat$Match_Position[curset[1]],
                              dat$Match_End_Position[curset[length(curset)]])
  }

  i <- curset[length(curset)] + 1

  if(i >= nrow(dat)){
    processing <- FALSE
  }
}

However, I am working with large datasets, and this loop is pretty slow on larger data frames.I tried using a data table to be more efficient, but the results I got were not the same. Here is what I tried:

library(data.table)
dat <- as.data.table(dat)

dat[, group_id := .GRP, by = .(Chromosome, cumsum(Start_Position - shift(End_Position, fill =   Start_Position[1]) > 1))]

condensed <- dat[, .(Start_Position = min(Start_Position), 
                     End_Position = max(End_Position), 
                     Match_Position = min(Match_Position), 
                     Match_End_Position = max(Match_End_Position)), 
                 by = .(Chromosome, group_id)]

condensed[, group_id := NULL]
0

There are 0 best solutions below