I have a data frame containing intervals coming from different sources (A, B, C). I would like to compute the pairwise overlap percentage between the intervals of each pair of sources. I wrote the following code. I was wondering if there is an easier/faster way to do that (my data are bigger than this example) and a way to make it more adaptable if I have a fourth source to include.
# Example intervals: one Start/Stop column pair per source (A, B, C).
# NA in a Start/Stop pair means that source has no interval on that row.
data <- data.frame(
  StartA = c(134000, 765888, 243634, 576098, 398776, 128598, NA),
  StopA = c(181654, 842465, 244377, 582626, 399102, 129893, NA),
  StartB = c(134023, 765880, 243634, 576098, NA, 128598, 849356),
  StopB = c(181654, 842465, 244352, 582626, NA, 129893, 868654),
  StartC = c(132065, NA, NA, 592626, 398776, 128698, 867656),
  StopC = c(191604, NA, NA, 593391, 399102, 129993, 868654)
)
# Print the data frame.
data
StartA StopA StartB StopB StartC StopC
1 134000 181654 134023 181654 132065 191604
2 765888 842465 765880 842465 NA NA
3 243634 244377 243634 244352 NA NA
4 576098 582626 576098 582626 592626 593391
5 398776 399102 NA NA 398776 399102
6 128598 129893 128598 129893 128698 129993
7 NA NA 849356 868654 867656 868654
# Reciprocal overlap (%) for every pair of sources, one value per row
#####################################################################

#' Reciprocal overlap percentage between two sets of intervals.
#'
#' For each position, the length of the intersection of the two intervals is
#' expressed as a percentage of each interval's own length, and the smaller
#' of the two percentages is returned. Intervals that do not intersect give 0.
#'
#' @param start1,stop1 Numeric vectors with the bounds of the first intervals.
#' @param start2,stop2 Numeric vectors with the bounds of the second intervals.
#' @return A numeric vector of percentages; `NA` wherever any of the four
#'   bounds is `NA`.
reciprocal_overlap <- function(start1, stop1, start2, stop2) {
  # pmin()/pmax() propagate NA, so rows with a missing bound come out as NA
  # without an explicit is.na() guard.
  shared <- pmax(0, pmin(stop1, stop2) - pmax(start1, start2))
  pmin(
    shared / (stop1 - start1) * 100,
    shared / (stop2 - start2) * 100
  )
}

# Sources are discovered from the column names: every "Start<X>" column that
# has a matching "Stop<X>" column defines a source <X>. Adding a fourth source
# only requires adding its two columns to `data`.
sources <- sub("^Start", "", grep("^Start", names(data), value = TRUE))
sources <- sources[paste0("Stop", sources) %in% names(data)]

if (length(sources) >= 2L) {
  pairs <- utils::combn(length(sources), 2L)
  # Order the pairs by the distance between the two sources, then by the
  # first source, so that three sources yield the columns AB, BC, AC.
  pairs <- pairs[, order(pairs[2L, ] - pairs[1L, ], pairs[1L, ]), drop = FALSE]

  for (k in seq_len(ncol(pairs))) {
    first <- sources[pairs[1L, k]]
    second <- sources[pairs[2L, k]]
    data[[paste0("overlap_reciproq_", first, second)]] <- reciprocal_overlap(
      data[[paste0("Start", first)]],
      data[[paste0("Stop", first)]],
      data[[paste0("Start", second)]],
      data[[paste0("Stop", second)]]
    )
  }
}
# Output
#############################
StartA StopA StartB StopB StartC StopC overlap_reciproq_AB overlap_reciproq_BC overlap_reciproq_AC
1 134000 181654 134023 181654 132065 191604 99.95174 79.99966 79.99966
2 765888 842465 765880 842465 NA NA 99.98955 NA NA
3 243634 244377 243634 244352 NA NA 96.63526 NA NA
4 576098 582626 576098 582626 592626 593391 100.00000 0.00000 0.00000
5 398776 399102 NA NA 398776 399102 NA NA 100.00000
6 128598 129893 128598 129893 128698 129993 100.00000 92.27799 92.27799
7 NA NA 849356 868654 867656 868654 NA 5.17152 NA
One approach which should scale to larger data sets and more sources uses
`foverlaps()` from the `data.table` package. Note that the overlap columns are ordered alphabetically in the final output.