Raster merge using doParallel creates temp file for each merge step

319 Views Asked by At

I am working with the raster and glcm packages to compute Haralick texture features on satellite imagery. I have successfully run the glcm() function using a single core but am working on running it in parallel. Here is the code I'm using:

# tiles is a list of raster extents, r is a raster

registerDoParallel(7)

out_raster = foreach(i=1:length(tiles),.combine = merge,.packages=c("raster","glcm")) %dopar% 

    glcm(crop(r,tiles[[i]]), n_grey=16, window=c(17,17), shift = c(1,1),
             min_x = rmin, max_x = rmax)

When I examine the temp files that are created, it appears each step of the merge creates a temp file, which takes a lot of hard drive space. Here is the overall image (2GB):

Full raster

and here are two of the temp files: Merge Step 1 Merge Step 2

Because the glcm function output for each tile is 3 GB, creating a temp file for each stepwise merge operation creates ~160GB of temp raster files. Is there a more space efficient way to run this in parallel?

1

There are 1 best solutions below

0
Andy On

I managed to save hard drive space by using gdal and building vrts. Below is the code I wrote running on the example data from the glcm package. The steps were 1: Create vrt files of the tiles; 2) Run the glcm function in parallel on each vrt tile (see glcm_parallel function); 3) Merge the tiles into a vrt and write the output raster using gdal warp. The vrt files are very small and the only temp files are just those created by the glcm function. This should help a lot with large rasters.

#Load Packages
library(raster)
library(sf)
library(rgdal)
library(glcm)
library(doParallel)
library(gdalUtils)

#Source helper functions
source("./tilebuild_buff.R")
source("./glcm_parallel.R")

#Read raster - example data from glcm package (saved to disk)
rasterfile = "./L5TSR_1986.tif"
r = raster("L5TSR_1986.tif")

#Create tiles directory if it doesn't exist and clear files if it exists
if(!file.exists("./tiles")){dir.create("./tiles")}
file.remove(list.files("./tiles/",full.names=T))

#Calculate tiles for parallel processing - returns x and y offsets, and widths
#to use with gdal_translate
jobs_buff = tilebuild_buff(r,nx=5,ny=2,buffer=c(5,5))

#Create vrt files for buffered tiles
for (i in 1:length(jobs_buff[,1])){
  fin = rasterfile
  fout = paste0("./tiles/t_",i,'.vrt')
  ex = as.numeric(jobs_buff[i,])
  gdal_utils('translate',fin,fout,options = c('-srcwin',ex,'-of','vrt'))
}

#Read in vrt files of raster tiles and set the nodata value
input.rasters = lapply(paste0("./tiles/", list.files("./tiles/",pattern="\\.vrt$")), raster)

for(i in 1:length(input.rasters)){ NAvalue(input.rasters[[i]])= -3.4E38 }

#Create a directory for temporary raster grids and clear files
tempdir = "./rastertemp/"
if(!file.exists(tempdir)){dir.create(tempdir)}
file.remove(list.files(tempdir,full.names=T))


registerDoParallel(6) 

#Determine min and max values over original raster
rmin = cellStats(r,'min')
rmax = cellStats(r,'max')

#Run glcm function in parallel
glcm_split = foreach(i=1:length(jobs_buff[,1]),.packages=c("raster","glcm")) %dopar%
  glcm_parallel(inlist = input.rasters,temp=tempdir,window=c(3,3),n_grey=16,
                min_x=rmin,max_x=rmax)

#Get list of temp raster files created by glcm function
temps = paste0(tempdir,list.files(tempdir,pattern="\\.grd$"))

#trim off buffer (from tilebuild_buff function) and create mosaic raster
mosaic_rasters(temps,dst_dataset = "./mosaic.vrt", trim_margins = 5, srcnodata=-3.4E38,overwrite=T)

#write output tif
vrt_mosaic = "./mosaic.vrt"
outtif = "./final_merged.tif"
gdalwarp(vrt_mosaic,outtif,overwrite=T,verbose=T)

The two helper functions are here:

glcm_parallel <- function(inlist, temp, n_grey=16, window=c(11,11), shift=c(1,1), min_x=NULL, max_x=NULL){

  require(glcm)
  #todisk option required if output rasters are small enough to fit in memory
  rasterOptions(tmpdir = temp, todisk=T, maxmemory = 1E8)

  ## run glcm over tile
  r_glcm=glcm(inlist[[i]], n_grey = n_grey, window = window, shift=shift, min_x=min_x, max_x=max_x, na_opt = 'any')

}

and here:

tilebuild_buff <- function(r, nx=5, ny=2, buffer=c(0,0)){

round_xw = floor(ncol(r)/nx)
xsize = c(rep(round_xw,nx-1), round_xw + ncol(r)%%nx)
xoff = c(0,cumsum(rep(round_xw,nx-1)))

round_yh = floor(nrow(r)/ny)
ysize = c(rep(round_yh,ny-1), round_yh + nrow(r)%%ny)
yoff = c(0,cumsum(rep(round_yh,ny-1)))

pix_widths = expand.grid(xsize = xsize ,ysize = ysize)
offsets = expand.grid(xoff = xoff,yoff = yoff)

srcwins = cbind(offsets,pix_widths)

srcwins_buff = srcwins

#Add buffer
srcwins_buff$xoff = srcwins$xoff - buffer[1]
srcwins_buff$yoff = srcwins$yoff - buffer[2]
srcwins_buff$xsize = srcwins$xsize + 2*buffer[1]
srcwins_buff$ysize = srcwins$ysize + 2*buffer[2]

return(srcwins_buff)

}