In my toy/sample code below, I make a grid covering the world, and then use this grid to split a large, complicated worldwide dataset into one file per grid cell. In my real work this is a bottleneck and takes a long time. I would appreciate some thoughts and ideas on optimising it. I have had some success by leveraging parallel processing, but I also think it could be done 'smarter'.
library("sf")
library("terra")
library("glue")
library("rnaturalearth")
library("tidyverse")
# Scratch directory for every file this script writes. showWarnings = FALSE
# keeps a re-run quiet when the directory already exists.
dir.create("tmp", showWarnings = FALSE)
# Location of the ogr2ogr executable inside the QGIS install
# (machine-specific; adjust as needed).
ogr2ogr_path <- "C://Program Files//QGIS 3.26.1//bin//ogr2ogr.exe"
## Make some grid cells
# Build the grid step by step: a one-cell raster spanning the whole globe
# supplies the bounding box, which is then cut into cells of size 10
# (CRS units of EPSG:4326) and converted to an sf object.
world_grid <- local({
  whole_world <- rast(
    nrows = 1, ncols = 1,
    xmin = -180, xmax = 180,
    ymin = -90, ymax = 90,
    crs = "epsg:4326"
  )
  world_box <- st_as_sfc(st_bbox(whole_world))
  st_as_sf(st_make_grid(world_box, cellsize = 10))
})
# Write a single grid cell to its own GeoPackage under tmp/.
#   grid_id     row index of the cell within `world_grid`
#   world_grid  object holding the grid cells, one per row
# Returns the path of the file that was written.
make_grid_cells <- function(grid_id, world_grid) {
  cell_path <- glue("tmp/polygon_{grid_id}.gpkg")
  cell <- world_grid[grid_id, ]
  # append = FALSE replaces an existing layer instead of adding to it.
  st_write(obj = cell, dsn = cell_path, append = FALSE, quiet = TRUE)
  cell_path
}
# Write every grid cell to disk and keep the resulting file paths.
# seq_len() (rather than 1:nrow()) yields an empty sequence for an empty grid
# instead of the indices 1 and 0.
grid_cell <- lapply(
  seq_len(nrow(world_grid)),
  make_grid_cells,
  world_grid = world_grid
)
## Get some sample data
# Download the Natural Earth country polygons, keep only the ISO code column
# and store them as the worldwide dataset to be tiled.
# append = FALSE replaces an existing layer, so re-running the script does not
# stop with an error because the file is already there (same choice as in
# make_grid_cells()).
ne_countries(type = "countries", scale = "large", returnclass = "sf") %>%
  select(iso_a2) %>%
  st_write("tmp/world_polygons.gpkg", append = FALSE)
## Split the worldwide data into tiles
# Clip the worldwide dataset to the bounding box of one grid tile by calling
# the external ogr2ogr program.
#   tile_template_area  path to a tile file (as written by make_grid_cells())
#   worldwide_data      path to the dataset to be clipped
#   ogr2ogr             path to the ogr2ogr executable; defaults to the
#                       script-level `ogr2ogr_path`
# The output path is the tile path with every "polygon" replaced by
# "worldwide_poly". Returns ogr2ogr's exit status invisibly and warns when it
# is non-zero.
split_world_to_tiles <- function(tile_template_area, worldwide_data,
                                 ogr2ogr = ogr2ogr_path) {
  output_name <- gsub(
    "polygon", "worldwide_poly", tile_template_area,
    fixed = TRUE
  )
  # The tile is only read to obtain its extent for -spat.
  box <- st_bbox(st_read(tile_template_area, quiet = TRUE))
  # shQuote() protects the executable and both file paths against spaces.
  command <- glue(
    "{shQuote(ogr2ogr)} ",
    "-spat {box$xmin} {box$ymin} {box$xmax} {box$ymax} ",
    "-clipsrc spat_extent -f GPKG ",
    "{shQuote(output_name)} {shQuote(worldwide_data)} ",
    "-nlt GEOMETRYCOLLECTION"
  )
  status <- system(command)
  if (status != 0L) {
    warning(
      "ogr2ogr exited with status ", status, " for ", tile_template_area,
      call. = FALSE
    )
  }
  invisible(status)
}
# Clip the sample dataset once per tile file, collecting one result per tile.
split_worldwide_data <- lapply(
  grid_cell,
  function(tile_path) {
    split_world_to_tiles(
      tile_path,
      worldwide_data = "tmp/world_polygons.gpkg"
    )
  }
)
CodePudding user response:
It seems that you can get a lot more mileage if you use a different file format.
library(terra)
# Start from an empty scratch directory: create it if needed (quietly), then
# delete whatever files a previous run left behind. The result of
# file.remove() is assigned only to keep it from being printed.
dir.create("tmp", showWarnings = FALSE, recursive = FALSE)
d <- file.remove(list.files("tmp", full.names = TRUE))
# Convert a resolution-10 raster (all other settings left at terra's
# defaults) to polygons.
wrldgrid <- as.polygons(rast(res = 10))
# Write each row of `wgrid` to its own vector file under tmp/.
#   wgrid      object supporting nrow() and row subsetting (a terra
#              SpatVector in this script)
#   format     file extension, including the dot, appended to each path
#   overwrite  passed on to writeVector(); set TRUE to replace files left by
#              an earlier run (the default FALSE keeps the previous behaviour)
# Returns the vector of output paths invisibly.
write_cells <- function(wgrid, format = ".gpkg", overwrite = FALSE) {
  # sprintf() over seq_len() gives zero paths for zero rows; paste0() over
  # 1:nr would invent paths for the indices 1 and 0.
  outf <- sprintf("tmp/polygon_%d%s", seq_len(nrow(wgrid)), format)
  for (i in seq_along(outf)) {
    writeVector(wgrid[i, ], outf[i], overwrite = overwrite)
  }
  invisible(outf)
}
# Time writing every cell as a GeoPackage (the default format).
system.time({
  f <- write_cells(wrldgrid)
})
#    user  system elapsed
#    5.03   13.89   24.70

# Same cells, written as shapefiles instead.
system.time({
  f <- write_cells(wrldgrid, format = ".shp")
})
#    user  system elapsed
#    1.97    3.15    9.86
If you are going to use these files in R, you might as well save them to ".rds".
# Serialise each row of `wgrid` to its own .rds file under tmp/.
#   wgrid  object supporting nrow() and row subsetting (a terra SpatVector in
#          this script)
# Returns the vector of output paths invisibly.
write_rds <- function(wgrid) {
  # sprintf() over seq_len() gives zero paths for zero rows; paste0() over
  # 1:nr would invent paths for the indices 1 and 0.
  outf <- sprintf("tmp/polygon_%d.rds", seq_len(nrow(wgrid)))
  for (i in seq_along(outf)) {
    saveRDS(wgrid[i, ], outf[i])
  }
  invisible(outf)
}
# Time serialising every cell to .rds.
system.time({
  f <- write_rds(wrldgrid)
})
#    user  system elapsed
#    1.71    0.40    2.31