I would like to download multiple files (around 2000) from this URL: https://www.star.nesdis.noaa.gov/pub/corp/scsb/wguo/data/Blended_VH_4km/geo_TIFF/
However, to save time and disk space, I would like to download only the files whose names contain VCI.tif, and only for the years 1981 to 2011.
I used wget in bash but could not find a way to select just those files. In addition, the full download is huge (more than 140 GB).
Thank you!
CodePudding user response:
I'm sure this can be optimised, but give this base R solution a try (note: it is not fully tested because of the size of the files):
# Store the parent url of files we want to download:
# base_url => character scalar
base_url <- "https://www.star.nesdis.noaa.gov/pub/corp/scsb/wguo/data/Blended_VH_4km/geo_TIFF/"
# Read in the html: html_string => character vector
html_string <- readLines(base_url)
# Store the range of years to search for,
# flattened into a regex pattern:
# rng => character scalar
rng <- paste0(seq(1981, 2011, by = 1), collapse = "|")
# Create a vector of urls requiring download: pull the href
# targets ending in "VCI.tif" out of the html, keep only those
# containing a year in range, and prefix the base url:
# input_urls => character vector
file_names <- unlist(
  regmatches(
    html_string,
    gregexpr('href="[^"]*VCI\\.tif"', html_string)
  )
)
file_names <- gsub('^href="|"$', "", file_names)
input_urls <- paste0(base_url, file_names[grepl(rng, file_names)])
# Store the desired output folder here:
# dir_path => character scalar
dir_path <- file.path(getwd(), "geo_tifs")
# Function to create a directory if it doesn't already
# exist: r_mkdir => function()
r_mkdir <- function(dir_path){
  if(!dir.exists(dir_path)){
    dir.create(dir_path)
  }
}
# Create the directory if it doesn't already exist:
# directory => file system
r_mkdir(dir_path)
# Generate a character vector of output file paths:
# output_file_paths => character vector
output_file_paths <- vapply(
  strsplit(input_urls, "geo_TIFF"),
  function(x){
    paste0(dir_path, x[2])
  },
  character(1)
)
# Download the files to the output paths; mode = "wb" keeps the
# binary .tif files from being corrupted on Windows:
# .tif files => file system
lapply(
  seq_along(output_file_paths),
  function(i){
    download.file(
      input_urls[i],
      output_file_paths[i],
      mode = "wb",
      quiet = TRUE,
      cacheOK = FALSE
    )
  }
)
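With roughly 2000 files a run can easily be interrupted, so it may also be worth making the loop restartable. A minimal sketch, reusing the input_urls and output_file_paths vectors built above: raise download.file()'s timeout (the default is 60 seconds) and skip files that are already on disk:
# Allow more than the default 60 seconds per file:
options(timeout = 600)
# Restart-friendly loop: only fetch files not already on disk.
for (i in seq_along(output_file_paths)) {
  if (!file.exists(output_file_paths[i])) {
    download.file(
      input_urls[i],
      output_file_paths[i],
      mode = "wb",
      quiet = TRUE
    )
  }
}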
CodePudding user response:
The following uses wget, and it works at least with the first 2 files; I have only tested the download of a (very) small subset of the wanted files.
suppressPackageStartupMessages({
  library(httr)
  library(rvest)
  library(dplyr)
  library(stringr)
})
# big files need greater timeout values,
# since I'm using wget this is probably
# unnecessary
old_timeout <- options(timeout = 300)
getOption("timeout")
year_start <- 1981
year_end <- 2011
download_dir <- "~/Temp"
# wget argument template: -P <download dir> <url>;
# the empty third element is filled in with each file's url below
wget_cmd_line <- c("-P", download_dir, "")
link <- "https://www.star.nesdis.noaa.gov/pub/corp/scsb/wguo/data/Blended_VH_4km/geo_TIFF/"
page <- read_html(link)
files_urls <- page %>%
  html_elements("a") %>%
  html_attr("href")
wanted_urls <- files_urls %>%
  str_extract(pattern = "^.*\\.VCI\\.tif$") %>%
  na.omit() %>%
  data.frame(filename = .) %>%
  mutate(year = str_extract(filename, "\\d{7}"),
         year = str_extract(year, "^\\d{4}"),
         year = as.integer(year)) %>%
  filter(year >= year_start & year <= year_end)
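Before starting the full download it may be worth a quick check that the filter picked up what you expect, for example:
# quick sanity check on the selection (nothing is downloaded yet)
nrow(wanted_urls)          # number of files that would be fetched
range(wanted_urls$year)    # should stay within 1981-2011
head(wanted_urls$filename)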
wanted_urls %>%
  #
  # to test the code I only download 2 files;
  # comment out this instruction to download all of them
  head(n = 2) %>%
  #
  pull(filename) %>%
  lapply(\(x) {
    # -P already points at the download directory, so only the
    # url (third element) changes from file to file
    wget_cmd <- wget_cmd_line
    wget_cmd[3] <- paste0(link, x)
    system2("wget", args = wget_cmd, stdout = TRUE, stderr = TRUE)
  })
# put saved value back
options(old_timeout)
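If installing wget is not convenient, an all-R alternative is sketched below; it assumes a recent version of the curl package (one that provides multi_download()) and reuses link, download_dir and wanted_urls from above. multi_download() fetches the URLs concurrently and can resume partial downloads:
library(curl)
urls  <- paste0(link, wanted_urls$filename)
dests <- path.expand(file.path(download_dir, wanted_urls$filename))
# concurrent, resumable downloads of the whole selection
multi_download(urls, destfiles = dests, resume = TRUE)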