Download specific files from URL in R


I would like to download multiple files (around 2000) from this URL: https://www.star.nesdis.noaa.gov/pub/corp/scsb/wguo/data/Blended_VH_4km/geo_TIFF/

However, to save time and disk space, I would like to download only the files whose names contain VCI.tif, and only for the years 1981-2011.

I used wget in bash but could not find a way to select only the files I want. Additionally, downloading everything consumes a huge amount of space (more than 140 GB).

Thank you!

CodePudding user response:

I'm sure this can be optimised, but give this base R solution a try (note: it is not fully tested due to the size of the files):

# Store the parent url of files we want to download:
# base_url => character scalar
base_url <- "https://www.star.nesdis.noaa.gov/pub/corp/scsb/wguo/data/Blended_VH_4km/geo_TIFF/"

# Read in the html: html_string => character vector
html_string <- readLines(
  base_url
)

# Store the range of years to search for, flattened
# into a regex pattern anchored on the "P<yyyy><ddd>"
# date token in the file names (e.g. ".P1981035"), so
# that a day-of-year run such as "2012001" cannot
# falsely match the year "2001":
# rng => character scalar
rng <- paste0(
  "\\.P(",
  paste0(
    seq(
      1981, 
      2011,
      by = 1
    ),
    collapse = "|"
  ),
  ")"
)
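
A quick spot check shows the pattern behaving as intended; the two file names below are illustrative of the site's naming scheme, not downloaded files:

# Spot-check the year filter on sample names:
grepl(rng, "VHP.G04.C07.NC.P1981035.VH.VCI.tif") # TRUE:  1981
grepl(rng, "VHP.G04.C07.NC.P2012005.VH.VCI.tif") # FALSE: 2012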

# Create a vector of urls requiring download: pull every
# href value ending in "VCI.tif" out of the listing, then
# keep only the years matched by rng:
# input_urls => character vector
hrefs <- unlist(
  regmatches(
    html_string,
    gregexpr(
      'href="[^"]*VCI\\.tif"',
      html_string
    )
  )
)

# Strip the href= wrapper, leaving bare file names:
hrefs <- gsub('^href="|"$', "", hrefs)

input_urls <- paste0(
  base_url,
  Filter(
    function(y){
      grepl(
        rng, 
        y
      )
    },
    hrefs
  )
)
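
Before writing anything to disk, it is worth a quick look at what was scraped (the counts depend on the live listing, so no output is shown here):

# Quick sanity check on the scraped urls:
length(input_urls)
head(input_urls, 3)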

# Store the desired output folder here: 
# dir_path => character scalar
dir_path <- paste0(
  getwd(),
  "/geo_tifs"
)

# Function to create a directory if it doesn't already
# exist: r_mkdir => function() 
r_mkdir <- function(dir_path){
  if(dir.exists(dir_path)){
    invisible()
  }else{
    dir.create(dir_path)
  }
}

# Create the directory if it doesn't already exist: 
# directory => stdout(file system)
r_mkdir(dir_path)
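
Equivalently, dir.create() can handle the existing-directory case on its own; showWarnings = FALSE merely silences the warning it emits when the directory is already there:

# Same effect as r_mkdir() in one call:
dir.create(dir_path, showWarnings = FALSE)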

# Generate a character vector of output file paths: 
# output_file_paths => character vector
output_file_paths <- vapply(
  strsplit(input_urls, "geo_TIFF"),
  function(x){
    paste0(dir_path, x[2])
  },
  character(1)
)

# Download the files to the output paths: 
# .tif files => stdout(file path)
lapply(
  seq_along(output_file_paths),
  function(i){
    download.file(
      input_urls[i],
      output_file_paths[i],
      mode = "wb", # binary mode matters for .tif on Windows
      quiet = TRUE,
      cacheOK = FALSE
    )
  }
)
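
With roughly 2000 files, a single failed request will abort the whole lapply() above. A hedged variant (same names as above; the failure bookkeeping is my own addition, not part of the answer) wraps each call in tryCatch() so the batch carries on and failures can be retried:

# Sketch: tolerate individual failures so one bad request
# does not abort the whole batch:
failed <- character(0)
for (i in seq_along(output_file_paths)) {
  res <- tryCatch(
    download.file(
      input_urls[i],
      output_file_paths[i],
      mode = "wb",
      quiet = TRUE,
      cacheOK = FALSE
    ),
    error = function(e) 1L
  )
  # download.file() returns 0 on success:
  if (!identical(res, 0L)) failed <- c(failed, input_urls[i])
}
failed # urls to retry, if any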

CodePudding user response:

The following uses wget, and it works at least for the first two files; I have only tested the download of a (very) small subset of the wanted files.

suppressPackageStartupMessages({
  library(httr)
  library(rvest)
  library(dplyr)
  library(stringr)
})

# big files need greater timeout values;
# since I'm using wget this is probably
# unnecessary
old_timeout <- options(timeout = 300)
getOption("timeout")

year_start <- 1981
year_end <- 2011
download_dir <- "~/Temp"
wget_cmd_line <- c("-P", download_dir, "")

link <- "https://www.star.nesdis.noaa.gov/pub/corp/scsb/wguo/data/Blended_VH_4km/geo_TIFF/"
page <- read_html(link)

files_urls <- page %>%
  html_elements("a") %>%
  html_attr("href")
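
A quick glance at the raw href list can help here: a directory index typically also exposes parent-directory and sort links, which is what the filtering below discards (output depends on the live page):

length(files_urls)
head(files_urls)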

wanted_urls <- files_urls %>%
  str_extract(pattern = "^.*\\.VCI\\.tif$") %>%
  na.omit() %>%
  data.frame(filename = .) %>% 
  mutate(year = str_extract(filename, "\\d{7}"),
         year = str_extract(year, "^\\d{4}"),
         year = as.integer(year)) %>%
  filter(year >= year_start & year <= year_end)
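
And a quick sanity check on the selection before starting the transfer (values depend on the live listing):

nrow(wanted_urls)       # number of files selected
range(wanted_urls$year) # should lie within 1981..2011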

wanted_urls %>%
  #
  # to test the code I only download 2 files;
  # comment out this instruction to download all of them
  head(n = 2) %>%
  #
  pull(filename) %>%
  lapply(\(x) {
    # -P already points wget at the download directory; appending
    # the file name to that prefix would create one subdirectory
    # per file, so only the url slot needs filling in:
    wget_cmd <- wget_cmd_line
    wget_cmd[3] <- paste0(link, x)
    system2("wget", args = wget_cmd, stdout = TRUE, stderr = TRUE)
  })

# put saved value back
options(old_timeout)
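
Finally, wget is not available on every system (a stock Windows box, for instance). As a hedged alternative sketch, assuming the curl package (>= 5.0, which introduced multi_download()) is installed, the same set can be fetched concurrently with resume support, reusing link, wanted_urls and download_dir from above:

# Sketch: concurrent downloads via the curl package
# (assumes curl >= 5.0 for multi_download()):
library(curl)

urls  <- paste0(link, wanted_urls$filename)
dests <- file.path(path.expand(download_dir), wanted_urls$filename)

# multi_download() returns one row per file; resume = TRUE
# lets an interrupted batch be rerun without re-fetching
# files that already completed:
results <- multi_download(urls, dests, resume = TRUE)
table(results$success)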