I need to scrape the data from this URL, https://apps.fas.usda.gov/export-sales/corn.htm, but I haven't had any success:
library(dplyr)
library(tidyverse)
library(rvest)
# Download and parse the page once; the parsed document is reused below
# instead of fetching the same URL a second time.
url <- "https://apps.fas.usda.gov/export-sales/corn.htm"
page <- read_html(url)

# Select every <table> node in the document and convert the matches to a
# data frame. When no <table> node is matched, the list passed to
# as.data.frame() is empty and the result has 0 columns and 0 rows.
df <- page %>%
  html_nodes("table") %>%
  html_table(fill = TRUE) %>%
  as.data.frame()
>df
data frame with 0 columns and 0 rows
How can I solve this?
CodePudding user response:
Here is another approach that can be considered:
library(pagedown)
library(pdftools)
# Print the report page to a PDF file, then read the PDF's text back and
# break it into individual lines.
report_url <- "https://apps.fas.usda.gov/export-sales/corn.htm"
pdf_file <- "C:\\stackoverflow101.pdf"
chrome_print(input = report_url, output = pdf_file)

# Split the extracted text on Windows or Unix line endings and flatten the
# per-string pieces into one character vector.
pdf_strings <- pdftools::pdf_text(pdf_file)
text <- unlist(strsplit(pdf_strings, "\r\n|\n"))
text
[1] "CORN - UNMILLED MARKETING YEAR 09/01 - 08/31"
[2] " OUTSTANDING EXPORT SALES AND EXPORTS BY COUNTRY, REGION AND MARKETING YEAR"
[3] "1000 METRIC TONS AS OF September 8 2022"
[4] "--------------------------------------------------------------------------------"
[5] " : CURRENT MARKETING YEAR :NEXT MARKETING YEAR"
[6] " ---------------------------------------------------------"
[7] " :OUTSTANDING SALES:ACCUMULATED EXPORTS: OUTSTANDING SALES"
[8] " ---------------------------------------------------------"
[9] " DESTINATION :THIS WEEK: YR AGO:THIS WEEK: YR AGO :SECOND YR: THIRD YR"
[10] "--------------------------------------------------------------------------------"
[11] " :"
[12] "EUROPEAN UNION - 27 : 105.2 0.1 * * 0.0 0.0"
[13] " ITALY : 105.0 0.0 0.0 0.0 0.0 0.0"
[14] " U KING : 0.2 0.1 * * 0.0 0.0"
[15] " :"
[16] "JAPAN : 908.8 1686.5 * * 0.0 0.0"
[17] " :"
[18] "TAIWAN : 89.3 153.5 3.9 1.0 0.0 0.0"
[19] " :"
[20] "CHINA : 3223.4 11901.2 137.1 0.0 0.0 0.0"
[21] " :"
[22] "OTHER ASIA AND OCEANIA: 40.3 72.8 0.1 0.4 0.0 0.0"
[23] " HG KONG : 0.1 1.1 * 0.2 0.0 0.0"
[24] " KOR REP : 6.6 71.5 0.1 0.2 0.0 0.0"
[25] " MALAYSA : 1.0 0.0 0.0 0.0 0.0 0.0"
[26] " PHIL : 2.7 0.2 0.0 0.0 0.0 0.0"
[27] " QATAR : 0.0 0.0 * 0.0 0.0 0.0"
[28] " S ARAB : 30.0 0.0 0.0 0.0 0.0 0.0"
[29] " :"
[30] "WESTERN HEMISPHERE : 5975.7 7991.2 322.5 358.4 90.5 0.0"
[31] " BARBADO : 8.0 6.3 4.4 0.0 0.0 0.0"
[32] " C RICA : 214.3 349.8 0.0 0.0 0.0 0.0"
[33] " CANADA : 35.6 556.0 1.9 33.6 0.0 0.0"
[34] " COLOMB : 182.0 854.2 10.8 0.0 0.0 0.0"
[35] " DOM REP : 33.0 35.8 0.0 0.0 0.0 0.0"
[36] " GUATMAL : 291.1 433.7 0.0 0.0 0.0 0.0"
[37] " HAITI : 0.0 2.3 0.0 0.0 0.0 0.0"
[38] " HONDURA : 196.4 214.8 71.1 0.0 0.0 0.0"
[39] " JAMAICA : 53.6 82.9 2.8 0.0 0.0 0.0"
[40] " LW WW I : 1.5 4.7 0.9 0.0 0.0 0.0"
[41] " MEXICO : 4723.2 4929.2 190.8 317.6 90.5 0.0"
[42] " NICARAG : 0.0 128.7 18.9 0.0 0.0 0.0"
[43] " PANAMA : 93.3 245.5 5.6 0.0 0.0 0.0"
[44] " SALVADR : 129.4 120.0 15.4 7.2 0.0 0.0"
[45] " TRINID : 0.0 15.4 0.0 0.0 0.0 0.0"
[46] " VENEZ : 14.5 12.0 0.0 0.0 0.0 0.0"
[47] "--------------------------------------------------------------------------------"
[48] "TOTAL KNOWN : 10342.7 21805.3 463.6 359.9 90.5 0.0"
[49] "TOTAL UNKNOWN : 1494.9 2407.5 0.0 0.0 0.0 0.0"
[50] "--------------------------------------------------------------------------------"
[51] "TOTAL KNOWN & UNKNOWN : 11837.7 24212.8 463.6 359.9 90.5 0.0"
[52] "EXPORTS FOR OWN ACCT : - - 0.0 0.0 - -"
[53] "OPTIONAL ORIGIN : 0.0 170.0 - - 0.0 0.0"
[54] "--------------------------------------------------------------------------------"
CodePudding user response:
Here is an improved version of my previous answer:
library(pagedown)
library(pdftools)
library(stringr)
# Print the report page to a PDF file and read its text back, one element
# per line.
pdf_path <- "C:\\stackoverflow101.pdf"
chrome_print(input = "https://apps.fas.usda.gov/export-sales/corn.htm",
             output = pdf_path)
text <- pdftools::pdf_text(pdf_path)
text <- unlist(strsplit(text, "\r\n|\n"))

# A single cell value: a decimal number of any width, or the "*" / "-"
# placeholders the report uses in place of a number.
value <- "(\\d+\\.\\d+|\\*|-)"

# Keep only data rows, i.e. a label terminated by ":" that is followed by a
# value. Selecting rows by content rather than by position (first 10 lines,
# line 44) or by exact-match separator strings keeps the filter working when
# the header length, the number of destinations, or the whitespace padding
# of the blank ":" and dashed lines changes.
is_data_row <- stringr::str_detect(
  text,
  paste0("^[^:]*:[[:space:]]*", value)
)
text <- text[is_data_row]

# First alternative: the row label up to and including the colon.
# Second alternative: each value together with its leading whitespace.
# The integer part is matched with `\\d+`; the previous `\\d{1,4}` silently
# dropped the leading digit of values >= 10000 (e.g. 11901.2 -> 1901.2).
regex <- paste0("(.*:)|([[:space:]]*", value, ")")
list_Numbers <- stringr::str_extract_all(text, pattern = regex)

# Stack the per-row matches into a character matrix (label + values).
# NOTE(review): assumes every data row has the same number of value columns;
# rbind() recycles shorter rows with a warning otherwise.
mat_Numbers <- do.call("rbind", list_Numbers)
mat_Numbers
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] "EUROPEAN UNION - 27 :" " 105.2" " 0.1" " *" " *" " 0.0" " 0.0"
[2,] " ITALY :" " 105.0" " 0.0" " 0.0" " 0.0" " 0.0" " 0.0"
[3,] " U KING :" " 0.2" " 0.1" " *" " *" " 0.0" " 0.0"
[4,] "JAPAN :" " 908.8" " 1686.5" " *" " *" " 0.0" " 0.0"
[5,] "TAIWAN :" " 89.3" " 153.5" " 3.9" " 1.0" " 0.0" " 0.0"
[6,] "CHINA :" " 3223.4" "1901.2" " 137.1" " 0.0" " 0.0" " 0.0"
[7,] "OTHER ASIA AND OCEANIA:" " 40.3" " 72.8" " 0.1" " 0.4" " 0.0" " 0.0"
[8,] " HG KONG :" " 0.1" " 1.1" " *" " 0.2" " 0.0" " 0.0"
[9,] " KOR REP :" " 6.6" " 71.5" " 0.1" " 0.2" " 0.0" " 0.0"
[10,] " MALAYSA :" " 1.0" " 0.0" " 0.0" " 0.0" " 0.0" " 0.0"
[11,] " PHIL :" " 2.7" " 0.2" " 0.0" " 0.0" " 0.0" " 0.0"
[12,] " QATAR :" " 0.0" " 0.0" " *" " 0.0" " 0.0" " 0.0"
[13,] " S ARAB :" " 30.0" " 0.0" " 0.0" " 0.0" " 0.0" " 0.0"
[14,] "WESTERN HEMISPHERE :" " 5975.7" " 7991.2" " 322.5" " 358.4" " 90.5" " 0.0"
[15,] " BARBADO :" " 8.0" " 6.3" " 4.4" " 0.0" " 0.0" " 0.0"
[16,] " C RICA :" " 214.3" " 349.8" " 0.0" " 0.0" " 0.0" " 0.0"
[17,] " CANADA :" " 35.6" " 556.0" " 1.9" " 33.6" " 0.0" " 0.0"
[18,] " COLOMB :" " 182.0" " 854.2" " 10.8" " 0.0" " 0.0" " 0.0"
[19,] " DOM REP :" " 33.0" " 35.8" " 0.0" " 0.0" " 0.0" " 0.0"
[20,] " GUATMAL :" " 291.1" " 433.7" " 0.0" " 0.0" " 0.0" " 0.0"
[21,] " HAITI :" " 0.0" " 2.3" " 0.0" " 0.0" " 0.0" " 0.0"
[22,] " HONDURA :" " 196.4" " 214.8" " 71.1" " 0.0" " 0.0" " 0.0"
[23,] " JAMAICA :" " 53.6" " 82.9" " 2.8" " 0.0" " 0.0" " 0.0"
[24,] " LW WW I :" " 1.5" " 4.7" " 0.9" " 0.0" " 0.0" " 0.0"
[25,] " MEXICO :" " 4723.2" " 4929.2" " 190.8" " 317.6" " 90.5" " 0.0"
[26,] " NICARAG :" " 0.0" " 128.7" " 18.9" " 0.0" " 0.0" " 0.0"
[27,] " PANAMA :" " 93.3" " 245.5" " 5.6" " 0.0" " 0.0" " 0.0"
[28,] " SALVADR :" " 129.4" " 120.0" " 15.4" " 7.2" " 0.0" " 0.0"
[29,] " TRINID :" " 0.0" " 15.4" " 0.0" " 0.0" " 0.0" " 0.0"
[30,] " VENEZ :" " 14.5" " 12.0" " 0.0" " 0.0" " 0.0" " 0.0"
[31,] "TOTAL KNOWN :" "0342.7" "1805.3" " 463.6" " 359.9" " 90.5" " 0.0"
[32,] "TOTAL UNKNOWN :" " 1494.9" " 2407.5" " 0.0" " 0.0" " 0.0" " 0.0"
[33,] "TOTAL KNOWN & UNKNOWN :" "1837.7" "4212.8" " 463.6" " 359.9" " 90.5" " 0.0"
[34,] "EXPORTS FOR OWN ACCT :" " -" " -" " 0.0" " 0.0" " -" " -"
[35,] "OPTIONAL ORIGIN :" " 0.0" " 170.0" " -" " -" " 0.0" " 0.0"