I've been given a pile of about 100 HTML files that I want to put into rectangular form. Here's an example: http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/abergsson-anna.html. I would like to extract the headings (h3) as column names and the content between them as strings, one row per document.
I've managed to extract the column names with rvest in R, but I'm stuck at extracting the content, and I'm sure I'll get stuck again when trying to bind everything together into one data frame.
This is what I've done to extract the variable names:

library(rvest)
library(purrr)

variable.names <- map(LIST.html, ~ read_html(.x) %>%
  html_nodes("h3") %>%
  html_text(trim = TRUE) %>%
  tolower())
Here's the code I used to get all the files:

system("wget -r -np -nH --cut-dirs=3 -R index.html http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/")
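With those flags the mirrored pages should end up under a local ffo/kapital/ directory (an assumption based on -nH --cut-dirs=3; adjust to wherever wget was actually run), so the LIST.html vector used above can be built with:

LIST.html <- list.files("ffo/kapital", pattern = "\\.html$", full.names = TRUE)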
CodePudding user response:
library(rvest)
library(stringr)
library(data.table)
malformed_documents <- character(0)  # collects the URLs of pages the parser skips
parse_profile_page <- function(pg, nm) {
  # extract section divs, omit the byline div at the top
  divs <- html_nodes(pg, '#mittvagn > div')[ -1 ]

  # drop the div with id 'bakvagn', if present
  idx <- which(html_attr(divs, 'id') == 'bakvagn')
  if (length(idx) > 0) {
    divs <- divs[ -idx ]
  }

  # extract section headers
  titles <- html_nodes(divs, 'h3') |>
    html_text(trim = TRUE) |>
    tolower()

  # extract section contents: all <p> text per div, collapsed into one string
  paragraphs <- lapply(divs, html_nodes, 'p') |>
    lapply(html_text, trim = TRUE) |>
    lapply(paste0, collapse = '\n') |>
    lapply(str_squish) |>
    unlist()

  # headers and contents must pair up one-to-one; otherwise skip the document
  if (length(paragraphs) != length(titles)) {
    message(sprintf('%s is malformed, not parsing', nm))
    malformed_documents <<- c(malformed_documents, nm)
    return(data.frame())
  }

  data.frame(title = titles,
             contents = paragraphs,
             url = rep(nm, length(paragraphs)))
}
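# Optional sanity check: run the parser on a single page first
# (URL taken from the question's example; uncomment to try it):
# example_pg <- read_html('http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/abergsson-anna.html')
# parse_profile_page(example_pg, 'abergsson-anna.html')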
# obtain list of files to download
url <- 'http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/'
pg <- read_html(url)
file_urls <- html_nodes(pg, '#mittvagn > ol > li > a:nth-child(1)') |>
  html_attr('href') |>
  str_replace_all('^\\.\\./', '') |>
  sprintf(fmt = 'http://www.skeptron.uu.se/broady/arkiv/a/ffo/%s')
# file_urls <- sample(file_urls, 10) # uncomment to run on a small sample of pages
file_contents <- lapply(file_urls, function(x) {
  message('downloading: ', x)
  read_html(x)
})
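# Optional, more defensive variant: read_html() aborts the whole loop if a single
# download fails, while tryCatch() keeps going and returns NULL for failed pages.
# Uncomment to use it instead of the block above; failed pages are dropped so the
# urls and contents stay aligned:
# file_contents <- lapply(file_urls, function(x) {
#   message('downloading: ', x)
#   tryCatch(read_html(x), error = function(e) { message('failed: ', x); NULL })
# })
# keep <- !vapply(file_contents, is.null, logical(1))
# file_urls <- file_urls[ keep ]
# file_contents <- file_contents[ keep ]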
names(file_contents) <- file_urls
parsed_contents <- lapply(file_urls, function(x) {
  message('parsing: ', x)
  pg <- file_contents[[ x ]]
  parse_profile_page(pg, x)
})
parsed_contents_df <- rbindlist(parsed_contents) |> as.data.frame()
if (length(malformed_documents) > 0) {
  warning(sprintf('the following documents were malformed and not parsed: %s',
                  paste0(malformed_documents, collapse = ', ')))
}
View(parsed_contents_df)
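To get the rectangular layout asked for in the question (one row per document, one column per heading), the long data frame can be reshaped with tidyr's pivot_wider. A minimal sketch, assuming each heading occurs at most once per document (duplicates would produce list-columns and a warning):

library(tidyr)

wide_df <- pivot_wider(parsed_contents_df,
                       id_cols = url,
                       names_from = title,
                       values_from = contents)
View(wide_df)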