I've been given a pile of about 100 HTML files that I want to put into rectangular form. Here's an example: http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/abergsson-anna.html. I would like to extract the headings (h3) as column names and the content between them as strings, one row per document.
I've managed to extract the column names with rvest in R, but I'm stuck at extracting the content, and I'm sure I'll get stuck again when trying to bind everything together into one data frame.
This is what I've done to extract the variable names:

library(rvest)
library(purrr)

variable.names <- map(LIST.html, ~ read_html(.x) %>%
  html_nodes("h3") %>%
  html_text(trim = TRUE) %>%
  tolower())
Here's the code I used to get all the files:

system("wget -r -np -nH --cut-dirs=3 -R index.html http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/")
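With those flags the mirrored pages should end up under a local ffo/kapital/ directory (an assumption based on -nH --cut-dirs=3; adjust to wherever wget was actually run), so the LIST.html vector used above can be built with:

LIST.html <- list.files("ffo/kapital", pattern = "\\.html$", full.names = TRUE)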
CodePudding user response:
library(rvest)
library(stringr)
library(data.table)
malformed_documents <- character(0)  # collects the URLs of pages the parser skips
parse_profile_page <- function(pg, nm) {
  # extract section divs, omit the byline div at the top
  divs <- html_nodes(pg, '#mittvagn > div')[ -1 ]

  # drop the div with id 'bakvagn', if present
  idx <- which(html_attr(divs, 'id') == 'bakvagn')
  if (length(idx) > 0) {
    divs <- divs[ -idx ]
  }

  # extract section headers
  titles <- html_nodes(divs, 'h3') |>
    html_text(trim = TRUE) |>
    tolower()

  # extract section contents: all <p> text per div, collapsed into one string
  paragraphs <- lapply(divs, html_nodes, 'p') |>
    lapply(html_text, trim = TRUE) |>
    lapply(paste0, collapse = '\n') |>
    lapply(str_squish) |>
    unlist()

  # headers and contents must pair up one-to-one; otherwise skip the document
  if (length(paragraphs) != length(titles)) {
    message(sprintf('%s is malformed, not parsing', nm))
    malformed_documents <<- c(malformed_documents, nm)
    return(data.frame())
  }

  data.frame(title = titles,
             contents = paragraphs,
             url = rep(nm, length(paragraphs)))
}
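# Optional sanity check: run the parser on a single page first
# (URL taken from the question's example; uncomment to try it):
# example_pg <- read_html('http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/abergsson-anna.html')
# parse_profile_page(example_pg, 'abergsson-anna.html')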
# obtain list of files to download
url <- 'http://www.skeptron.uu.se/broady/arkiv/a/ffo/kapital/'
pg <- read_html(url)
file_urls <- html_nodes(pg, '#mittvagn > ol > li > a:nth-child(1)') |>
  html_attr('href') |>
  str_replace_all('^\\.\\./', '') |>
  sprintf(fmt = 'http://www.skeptron.uu.se/broady/arkiv/a/ffo/%s')
# file_urls <- sample(file_urls, 10) # uncomment to run on a small sample of pages
file_contents <- lapply(file_urls, function(x) {
  message('downloading: ', x)
  read_html(x)
})
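# Optional, more defensive variant: read_html() aborts the whole loop if a single
# download fails, while tryCatch() keeps going and returns NULL for failed pages.
# Uncomment to use it instead of the block above; failed pages are dropped so the
# urls and contents stay aligned:
# file_contents <- lapply(file_urls, function(x) {
#   message('downloading: ', x)
#   tryCatch(read_html(x), error = function(e) { message('failed: ', x); NULL })
# })
# keep <- !vapply(file_contents, is.null, logical(1))
# file_urls <- file_urls[ keep ]
# file_contents <- file_contents[ keep ]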
names(file_contents) <- file_urls
parsed_contents <- lapply(file_urls, function(x) {
  message('parsing: ', x)
  pg <- file_contents[[ x ]]
  parse_profile_page(pg, x)
})
parsed_contents_df <- rbindlist(parsed_contents) |> as.data.frame()
if (length(malformed_documents) > 0) {
  warning(sprintf('the following documents were malformed and not parsed: %s',
                  paste0(malformed_documents, collapse = ', ')))
}
View(parsed_contents_df)
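To get the rectangular layout asked for in the question (one row per document, one column per heading), the long data frame can be reshaped with tidyr's pivot_wider. A minimal sketch, assuming each heading occurs at most once per document (duplicates would produce list-columns and a warning):

library(tidyr)

wide_df <- pivot_wider(parsed_contents_df,
                       id_cols = url,
                       names_from = title,
                       values_from = contents)
View(wide_df)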