Home > Software engineering >  Webscraping all hidden/nested options of a webform as a table using R
Webscraping all hidden/nested options of a webform as a table using R

Time:09-30

I'm trying to scrape all form options/combinations from a URL. However, the form is designed as a hierarchical search, so the next three layers of options won't show until you select an option from the first layer (State). I have looked at existing answers here but can't find one that handles nested options with strings. Here is my sample code so far:

library(rvest)
library(dplyr)
library(purrr)
library(stringr) 

# Open a browsing session on the polling-unit search page.
url <- "https://inecnigeria.org/elections/polling-units"
pgsession <- session(url)

# Parse the forms found on the page once, then keep the second and third
# ones, which are the two search forms printed below.
page_forms <- html_form(pgsession)
pgform1 <- page_forms[[2]]
pgform2 <- page_forms[[3]]

pgform1
<form> 'pollForm' (GET )
 <field> (select) states: 
 <field> (select) : 
 <field> (select) : 
 <field> (button) : Click to Search

 pgform2
<form> 'form1' (GET )
 <field> (select) states: 
 <field> (select) : 
 <field> (select) : 
 <field> (button) : Click to Search

# Extract all values for states (the first level of options).
# ST1 is the options vector of the `states` field of the first form;
# ST2 holds the names of that vector (the labels printed by head() below).
state_field <- pgform1[["fields"]][["states"]]
ST1 <- state_field[["options"]]
ST2 <- names(ST1)

head(ST2)
[1] "Choose State" "ABIA"         "ADAMAWA"      "AKWA IBOM"    "ANAMBRA"      "BAUCHI"  

pgform1[['fields']][[2]][['options']]
 named character(0)

Only the states show initially, so I'm not able to fill in options for the next levels. The next level, LGA (I don't know what it is called in the HTML), only shows after you have selected a state, and the level after that, Ward (I don't know what it is called in the HTML either), only shows once you have selected an LGA. The polling units are then listed after these selections. How do I scrape all the possible combinations from the hierarchy of options available? I hope to end up with a table that shows all combinations of State, LGA, Ward, Polling Unit ID, Polling Unit Name and Remark.

EDIT: I tried this and got a new error

 # Fill in the state on the second search form and submit it through the live
 # session created earlier (`pgsession`). Note that `session` on its own is the
 # rvest constructor function, not a session object.
 # html_form_set() / session_submit() are the current rvest names for the
 # deprecated set_values() / submit_form().
 pgform2 <- html_form_set(pgform2, states = "Abia")
 result <- rvest::session_submit(
   pgsession,
   pgform2,
   submit = "Click to Search",
   httr::add_headers('x-requested-with' = 'XMLHttpRequest')
 )

This produced the error below:

Error in `submission_build()`:
! `form` doesn't contain a `action` attribute

CodePudding user response:

I was able to fill in the form and retrieve the results table with the following code:

library(stringr)
library(RSelenium)

# Start a Selenium server plus Chrome client and load the search page.
rd <- rsDriver(chromever = "105.0.5195.52", browser = "chrome", port = 4449L)
remDr <- rd$client
remDr$open()
url <- "https://inecnigeria.org/elections/polling-units/"
remDr$navigate(url)

Sys.sleep(1)

# Types `label` into the dropdown with the given id, pauses for one second
# (presumably so the dependent dropdown can refresh), and returns the element.
pick_option <- function(element_id, label) {
  dropdown <- remDr$findElement("id", element_id)
  dropdown$sendKeysToElement(list(label))
  Sys.sleep(1)
  dropdown
}

# Fill the three levels of the form in order: State, then LGA, then Ward.
web_Obj_State_Poll <- pick_option("statePoll", "EDO")
web_Obj_LGA_Poll <- pick_option("lgaPoll", "IVO")
web_Obj_Ward_Poll <- pick_option("wardPoll", "AUCHI")

# Run the search.
web_Obj_SearchPoll <- remDr$findElement("id", "SearchPoll")
web_Obj_SearchPoll$clickElement()

# Read the text of the result container and split it into one string per line.
web_Obj_Result <- remDr$findElement("id", "result")
strsplit(web_Obj_Result$getElementText()[[1]], "\n")[[1]]

[1] "Polling Unit ID Polling Unit Name Remark"            "001 IGUEBEN - AFUDA P/S/ I EXISTING PU"             
 [3] "002 IGUEBEN - AFUDA P/S/ II EXISTING PU"             "003 IGUEBEN - AFUDA P/S/ III EXISTING PU"           
 [5] "004 IGUEBEN - IDUMUOKA P/S/ EXISTING PU"             "005 OPEN SPACE BY AFUDA HEALTH CENTER NEW PU"       
 [7] "006 EGBESAN PRY SCH NEW PU"                          "007 AFUDA COMMUNITY HALL NEW PU"                    
 [9] "008 OPEN SPACE BY IDUMONKA PRY HEALTH CENTER NEW PU" "009 IDUMONKA COMMUNITY HALL NEW PU" 

CodePudding user response:

You can consider something like this:

library(stringr)
library(RSelenium)

# Start a Selenium server plus Chrome client and load the search page.
# NOTE(review): rsDriver() may already return an opened client, in which case
# the extra remDr$open() starts a second browser session -- confirm.
rd <- rsDriver(chromever = "105.0.5195.52", browser = "chrome", port = 4459L)
remDr <- rd$client
remDr$open()
url <- "https://inecnigeria.org/elections/polling-units/"
remDr$navigate(url)

Sys.sleep(1)

########################################
#### Get all the values of the form ####
########################################

# Returns the inner HTML of one element. Reading the element itself, rather
# than the whole page source, keeps options that belong to other dropdowns
# (the page has more than one form with a state list) out of the result.
get_Inner_Html <- function(web_Obj) {
  remDr$executeScript("return arguments[0].innerHTML", list(web_Obj))[[1]]
}

# Extracts the option labels from the HTML of a <select>.
# Only <option> tags containing a digit are kept (presumably the numeric
# `value` attribute, which skips the placeholder entry); the label is the
# text between the opening and closing tag.
extract_Option_Labels <- function(html_Content) {
  option_Tags <- stringr::str_extract_all(
    html_Content,
    "<option value[^<]*\\d{1,2}[^<]*</option>"
  )[[1]]
  stringr::str_replace_all(
    option_Tags,
    "(<option[^<]*>)(([:alpha:]|[:space:]|-)*)(</option>)",
    "\\2"
  )
}

# Level 1: states. Select the first state so that the LGA dropdown is filled.
web_Obj_State_Poll <- remDr$findElement("id", "statePoll")
html_Content_State_Poll <- get_Inner_Html(web_Obj_State_Poll)
options_State_Poll <- extract_Option_Labels(html_Content_State_Poll)
web_Obj_State_Poll$sendKeysToElement(list(options_State_Poll[1]))

# Wait for the dependent dropdown to be refreshed.
Sys.sleep(15)

# Level 2: LGAs of the selected state. The options are read before anything is
# typed into the dropdown, then the first LGA is selected to fill the wards.
web_Obj_LGA_Poll <- remDr$findElement("id", "lgaPoll")
html_Content_LGA_Poll <- get_Inner_Html(web_Obj_LGA_Poll)
options_LGA_Poll <- extract_Option_Labels(html_Content_LGA_Poll)
web_Obj_LGA_Poll$sendKeysToElement(list(options_LGA_Poll[1]))

Sys.sleep(15)

# Level 3: wards of the selected LGA.
web_Obj_Ward_Poll <- remDr$findElement("id", "wardPoll")
html_Content_Ward_Poll <- get_Inner_Html(web_Obj_Ward_Poll)
options_Ward_Poll <- extract_Option_Labels(html_Content_Ward_Poll)
web_Obj_Ward_Poll$sendKeysToElement(list(options_Ward_Poll[1]))

Sys.sleep(15)

##############################################
#### Loop over all the values of the form ####
##############################################

# Walks the State -> LGA -> Ward hierarchy and stores the search result of
# every combination in list_Table_Form[[state]][[lga]][[ward]] as a character
# vector with one element per line of the result text. Each level of the list
# is also named with the corresponding option label.
#
# The LGA list depends on the selected state and the ward list on the selected
# LGA, so both are re-read from the page for every parent value instead of
# reusing the lists that were found for the first state / first LGA.

url <- "https://inecnigeria.org/elections/polling-units/"
wait_Seconds <- 15

# Loads a fresh copy of the form and waits for it to settle.
open_Form <- function() {
  remDr$navigate(url)
  Sys.sleep(wait_Seconds)
}

# Types `label` into the <select> with id `element_Id`, then waits so that the
# dependent dropdown has time to refresh.
choose_Option <- function(element_Id, label) {
  web_Obj <- remDr$findElement("id", element_Id)
  web_Obj$sendKeysToElement(list(label))
  Sys.sleep(wait_Seconds)
  invisible(web_Obj)
}

# Returns the labels currently offered by the <select> with id `element_Id`
# (character(0) when it has no matching option). Uses the same patterns as the
# option discovery earlier in the script.
read_Options <- function(element_Id) {
  web_Obj <- remDr$findElement("id", element_Id)
  html_Content <- remDr$executeScript(
    "return arguments[0].innerHTML",
    list(web_Obj)
  )[[1]]
  option_Tags <- stringr::str_extract_all(
    html_Content,
    "<option value[^<]*\\d{1,2}[^<]*</option>"
  )[[1]]
  stringr::str_replace_all(
    option_Tags,
    "(<option[^<]*>)(([:alpha:]|[:space:]|-)*)(</option>)",
    "\\2"
  )
}

# Drop repeated state labels so that no state is scraped twice.
state_Labels <- unique(options_State_Poll)
nb_State_Poll <- length(state_Labels)

list_Table_Form <- setNames(vector("list", nb_State_Poll), state_Labels)

# seq_len() is used throughout so that an empty level is skipped instead of
# being iterated as c(1, 0).
for (l1 in seq_len(nb_State_Poll)) {
  # Discover the LGAs that belong to this state.
  open_Form()
  choose_Option("statePoll", state_Labels[l1])
  options_LGA_Poll <- read_Options("lgaPoll")
  nb_Lga_Poll <- length(options_LGA_Poll)
  list_Table_Form[[l1]] <- setNames(
    vector("list", nb_Lga_Poll),
    options_LGA_Poll
  )

  for (l2 in seq_len(nb_Lga_Poll)) {
    # Discover the wards that belong to this LGA.
    open_Form()
    choose_Option("statePoll", state_Labels[l1])
    choose_Option("lgaPoll", options_LGA_Poll[l2])
    options_Ward_Poll <- read_Options("wardPoll")
    nb_Ward_Poll <- length(options_Ward_Poll)
    list_Table_Form[[l1]][[l2]] <- setNames(
      vector("list", nb_Ward_Poll),
      options_Ward_Poll
    )

    for (l3 in seq_len(nb_Ward_Poll)) {
      message_Print <- paste0("l1 : ", l1, " l2 : ", l2, " l3 : ", l3)
      print(message_Print)

      # Start from a clean form for every combination.
      open_Form()
      choose_Option("statePoll", state_Labels[l1])
      choose_Option("lgaPoll", options_LGA_Poll[l2])
      choose_Option("wardPoll", options_Ward_Poll[l3])

      web_Obj_SearchPoll <- remDr$findElement("id", "SearchPoll")
      web_Obj_SearchPoll$clickElement()

      # One string per line of the result container's text.
      web_Obj_Result <- remDr$findElement("id", "result")
      list_Table_Form[[l1]][[l2]][[l3]] <-
        strsplit(web_Obj_Result$getElementText()[[1]], "\n")[[1]]
    }
  }
}
  • Related