I am trying to scrape data from the following website, which has 4 dropdown menus. After a value is selected in each dropdown menu, the page shows a table from which I want to scrape data. I want to combine the information from the tables for all combinations of dropdown options. I am using the RSelenium package; however, as I am very new to web scraping, I could not work out how to write a loop over the options of the four dropdowns to build the final table.
https://hindi.iocl.com/lpgdistributors.aspx
I tried to adapt the code from a previous discussion on web scraping:
library(RSelenium)
library("rvest")
# Kill leftover Java processes (presumably a Selenium server from an earlier
# run) before starting a new one. `taskkill` only exists on Windows, so the
# call is skipped on other platforms instead of failing in the shell.
# NOTE: this terminates every java.exe on the machine, not only Selenium.
if (.Platform$OS.type == "windows") {
  system("taskkill /im java.exe /f", intern = FALSE, ignore.stdout = FALSE)
}

# Start a Selenium server with a Firefox session and keep the client handle.
rD <- rsDriver(browser = "firefox")
remDr <- rD$client

# Open the distributor search page.
remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")
# Return the `value` attribute of every <option> inside the dropdown whose id
# is `select_id`, as a character vector.
#
# The earlier version called getPageSource() on the dropdown element. That
# method returns the source of the whole page, so the regex picked up 4-digit
# numbers from anywhere in the document rather than this dropdown's options.
get_option_values <- function(select_id) {
  dropdown <- remDr$findElement(using = "id", value = select_id)
  option_nodes <- dropdown$findChildElements(using = "tag name", value = "option")
  vapply(
    option_nodes,
    function(node) {
      value <- node$getElementAttribute("value")
      if (length(value) == 0L) NA_character_ else as.character(value[[1]])
    },
    character(1)
  )
}

# Option values of each dropdown, kept in separate variables so one lookup no
# longer overwrites the previous one.
# NOTE(review): the district/market/area dropdowns presumably only fill in
# after a parent value has been chosen -- re-run the matching line after each
# selection; confirm against the live page.
option_values <- get_option_values("cmbState")
district_values <- get_option_values("cmbDistrict")
market_values <- get_option_values("cmbMarket")
area_values <- get_option_values("cmbArea")

# Visible labels of the 4th dropdown, one per line of the element text.
# Base strsplit() is used so the script does not need stringr.
option2 <- remDr$findElement(using = "id", value = "cmbArea")
option_values_2 <- unlist(
  strsplit(option2$getElementText()[[1]], "\n", fixed = TRUE)
)
#### Select one value in each dropdown, submit, and read the result table.
#### To cover every table, wrap these calls in loops over the option values
#### of each dropdown.

# Click the <option> with value `option_value` inside the dropdown with id
# `select_id` and return the clicked element invisibly.
#
# The XPath is scoped to the dropdown: an unscoped `//*/option[@value = ...]`
# returns the first option with that value anywhere on the page, which is the
# wrong element whenever two dropdowns share a value.
select_option <- function(select_id, option_value) {
  xpath <- sprintf(
    "//*[@id = '%s']//option[@value = '%s']", select_id, option_value
  )
  chosen <- remDr$findElement(using = "xpath", value = xpath)
  chosen$clickElement()
  # Give the page time to react before the next lookup (the dependent
  # dropdowns presumably reload after a selection; tune the delay as needed).
  Sys.sleep(1)
  invisible(chosen)
}

# Example combination; replace the literals with loop variables.
option <- select_option("cmbState", "1")
option2 <- select_option("cmbDistrict", "185")
option3 <- select_option("cmbMarket", "2314")
option4 <- select_option("cmbArea", "57")

# click submit
submit <- remDr$findElement(using = "id", value = "btnSearch")
submit$clickElement()
Sys.sleep(1)

# get table: parse only the HTML of the result grid. getPageSource() on the
# element would return the whole page and therefore every table on it.
tb <- remDr$findElement(using = "id", value = "grdDistributors")
tables <- tb$getElementAttribute("outerHTML")[[1]] %>%
  read_html() %>%
  html_table(fill = TRUE)
distributors <- if (length(tables) > 0) tables[[1]] else NULL
distributors
CodePudding user response:
Here is a partial solution using RSelenium:
library(RSelenium)
# Launch a Selenium server with a Chrome session and grab its client handle.
driver <- rsDriver(browser = "chrome")
remDr <- driver$client

# Load the distributor search page.
remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")
Get the list of all the states:
# Read the visible text of the state dropdown: one option label per line.
webElem <- remDr$findElement(using = "xpath", value = '//*[@id="cmbState"]')
states <- webElem$getElementText()[[1]]

# Split on line breaks and trim every label. This yields a plain character
# vector (the sapply() form produced a one-column matrix) and removes the
# leading blank that splitting on "\n " left on the first entry.
states <- trimws(strsplit(states, split = "\n", fixed = TRUE)[[1]])
states <- states[nzchar(states)]
[,1]
[1,] " Andaman & Nicobar"
[2,] "Andhra Pradesh"
[3,] "Arunachal Pradesh"
[4,] "Assam"
[5,] "Bihar"
[6,] "Chandigarh"
[7,] "Chhatisgarh"
[8,] "Goa"
[9,] "Gujarat"
[10,] "Haryana"
[11,] "Himachal Pradesh"
[12,] "Jammu & Kashmir"
[13,] "Jharkhand"
[14,] "Karnataka"
Now you have to loop through the states to get the list of all the districts. Here is one example:
### Select the state
remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")
opt1 <- remDr$findElement(using = "xpath", value = '//*[@id="cmbState"]')
opt1$clickElement()
opt1$sendKeysToElement(list("Karnataka"))
# Pause so the district dropdown can refresh for the chosen state (assumed to
# reload after the selection; adjust the delay if the page is slower).
Sys.sleep(1)

### Get the list of all the districts
webElem <- remDr$findElement(using = "xpath", value = '//*[@id="cmbDistrict"]')
webElem$clickElement()
district <- webElem$getElementText()[[1]]

# Split on line breaks and trim every label, giving a plain character vector
# without the leading blank that splitting on "\n " left on the first entry.
district <- trimws(strsplit(district, split = "\n", fixed = TRUE)[[1]])
district <- district[nzchar(district)]
[,1]
[1,] " Bagalkot"
[2,] "Bangalore"
[3,] "Belgaum"
[4,] "Bellary"
[5,] "Bidar"
[6,] "Bijapur"
[7,] "Chamrajnagar"
[8,] "Chickmagalur"
[9,] "Chitradurga"
[10,] "Coorg"
[11,] "Dakshina Kannada"
[12,] "Davangere"
[13,] "Dharwad"
### Get the list of all the markets
webElem <- remDr$findElement(using = "xpath", value = '//*[@id="cmbMarket"]')
webElem$clickElement()
market <- webElem$getElementText()[[1]]

# Split on line breaks and trim every label, giving a plain character vector
# without the leading blank that splitting on "\n " left on the first entry.
market <- trimws(strsplit(market, split = "\n", fixed = TRUE)[[1]])
market <- market[nzchar(market)]
[,1]
[1,] " Afzalpur"
[2,] "Ajjampur"
[3,] "Aland"
[4,] "Ankola"
[5,] "B. Mathikere"
[6,] "Bagalkot"
[7,] "Bailhongal"
[8,] "Bandipur"
[9,] "Bangalore"
[10,] "Basavakalyan"
[11,] "Belgaum"
[12,] "Bellary"
[13,] "Belthangadi"
[14,] "Bhadravathi"
[15,] "Bhadravati"
[16,] "Bhatkal"
[17,] "Bidar"
[18,] "Bijapur"
[19,] "Challakere"
CodePudding user response:
You can do this easily with RSelenium, but first you have to check all the values of the dropdown menus.
Before building an example, note that to interact with a dropdown menu you have to study the HTML code behind the page and know each element of the menu.
This is an example:
library(RSelenium)
# Start the Selenium environment: a server plus a Firefox session, with
# rsDriver's `check` step switched off.
rD <- rsDriver(browser = "firefox", check = FALSE)
remDr <- rD$client

# Open the distributor search page.
remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")
# Labels to choose in each dropdown (state, district, market, area). Add more
# entries to a list to visit more combinations.
menu_1 <- list("Assam")
menu_2 <- list("Goalpara")
menu_3 <- list("Goalpara")
menu_4 <- list("Guwahati")

# Open the dropdown with id `select_id` and type `label` into it so that the
# matching entry becomes selected. The label is sent as plain text, so no
# conversion to factor is needed.
choose_entry <- function(select_id, label) {
  xpath <- sprintf('//*[@id="%s"]', select_id)
  remDr$findElement(using = "xpath", value = xpath)$clickElement()
  remDr$findElement(using = "xpath", value = xpath)$sendKeysToElement(
    list(as.character(label))
  )
  Sys.sleep(1) # Let the page react before touching the next dropdown
}

# Four nested loops, one per dropdown. The search button is pressed inside the
# innermost loop so that every combination is submitted (pressing it after the
# loops would only ever submit the last one), and the resulting page source is
# stored under a key naming the combination for later parsing.
pages <- list()
for (i in menu_1) {
  choose_entry("cmbState", i)
  for (j in menu_2) {
    choose_entry("cmbDistrict", j)
    for (e in menu_3) {
      choose_entry("cmbMarket", e)
      for (f in menu_4) {
        choose_entry("cmbArea", f)
        # Click the button for searching
        remDr$findElement(
          using = "xpath", value = '//*[@id="btnSearch"]'
        )$clickElement()
        Sys.sleep(1) # Wait for the result table to load
        pages[[paste(i, j, e, f, sep = " / ")]] <- remDr$getPageSource()[[1]]
      }
    }
  }
}