Home > Net >  Web scraping with R: with multiple dropdown menu
Web scraping with R: with multiple dropdown menu

Time:10-13

I am trying to scrape data from the following website, which has four dropdown menus. After an option is chosen in each dropdown menu, the page shows a table whose data I want to scrape. I want to combine the information from the tables for all combinations of dropdown options. I am using the RSelenium package; however, as I am very new to web scraping, I could not work out how to loop over the available options of the four menus to build the final table.

https://hindi.iocl.com/lpgdistributors.aspx

I tried to adapt the code from a previous discussion on web scraping:

library(RSelenium)
library("rvest")

# Kill any java.exe process left over from an earlier run (presumably a stale
# Selenium server -- confirm). `taskkill` only exists on Windows, so the call
# is skipped on other platforms instead of failing there.
if (.Platform$OS.type == "windows") {
  system("taskkill /im java.exe /f", intern = FALSE, ignore.stdout = FALSE)
}

# Start a Selenium server with a Firefox session and keep the client handle.
rD <- rsDriver(browser = "firefox")
remDr <- rD$client

# Open the distributor search page.
remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")


# Return the `value` attribute of every <option> nested under the element with
# the given id, as a character vector (empty when the dropdown has no options).
#
# The earlier version ran a regex over the page source, which is the source of
# the whole page rather than of one dropdown, and relied on stringr functions
# without loading stringr. Reading the attributes directly avoids both issues.
get_option_values <- function(remDr, select_id) {
  xpath <- sprintf("//*[@id='%s']//option", select_id)
  option_elems <- remDr$findElements(using = "xpath", value = xpath)
  values <- lapply(option_elems, function(el) el$getElementAttribute("value"))
  as.character(unlist(values))
}

# Option values of the first dropdown (states).
option_values <- get_option_values(remDr, "cmbState")

# Option values of the 2nd and 3rd dropdowns, kept in their own variables so
# they no longer overwrite the state values.
# NOTE(review): these dropdowns are presumably only filled in after a
# selection is made in the preceding one -- confirm, and re-read them after
# each selection if so.
district_values <- get_option_values(remDr, "cmbDistrict")
market_values <- get_option_values(remDr, "cmbMarket")

# 4th dropdown: visible option labels, one per line of the element text.
option2 <- remDr$findElement(using = "id", value = "cmbArea")
option_values_2 <- unlist(
  strsplit(unlist(option2$getElementText()), "\n", fixed = TRUE)
)

#### create loop to loop over all tables...


# Click the <option> with the given `value` inside the dropdown with the given
# id. The XPath is scoped to that dropdown so that an identical value in
# another dropdown cannot be clicked by mistake.
# NOTE(review): the pause assumes the page needs a moment to refresh the
# dependent dropdowns after each selection -- confirm and tune `pause`.
select_option <- function(remDr, select_id, option_value, pause = 1) {
  xpath <- sprintf(
    "//*[@id='%s']//option[@value='%s']",
    select_id, option_value
  )
  remDr$findElement(using = "xpath", value = xpath)$clickElement()
  Sys.sleep(pause)
  invisible(NULL)
}

# One hard-coded combination; in a loop, replace the literals with the values
# collected from each dropdown.
select_option(remDr, "cmbState", "1")
select_option(remDr, "cmbDistrict", "185")
select_option(remDr, "cmbMarket", "2314")
select_option(remDr, "cmbArea", "57")

# click submit
submit <- remDr$findElement(using = "id", value = "btnSearch")
submit$clickElement()


# get table
tb <- remDr$findElement(using = "id", value = "grdDistributors")

# Parse only the HTML of the results element rather than the whole page, so
# unrelated tables elsewhere on the page are not picked up, and keep the result
# in a variable instead of discarding it. html_table() returns a list with one
# data frame per <table> found in that HTML.
distributor_tables <- tb$getElementAttribute("outerHTML")[[1]] %>%
  read_html() %>%
  html_table(fill = TRUE)

distributor_tables

CodePudding user response:

Here is a partial solution using RSelenium,

library(RSelenium)

# Start a Selenium server with a Chrome session and keep the client handle.
driver <- rsDriver(browser = "chrome")
remDr <- driver$client

# Open the distributor search page.
remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")

Get List of All the states

# Find the state dropdown and read its visible text.
webElem <- remDr$findElement(using = 'xpath', value = '//*[@id="cmbState"]')
states <- unlist(webElem$getElementText())

# Split the text on "newline + space"; sapply() with `[` turns the pieces into
# a one-column character matrix.
states <- sapply(strsplit(states, split = '\n ', fixed = TRUE), `[`)
      [,1]                   
 [1,] " Andaman & Nicobar"   
 [2,] "Andhra Pradesh"       
 [3,] "Arunachal Pradesh"    
 [4,] "Assam"                
 [5,] "Bihar"                
 [6,] "Chandigarh"           
 [7,] "Chhatisgarh"          
 [8,] "Goa"                  
 [9,] "Gujarat"              
[10,] "Haryana"              
[11,] "Himachal Pradesh"     
[12,] "Jammu & Kashmir"      
[13,] "Jharkhand"            
[14,] "Karnataka"    

 

Now you have to loop through the states to get the list of districts for each one. Here is one example:

### Pick a state
remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")
state_menu <- remDr$findElement(using = 'xpath', value = '//*[@id="cmbState"]')
state_menu$clickElement()
# Typing the label into the focused dropdown selects the matching entry.
state_menu$sendKeysToElement(list('Karnataka'))

### Read the district labels now offered by the second dropdown
district_menu <- remDr$findElement(
  using = 'xpath',
  value = '//*[@id="cmbDistrict"]'
)
district_menu$clickElement()
district <- unlist(district_menu$getElementText())
# Split on "newline + space"; the result is a one-column character matrix.
district <- sapply(strsplit(district, split = '\n ', fixed = TRUE), `[`)
      [,1]              
 [1,] " Bagalkot"       
 [2,] "Bangalore"       
 [3,] "Belgaum"         
 [4,] "Bellary"         
 [5,] "Bidar"           
 [6,] "Bijapur"         
 [7,] "Chamrajnagar"    
 [8,] "Chickmagalur"    
 [9,] "Chitradurga"     
[10,] "Coorg"           
[11,] "Dakshina Kannada"
[12,] "Davangere"       
[13,] "Dharwad" 

### Read the market labels offered by the third dropdown
market_menu <- remDr$findElement(
  using = 'xpath',
  value = '//*[@id="cmbMarket"]'
)
market_menu$clickElement()
market <- unlist(market_menu$getElementText())
# Split on "newline + space"; the result is a one-column character matrix.
market <- sapply(strsplit(market, split = '\n ', fixed = TRUE), `[`)
       [,1]               
  [1,] " Afzalpur"        
  [2,] "Ajjampur"         
  [3,] "Aland"            
  [4,] "Ankola"           
  [5,] "B. Mathikere"     
  [6,] "Bagalkot"         
  [7,] "Bailhongal"       
  [8,] "Bandipur"         
  [9,] "Bangalore"        
 [10,] "Basavakalyan"     
 [11,] "Belgaum"          
 [12,] "Bellary"          
 [13,] "Belthangadi"      
 [14,] "Bhadravathi"      
 [15,] "Bhadravati"       
 [16,] "Bhatkal"          
 [17,] "Bidar"            
 [18,] "Bijapur"          
 [19,] "Challakere"    

CodePudding user response:

You can do this easily with RSelenium, but first you have to check all the values of the dropdown menus.

Before creating an example, it is necessary to understand that, to interact with a dropdown menu, you have to study the HTML code behind the page and know each element of the menu.

This is an example:

    library(RSelenium)
    # Selenium environment activation
    rD <- rsDriver(browser = "firefox", check = FALSE)
    remDr <- rD[["client"]]
    remDr$navigate("https://hindi.iocl.com/lpgdistributors.aspx")

    # Labels to try in each of the four dropdown menus. Each list holds a
    # single entry here; add more entries to visit more combinations.
    menu_1 <- list("Assam")
    menu_2 <- list("Goalpara")
    menu_3 <- list("Goalpara")
    menu_4 <- list("Guwahati")

    # Activate the dropdown located by `xpath` with a click, send it the text
    # of `label` so the matching entry is chosen, then pause briefly before
    # the next interaction. The label is converted with as.factor(), as in
    # the original example.
    choose_option <- function(xpath, label) {
      remDr$findElement(using = 'xpath', value = xpath)$clickElement()
      remDr$findElement(using = 'xpath', value = xpath)$sendKeysToElement(
        list(as.factor(label))
      )
      Sys.sleep(1) # Is better to implement a sleep
    }

    # Four nested loops, one per dropdown. The search button is clicked
    # inside the innermost loop so that every combination is submitted, not
    # only the last one selected.
    # NOTE(review): if the page resets the dropdowns after a search, the
    # outer selections may need to be repeated for each combination -- confirm.
    for (i in menu_1) {
      choose_option('//*[@id="cmbState"]', i)
      for (j in menu_2) {
        choose_option('//*[@id="cmbDistrict"]', j)
        for (e in menu_3) {
          choose_option('//*[@id="cmbMarket"]', e)
          for (f in menu_4) {
            choose_option('//*[@id="cmbArea"]', f)
            # Click the button for searching
            remDr$findElement(
              using = 'xpath',
              value = '//*[@id="btnSearch"]'
            )$clickElement()
            Sys.sleep(1) # give the result time to load before moving on
          }
        }
      }
    }
  • Related