I am currently working on the site https://www.pmjdy.gov.in/Archive, where I first have to enter a date (it has to be a Wednesday); the search result then opens a web page with the data stored in a table embedded on the page itself. Can I somehow web-scrape all of this data and store it in an Excel document? I have written the following code; however, it is erroring out. I think this might be because this link, whenever it opens, requires human input — entering a Wednesday date from the calendar — to get the report. Can I automate this process so that I just supply the date in the code itself and the data gets downloaded? It would be of great help. Thanks :)
# Load packages
library(RSelenium)
library(rvest)
library(tidyverse)
library(stringr)
library(purrr)

# Start a Selenium-driven Firefox session
rD <- rsDriver(browser = "firefox", port = 4567L, verbose = FALSE)
remDr <- rD[["client"]]

# Go to the site
remDr$navigate("https://www.pmjdy.gov.in/Archive")

# Find candidate table containers.
# NOTE(review): the report table is only rendered after a (Wednesday) date is
# submitted on the page, so on first load this likely matches nothing — which
# is why the original script errors out.
tables <- remDr$findElements('class', 'table-container')

# Parse each element's inner HTML into a data frame.
# Preallocate and use seq_along(): `1:length(tables)` yields c(1, 0) when no
# elements are found, producing a confusing subscript error.
tableList <- vector("list", length(tables))
for (i in seq_along(tables)) {
  x <- tables[[i]]$getElementAttribute('innerHTML') %>%
    unlist() %>%
    read_html() %>%
    html_table()
  tableList[[i]] <- x[[1]]
}

# Kill the Selenium/geckodriver Java process (Windows-specific cleanup)
system("taskkill /im java.exe /f", intern = FALSE, ignore.stdout = FALSE)
CodePudding user response:
No need for human intervention — we can input the date programmatically using `sendKeysToElement()`.
# Load packages
library(RSelenium)
library(rvest)
library(dplyr)
library(writexl)  # for write_xlsx(); the xlsx package also works

# Start a Selenium-driven Firefox session
driver <- rsDriver(browser = "firefox")
remDr <- driver[["client"]]

# Navigate to the archive page
url <- "https://www.pmjdy.gov.in/Archive"
remDr$navigate(url)

# Input the report date (must be a Wednesday, dd/mm/yyyy format)
date_element <- remDr$findElement(
  using = "xpath",
  value = '//*[@id="ContentPlaceHolder1_txtdate"]'
)
date_element$sendKeysToElement(list("23/02/2022"))

# Click the "Get Report" button
remDr$findElement(
  using = "xpath",
  '//*[@id="ContentPlaceHolder1_btngetreport"]'
)$clickElement()

# Scrape the report table from the rendered page source
df <- remDr$getPageSource()[[1]] %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="ContentPlaceHolder1_tblHTMLReport"]/tbody') %>%
  html_table()

# Remove mostly-NA columns from the scraped table
df1 <- df[[1]]
colMeans(is.na(df1)) > 0.50                    # inspect NA share per column
final <- df1[, colMeans(is.na(df1)) <= 0.15]   # fixed: stray trailing ")" removed

# Export the data to an Excel file
write_xlsx(final, "report.xlsx")               # fixed: missing closing ")"