I am learning web scraping and have been facing one hurdle after another. I want to create a data frame containing the first table on this page for every portfolio manager, for August 2022.
So far, I have found a way to scrape a single table properly (I think! Please let me know if I can improve on this).
I haven't been able to bind all the tables into a single data frame properly. I would also like to know whether there is a way to transform this form-style data into a proper data frame, with the first column of every table supplying the variable names and the second column supplying the values of one row. (I know I can do this with the usual data-wrangling steps, but I wanted to know whether a function already exists that turns this kind of form-style data into a data frame.)
> library(tidyverse)
> library(rvest)
> library(httr)
> url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
> pm_id <- read_html(url) %>%
html_elements('select[name="pmrId"].f_control option') %>%
html_attr("value")
> pm_id <- pm_id[2:416]
> sebi_pm <- function(x) {
resp = POST(url,
body = list(
pmrId= x,
year="2022",
m .... [TRUNCATED]
> #s <- lapply(pm_id[i], sebi_pm)
> #v <- sebi_pm(pm_id[1])
> #v
> #do.call() lapply(pm_id[1:5], sebi_pm)
> ha <- do.call("rbind", lapply(pm_id, sebi_ .... [TRUNCATED]
#> Error in .[[1]] : subscript out of bounds
CodePudding user response:
Normally I would be a stickler for a reproducible example, but I think I know what you're getting at here... try this...
# DEPENDENCIES -----------------------------------------------------------------
library(rvest)
library(httr)
library(stringr)
library(data.table)
# UTILITY FUNCTIONS ------------------------------------------------------------
#' List the portfolio-manager option values offered by the report form
#'
#' Downloads the report page and reads the `value` attribute of every
#' `<option>` inside the `pmrId` drop-down.
#'
#' @return A character vector with one element per `<option>`, in page
#'   order. Nothing is filtered out here, so any placeholder entry the page
#'   contains is still present in the result.
get_pm_ids <- function() {
  form_url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"

  page <- read_html(form_url)
  option_nodes <- html_elements(page, 'select[name="pmrId"].f_control option')

  html_attr(option_nodes, "value")
}
#' Fetch one portfolio manager's monthly report as a one-row data frame
#'
#' Submits the report form for the given manager, year and month, looks up
#' the report table in the response and turns its (caption, value) rows
#' into a single wide row: the first cell of each table row becomes a
#' column name and the second cell becomes that column's value.
#'
#' @param pmr_id A single character string: one `value` taken from the
#'   `pmrId` drop-down. The id is split on `"@@"` and, when it has at least
#'   three fields, the third one is used as the label in the progress
#'   message (presumably the manager's name -- confirm against the page).
#' @param report_year Length-one year to request, e.g. `2022`.
#' @param report_month Length-one month to request, e.g. `1`.
#'
#' @return A data frame with exactly one row. It always contains `id`,
#'   `report_year` and `report_month`; when a report table was found it
#'   also contains one character column per usable table row, named after
#'   that row's caption (duplicates are disambiguated with `make.unique()`).
#'   When no usable table row is found only the three identifying columns
#'   are returned.
#'
#' @details A non-success HTTP status raises a warning instead of an error,
#'   so one bad response does not abort a batch run driven by `lapply()`.
get_monthly_report <- function(pmr_id, report_year, report_month) {
  stopifnot(
    is.character(pmr_id), length(pmr_id) == 1L,
    length(report_year) == 1L,
    length(report_month) == 1L
  )

  # The label is only used for logging, so an id without a third "@@" field
  # must not abort the download: fall back to the raw id instead.
  id_fields <- str_split(pmr_id, "@@", simplify = TRUE)
  manager_label <- if (ncol(id_fields) >= 3L) {
    str_squish(id_fields[, 3])
  } else {
    pmr_id
  }
  message(sprintf(
    "fetching report for portfolio manager: %s; year = %s; month = %s",
    manager_label, report_year, report_month
  ))

  url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
  # Full set of form fields sent with the request; only pmrId, year and
  # month vary between calls.
  params <- list(
    currdate = "",
    loginflag = 0,
    searchValue = "",
    pmrId = pmr_id,
    year = report_year,
    month = report_month,
    loginEmail = "",
    loginPassword = "",
    cap_login = "",
    moduleNo = -1,
    moduleId = "",
    link = "",
    yourName = "",
    friendName = "",
    friendEmail = "",
    mailmessage = "",
    cap_email = ""
  )

  resp <- POST(url, body = params)
  # Surface HTTP failures; without this a failed request is silently
  # reported as "no records".
  warn_for_status(resp, task = "fetch monthly report")

  page <- httr::content(resp)
  tbl <- html_elements(
    page,
    "div.portlet:nth-child(3) > div:nth-child(1) > table:nth-child(1)"
  )

  # Read every row's cell texts once, then keep only rows that really have
  # both a caption and a value. Taking captions and values from the same
  # filtered list guarantees the two vectors stay aligned.
  row_cells <- lapply(html_elements(tbl, "tr"), function(row) {
    html_text(html_children(row))
  })
  row_cells <- Filter(function(cells) length(cells) >= 2L, row_cells)

  if (length(row_cells) == 0L) {
    # no records found
    return(data.frame(
      id = pmr_id,
      report_year = report_year,
      report_month = report_month
    ))
  }

  cell_captions <- vapply(row_cells, function(cells) cells[[1L]], character(1))
  cell_contents <- vapply(row_cells, function(cells) cells[[2L]], character(1))

  result_df <- data.frame(t(cell_contents))
  # Repeated captions would otherwise create duplicate column names, which
  # makes name-based row binding ambiguous.
  colnames(result_df) <- make.unique(cell_captions)
  result_df$id <- pmr_id
  result_df$report_year <- report_year
  result_df$report_month <- report_month

  result_df
}
# MAIN -------------------------------------------------------------------------
## 1. fetch list of portfolio manager ids --------------------------------------
pm_ids <- get_pm_ids()
## 2. filter list of portfolio manager ids -------------------------------------
# Skip the first <option> (presumably the drop-down's placeholder entry --
# confirm against the page). Dropping by position rather than slicing to a
# fixed upper index keeps this correct when the number of managers changes:
# a fixed range yields NA ids if the list shrinks and silently omits
# managers if it grows.
pm_ids <- pm_ids[-1]
## 3. testing: fetch reports for a sample of managers in January 2022 ----------
set.seed(1234)
# Never ask for more ids than are available; sample() errors otherwise.
tmp <- sample(pm_ids, min(5L, length(pm_ids)))
reports_list <- lapply(tmp, get_monthly_report, 2022, 1)
## 4. combine the results ------------------------------------------------------
# fill = TRUE because a "no records" result carries only the id columns.
reports_df <- rbindlist(reports_list, use.names = TRUE, fill = TRUE) |>
  as.data.frame()
## 5. inspect results ----------------------------------------------------------
# View() needs an interactive session; print instead when run as a script.
if (interactive()) {
  View(reports_df, "downloaded reports")
} else {
  print(reports_df)
}
This code could be improved by providing some kind of input validation and more robust error handling. Hope this helps!