I am trying to scrape the first 200 entries from https://www.ssrn.com/index.cfm/en/arn/?page=1&sort=0 (title, authors, URL, ...). So far I have used rvest (which worked fine looping over the first 4 pages until this week), and I am now trying to read the JSON directly from https://api.ssrn.com/content/v1/bindings/204/papers. The code below works fine, but I don't know how to retrieve more than the first 50 entries, or even display more than 50 entries (out of 43602). Is there a solution using jsonlite or rvest?
Any help appreciated! Thanks in advance.
library(jsonlite)
# JSON endpoint behind the SSRN listing page (no paging parameters supplied).
json_file <- "https://api.ssrn.com/content/v1/bindings/204/papers"
# Download and parse the JSON, then coerce the parsed result to a data frame.
data <- as.data.frame(fromJSON(json_file))
CodePudding user response:
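If you look at the link, you can alter the query parameters `count` (the number of entries returned per request) and `index` (the position of the first entry returned). The maximum `count` is 200 per request, so map over a sequence of `index` values to get all 43602 entries like so (2-3 min scraping time):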
library(tidyverse)
library(httr2)
# Fetch one page of papers from the SSRN listing API.
#
# index: start position of the page (the first page is requested with 0).
# count: number of entries to request; defaults to 200, the page size used
#        by the original call.
#
# Returns the "papers" element of the JSON response as a tibble. Progress is
# printed to the console with cat(). HTTP failures surface as errors raised
# by req_perform().
get_ssrn <- function(index, count = 200) {
  cat("Scraping index:", index, "\n")

  # Pasting a double straight into the URL renders round values such as 1e5
  # as "1e+05"; force fixed notation so the query string stays valid.
  as_query_value <- function(x) format(x, scientific = FALSE, trim = TRUE)

  str_c(
    "https://api.ssrn.com/content/v1/bindings/204/papers?index=",
    as_query_value(index),
    "&count=", as_query_value(count),
    "&sort=0"
  ) %>%
    request() %>%
    req_perform() %>%
    resp_body_json(simplifyVector = TRUE) %>%
    pluck("papers") %>%
    as_tibble()
}
# Request every page (start positions 0, 200, 400, ...) and stack the
# per-page tibbles into a single table.
df <- seq(0, 43602, by = 200) %>%
  map(get_ssrn) %>%
  bind_rows()
df
# A tibble: 43,602 × 13
abstract_…¹ publi…² is_paid refer…³ page_…⁴ title authors affil…⁵ id is_ap…⁶ appro…⁷ downl…⁸
<chr> <chr> <lgl> <chr> <int> <chr> <list> <chr> <int> <lgl> <chr> <int>
1 Working Pa… UNDER … FALSE "" 68 "Is … <df> "Conco… 4.33e6 TRUE 20 Jan… 27
2 Working Pa… UNDER … FALSE "" 58 "The… <df> "Unive… 4.33e6 TRUE 20 Jan… 14
3 Working Pa… UNDER … FALSE "" 7 "App… <df> "Atma … 4.33e6 TRUE 20 Jan… 2
4 Working Pa… UNDER … FALSE "" 7 "The… <df> "Atmaj… 4.33e6 TRUE 20 Jan… 2
5 Working Pa… UNDER … FALSE "Afric… 0 "Mer… <df> "Indep… 4.33e6 TRUE 20 Jan… 0
6 Working Pa… UNDER … FALSE "" 22 "Siz… <df> "Unive… 4.33e6 TRUE 20 Jan… 2
7 Accepted P… UNDER … FALSE "Finan… 0 "Bud… <df> "Norwe… 4.33e6 TRUE 20 Jan… 0
8 Working Pa… UNDER … FALSE "Journ… 6 "Fac… <df> "Open … 4.33e6 TRUE 20 Jan… 2
9 Working Pa… UNDER … FALSE "" 34 "Soc… <df> "Unive… 4.33e6 TRUE 20 Jan… 1
10 Working Pa… UNDER … FALSE "Manag… 0 "Aud… <df> "Chu H… 4.33e6 TRUE 20 Jan… 0
# … with 43,592 more rows, 1 more variable: url <chr>, and abbreviated variable names
# ¹abstract_type, ²publication_status, ³reference, ⁴page_count, ⁵affiliations, ⁶is_approved,
# ⁷approved_date, ⁸downloads
CodePudding user response:
Keeping papers and authors in 2 separate tables:
library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)
library(purrr)
# Largest page size requested per call.
MAX_COUNT <- 200
api_templ <- "https://api.ssrn.com/content/v1/bindings/204/papers?index={start_idx}&count={count}&sort=0"

# Download and parse one page of results beginning at `start_idx`.
# The offset is formatted in fixed notation because glue would otherwise
# render round doubles such as 1e5 as "1e+05" inside the URL.
fetch_page <- function(start_idx) {
  url <- str_glue(
    api_templ,
    start_idx = format(start_idx, scientific = FALSE, trim = TRUE),
    count = MAX_COUNT
  )
  read_json(url, simplifyVector = TRUE)
}

# get the first set and total number of papers
resp <- fetch_page(0)
resp$total
#> [1] 43602

# overwrite to limit requests while testing
resp$total <- 600

# Start positions of the remaining pages. The first page starts at 0, so the
# last useful start position is total - 1; using `total` itself as the upper
# bound would request one page too many whenever total is a multiple of
# MAX_COUNT. Building the sequence from 0 and dropping the first element also
# yields an empty vector (instead of a seq() error) when total <= MAX_COUNT.
start_idxs <- seq(0, max(resp$total - 1, 0), by = MAX_COUNT)[-1]
papers <- map_df(start_idxs, ~ fetch_page(.x)$papers)

# add papers from the first response
papers <- bind_rows(resp$papers, papers)
# Each paper carries its authors as a nested table. Expand that column into
# one row per author, tagging every row with the id of the paper it came from.
authors <- papers %>%
  select(id, authors) %>%
  rename(id_paper = id) %>%
  unnest(cols = authors) %>%
  rename(id_author = id, url_author = url)

# The papers table itself no longer needs the nested column.
papers <- as_tibble(select(papers, -authors))
Result :
# preview the first six rows of the papers table
head(papers, n = 6)
#> # A tibble: 6 × 12
#> abstrac…¹ publi…² is_paid refer…³ page_…⁴ title affil…⁵ id is_ap…⁶ appro…⁷
#> <chr> <chr> <lgl> <chr> <int> <chr> <chr> <int> <lgl> <chr>
#> 1 Working … UNDER … FALSE "" 68 Is B… Concor… 4.33e6 TRUE 20 Jan…
#> 2 Working … UNDER … FALSE "" 58 The … Univer… 4.33e6 TRUE 20 Jan…
#> 3 Working … UNDER … FALSE "" 7 Appl… Atma J… 4.33e6 TRUE 20 Jan…
#> 4 Working … UNDER … FALSE "" 7 The … Atmaja… 4.33e6 TRUE 20 Jan…
#> 5 Working … UNDER … FALSE "Afric… 0 Merg… Indepe… 4.33e6 TRUE 20 Jan…
#> 6 Working … UNDER … FALSE "" 22 Size… Univer… 4.33e6 TRUE 20 Jan…
#> # … with 2 more variables: downloads <int>, url <chr>, and abbreviated variable
#> # names ¹abstract_type, ²publication_status, ³reference, ⁴page_count,
#> # ⁵affiliations, ⁶is_approved, ⁷approved_date
# preview the first six rows of the authors table
head(authors, n = 6)
#> # A tibble: 6 × 5
#> id_paper id_author last_name first_name url_author
#> <int> <int> <chr> <chr> <chr>
#> 1 4330623 643676 Proelss Juliane https://papers.ssrn.com/sol3/cf_dev/…
#> 2 4330623 744422 Schweizer Denis https://papers.ssrn.com/sol3/cf_dev/…
#> 3 4330623 3518984 Sevigny Stephane https://papers.ssrn.com/sol3/cf_dev/…
#> 4 4330532 1530510 Cunningham Lauren M. https://papers.ssrn.com/sol3/cf_dev/…
#> 5 4330532 1452555 Hayne Christie https://papers.ssrn.com/sol3/cf_dev/…
#> 6 4330532 51250 Neal Terry L. https://papers.ssrn.com/sol3/cf_dev/…
# optional: attach the author rows to their paper, matching on the paper id
papers %>%
  left_join(authors, by = c(id = "id_paper"))
#> # A tibble: 1,903 × 16
#> abstra…¹ publi…² is_paid refer…³ page_…⁴ title affil…⁵ id is_ap…⁶ appro…⁷
#> <chr> <chr> <lgl> <chr> <int> <chr> <chr> <int> <lgl> <chr>
#> 1 Working… UNDER … FALSE "" 68 Is B… Concor… 4.33e6 TRUE 20 Jan…
#> 2 Working… UNDER … FALSE "" 68 Is B… Concor… 4.33e6 TRUE 20 Jan…
#> 3 Working… UNDER … FALSE "" 68 Is B… Concor… 4.33e6 TRUE 20 Jan…
#> 4 Working… UNDER … FALSE "" 58 The … Univer… 4.33e6 TRUE 20 Jan…
#> 5 Working… UNDER … FALSE "" 58 The … Univer… 4.33e6 TRUE 20 Jan…
#> 6 Working… UNDER … FALSE "" 58 The … Univer… 4.33e6 TRUE 20 Jan…
#> 7 Working… UNDER … FALSE "" 58 The … Univer… 4.33e6 TRUE 20 Jan…
#> 8 Working… UNDER … FALSE "" 7 Appl… Atma J… 4.33e6 TRUE 20 Jan…
#> 9 Working… UNDER … FALSE "" 7 The … Atmaja… 4.33e6 TRUE 20 Jan…
#> 10 Working… UNDER … FALSE "Afric… 0 Merg… Indepe… 4.33e6 TRUE 20 Jan…
#> # … with 1,893 more rows, 6 more variables: downloads <int>, url <chr>,
#> # id_author <int>, last_name <chr>, first_name <chr>, url_author <chr>, and
#> # abbreviated variable names ¹abstract_type, ²publication_status, ³reference,
#> # ⁴page_count, ⁵affiliations, ⁶is_approved, ⁷approved_date
Created on 2023-01-22 with reprex v2.0.2