Home > Software design >  Loop and Add to a Data frame while Web Scraping with R
Loop and Add to a Data frame while Web Scraping with R

Time:01-30

I was able to extract the data from the first page of IMDb action movies with the following code:

# Download and parse the first page (start=1) of the IMDb action-movie search.
movies <- read_html(
  "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=action&sort=user_rating,desc&start=1&ref_=adv_nxt"
)


# Convert the parsed results page into a plain data frame with one row per
# movie card. Every column is the text of the first node matching a CSS
# selector inside the card; the sample output shows NA where a card has no
# such node.
movies <- data.frame(
  map_dfr(
    html_elements(movies, ".lister-item-content"), # the cards
    function(card) {
      # Text of the first node inside `card` that matches `css`.
      grab <- function(css) html_text2(html_element(card, css))

      # Iterate through the cards and grab the elements.
      tibble(
        title = grab(".lister-item-header a"),
        year = grab(".text-muted.unbold"),
        certificate = grab(".certificate"),
        runtime = grab(".runtime"),
        genre = grab(".genre"),
        rating = grab(".ratings-imdb-rating strong"),
        metascore = grab(".ratings-metascore"),
        # The three selectors below previously had a double space where the
        # adjacent-sibling combinator `+` evidently got lost; read as
        # descendant selectors they cannot yield the synopsis / director /
        # gross values these columns are meant to hold.
        synopsis = grab(".ratings-bar + .text-muted"),
        director = grab(".text-muted + p a:nth-child(1)"),
        votes = grab(".sort-num_votes-visible span:nth-child(2)"),
        gross = grab(".ghost ~ .text-muted + span"),
        cast1 = grab('a[href*="adv_li_st_0"]'),
        cast2 = grab('a[href*="adv_li_st_1"]'),
        cast3 = grab('a[href*="adv_li_st_2"]'),
        cast4 = grab('a[href*="adv_li_st_3"]')
      )
    }
  )
)


With the above code I am getting the result of the first 50 movies (page 1).

How can I loop this over all pages (e.g., 200 pages) and get the combined result as a single data frame?

CodePudding user response:

library(tidyverse)
library(rvest)

#' Scrape one page of IMDb action-movie search results.
#'
#' Prints a progress line, downloads the search page whose `start` query
#' parameter equals `index`, and extracts one row per result card
#' (`.lister-item-content`).
#'
#' @param index Value substituted into the `start=` query parameter of the
#'   search URL (the caller passes 1, 51, 101, ...).
#' @return A tibble with character columns `title`, `year`, `certificate`,
#'   `runtime`, `genre`, `rating`, `metascore`, `synopsis`, `director`,
#'   `votes`, `gross`, `cast1`, `cast2`, `cast3`, `cast4`; a field that is
#'   absent from a card is `NA`.
get_imdb <- function(index) {
  cat("Scraping index:", index, "\n")

  # Output column -> CSS selector, evaluated relative to each result card.
  selectors <- c(
    title = ".lister-item-header a",
    year = ".text-muted.unbold",
    certificate = ".certificate",
    runtime = ".runtime",
    genre = ".genre",
    rating = ".ratings-imdb-rating strong",
    metascore = ".ratings-metascore",
    # The `+` (adjacent-sibling) combinators in synopsis, director and gross
    # had evidently been dropped, leaving a double space; as descendant
    # selectors they cannot produce the values these columns are meant to hold.
    synopsis = ".ratings-bar + .text-muted",
    director = ".text-muted + p a:nth-child(1)",
    votes = ".sort-num_votes-visible span:nth-child(2)",
    gross = ".ghost ~ .text-muted + span",
    cast1 = 'a[href*="adv_li_st_0"]',
    cast2 = 'a[href*="adv_li_st_1"]',
    cast3 = 'a[href*="adv_li_st_2"]',
    cast4 = 'a[href*="adv_li_st_3"]'
  )

  cards <-
    str_c(
      "https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=action&sort=user_rating,desc&start=",
      index,
      "&ref_=adv_nxt"
    ) %>%
    read_html() %>%
    html_elements(".lister-item-content") # the cards

  # html_element() on a node set yields one (possibly missing) match per card,
  # so each selector produces a whole column at once. Unlike building a tibble
  # per card, a page without cards should still give all 15 columns (0 rows).
  selectors %>%
    map(function(css) html_text2(html_element(cards, css))) %>%
    as_tibble()
}

# Scrape every results page (start offsets 1, 51, ..., 1701) and row-bind the
# per-page tibbles into one data frame.
df <- seq(1, 1711, by = 50) %>%
  map_dfr(get_imdb)

   title      year  certificate runtime genre rating metascore synopsis director votes gross
   <chr>      <chr> <chr>       <chr>   <chr> <chr>  <chr>     <chr>    <chr>    <chr> <chr>
 1 The Dark … (200… 15          152 min Acti… 9.0    84 Metas… "When t… Christo… 2,66… $534…
 2 Ringenes … (200… 12          201 min Acti… 9.0    94 Metas… "Gandal… Peter J… 1,85… $377…
 3 Inception  (201… 15          148 min Acti… 8.8    74 Metas… "A thie… Christo… 2,36… $292…
 4 Ringenes … (200… 12          178 min Acti… 8.8    92 Metas… "A meek… Peter J… 1,88… $315…
 5 Ringenes … (200… 12          179 min Acti… 8.8    87 Metas… "While … Peter J… 1,67… $342…
 6 The Matrix (199… 15          136 min Acti… 8.7    73 Metas… "When a… Lana Wa… 1,92… $171…
 7 Star Wars… (198… 9           124 min Acti… 8.7    82 Metas… "After … Irvin K… 1,29… $290…
 8 Soorarai … (202… NA          153 min Acti… 8.7    NA        "Neduma… Sudha K… 117,… NA   
 9 Stjernekr… (197… 11          121 min Acti… 8.6    90 Metas… "Luke S… George … 1,37… $322…
10 Terminato… (199… 15          137 min Acti… 8.6    75 Metas… "A cybo… James C… 1,10… $204…
  • Related