Home > OS >  Web scraping multiple pages in R
Web scraping multiple pages in R

Time:12-19

I am new to R and I am hoping to get some help. I am trying to scrape data from a website about dog breeds.

The link to the list of breeds is here: https://dogtime.com/dog-breeds/profiles

The URL for each individual breed profile has a base of https://dogtime.com/dog-breeds/ with the breed name then added on (e.g. https://dogtime.com/dog-breeds/golden-retriever).

I have successfully scraped the data for one breed using the code below, but I would now like to collect the data for all 392 breeds on the site and store the results in a data frame.

library(rvest)
library(dplyr)
library(purrr)

# Read the breed list page into its own variable. `page` is only assigned
# further down (from a single breed profile), so it cannot be used here.
breed_list_page <- read_html("https://dogtime.com/dog-breeds/profiles")

# Create a vector of URLs, one per breed profile
# NOTE(review): assumes the `.list-item-title` nodes carry the `href`
# attribute -- confirm against the page markup.
dog_links <- breed_list_page %>%
  html_nodes(".list-item-title") %>%
  html_attr("href")

# URL of the single breed profile that is scraped below
link <- "https://dogtime.com/dog-breeds/golden-retriever"

# Get HTML code from this website
page <- read_html(link)

# Pull the text of each attribute out of the parsed breed profile in `page`.
# Every variable holds the text of the nodes matched by its CSS selector.

# Breed name (page heading)
breed <- html_text(html_nodes(page, "h1"))

# Adaptability and its sub-characteristics
adaptability <- html_text(html_nodes(page, ".title-box  .paws .parent-characteristic .characteristic-star-block"))
apartment_living <- html_text(html_nodes(page, ".title-box  .paws .parent-characteristic  .child-characteristic .characteristic-star-block"))
novice_owners <- html_text(html_nodes(page, ".title-box  .paws .child-characteristic:nth-child(3) .characteristic-star-block"))
sensitivity_level <- html_text(html_nodes(page, ".title-box  .paws .child-characteristic:nth-child(4) .characteristic-star-block"))
tolerates_alone <- html_text(html_nodes(page, ".title-box  .paws .child-characteristic:nth-child(5) .characteristic-star-block"))
tolerates_cold <- html_text(html_nodes(page, ".title-box  .paws .child-characteristic:nth-child(6) .characteristic-star-block"))
tolerates_hot <- html_text(html_nodes(page, ".title-box  .paws .child-characteristic:nth-child(7) .characteristic-star-block"))

# Friendliness and its sub-characteristics
friendliness <- html_text(html_nodes(page, ".paws:nth-child(3) .parent-characteristic .characteristic-star-block"))
affectionate <- html_text(html_nodes(page, ".paws:nth-child(3) .parent-characteristic  .child-characteristic .characteristic-star-block"))
kid_friendly <- html_text(html_nodes(page, ".paws:nth-child(3) .child-characteristic:nth-child(3) .characteristic-star-block"))
dog_friendly <- html_text(html_nodes(page, ".paws:nth-child(3) .child-characteristic:nth-child(4) .characteristic-star-block"))
stranger_friendly <- html_text(html_nodes(page, ".paws:nth-child(3) .child-characteristic:nth-child(5) .characteristic-star-block"))

# Health and grooming and its sub-characteristics
health_grooming <- html_text(html_nodes(page, ".paws:nth-child(4) .parent-characteristic .characteristic-star-block"))
shedding <- html_text(html_nodes(page, ".paws:nth-child(4) .parent-characteristic  .child-characteristic .characteristic-star-block"))
drooling <- html_text(html_nodes(page, ".paws:nth-child(4) .child-characteristic:nth-child(3) .characteristic-star-block"))
easy_groom <- html_text(html_nodes(page, ".paws:nth-child(4) .child-characteristic:nth-child(4) .characteristic-star-block"))
general_health <- html_text(html_nodes(page, ".paws:nth-child(4) .child-characteristic:nth-child(5) .characteristic-star-block"))
weight_gain <- html_text(html_nodes(page, ".paws:nth-child(4) .child-characteristic:nth-child(6) .characteristic-star-block"))
size <- html_text(html_nodes(page, ".paws:nth-child(4) .child-characteristic:nth-child(7) .characteristic-star-block"))

# Trainability and its sub-characteristics
trainability <- html_text(html_nodes(page, "#cf_hagn  .paws .parent-characteristic .characteristic-star-block"))
easy_train <- html_text(html_nodes(page, "#cf_hagn  .paws .parent-characteristic  .child-characteristic .characteristic-star-block"))
intelligence <- html_text(html_nodes(page, "#cf_hagn  .paws .child-characteristic:nth-child(3) .characteristic-star-block"))
mouthiness <- html_text(html_nodes(page, "#cf_hagn  .paws .child-characteristic:nth-child(4) .characteristic-star-block"))
prey_drive <- html_text(html_nodes(page, "#cf_hagn  .paws .child-characteristic:nth-child(5) .characteristic-star-block"))
barking <- html_text(html_nodes(page, "#cf_hagn  .paws .child-characteristic:nth-child(6) .characteristic-star-block"))
wanderlust <- html_text(html_nodes(page, "#cf_hagn  .paws .child-characteristic:nth-child(7) .characteristic-star-block"))

# Physical needs and its sub-characteristics
physical_needs <- html_text(html_nodes(page, "#cf_hagn~ .paws  .paws .parent-characteristic .characteristic-star-block"))
energy_level <- html_text(html_nodes(page, "#cf_hagn~ .paws  .paws .parent-characteristic  .child-characteristic .characteristic-star-block"))
intensity <- html_text(html_nodes(page, "#cf_hagn~ .paws  .paws .child-characteristic:nth-child(3) .characteristic-star-block"))
exercise_needs <- html_text(html_nodes(page, "#cf_hagn~ .paws  .paws .child-characteristic:nth-child(4) .characteristic-star-block"))
playfulness <- html_text(html_nodes(page, "#cf_hagn~ .paws  .paws .child-characteristic:nth-child(5) .characteristic-star-block"))

# Vital statistics boxes
breed_group <- html_text(html_nodes(page, ".vital-stat-box:nth-child(1)"))
height <- html_text(html_nodes(page, ".vital-stat-box:nth-child(2)"))
weight <- html_text(html_nodes(page, ".vital-stat-box:nth-child(3)"))
life_span <- html_text(html_nodes(page, ".vital-stat-box:nth-child(4)"))

# Combine the scraped vectors into a data frame; each column is named after
# the variable it comes from, and strings are kept as character columns.
dogs <- data.frame(
  breed,
  adaptability, apartment_living, novice_owners, sensitivity_level,
  tolerates_alone, tolerates_cold, tolerates_hot,
  friendliness, affectionate, kid_friendly, dog_friendly, stranger_friendly,
  health_grooming, shedding, drooling, easy_groom, general_health,
  weight_gain, size,
  trainability, easy_train, intelligence, mouthiness, prey_drive, barking,
  wanderlust,
  physical_needs, energy_level, intensity, exercise_needs, playfulness,
  breed_group, height, weight, life_span,
  stringsAsFactors = FALSE
)

# Open the data frame in the viewer
View(dogs)

Sorry, there are quite a few variables to store in the code. I imagine that I will need to use a for loop to run through each different url for the individual breeds but I am not sure how I would write this given the 'i' values are characters and not numbers.

Can anyone advise whether this is the best method and, if so, how I would achieve this?

Many thanks in advance for your help,

James

CodePudding user response:

If you are happy with the code for the golden retriever, this will give you a character vector of all dogs:

# Text of every `.list-item` node found in `page`
# NOTE(review): `page` presumably needs to be the parsed breed list page
# rather than a single breed profile -- confirm.
dogs <- html_text(html_nodes(page, ".list-item"))

which you can then paste as follows:

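dog_urls <- paste0("https://dogtime.com/dog-breeds/" , dogs)

You can then use your existing code in a loop over dog_urls to recover all of the dogs. Note that each name has to be in the lower-case, hyphenated form used in the URLs (e.g. golden-retriever) for the pasted link to work.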

CodePudding user response:

We can just use html_attr('href') to get the links for all the dog breeds:

library(rvest)
library(dplyr)

# Address of the page that lists every breed profile
# (the duplicated `url = url = ...` assignment is collapsed to a single one)
url <- "https://dogtime.com/dog-breeds/profiles"

# Parse the list page and print the `href` of each `.list-item-img` node
url %>%
  read_html() %>%
  html_nodes(".list-item-img") %>%
  html_attr("href")

Output

  [1] "https://dogtime.com/dog-breeds/afador"                            
  [2] "https://dogtime.com/dog-breeds/affenhuahua"                       
  [3] "https://dogtime.com/dog-breeds/affenpinscher"                     
  [4] "https://dogtime.com/dog-breeds/afghan-hound"                      
  [5] "https://dogtime.com/dog-breeds/airedale-terrier" 

You can loop over the links with your code.

Furthermore, I would suggest selecting the nodes by class to get the data, as it reduces a large chunk of code to a small one:

# Single breed profile used for the example
url <- "https://dogtime.com/dog-breeds/golden-retriever"

# Parse the profile and print the text of every `.characteristic-title` node
html_text(html_nodes(read_html(url), ".characteristic-title"))
 [1] " Adaptability"                   "Adapts Well To Apartment Living" "Good For Novice Owners"         
 [4] "Sensitivity Level"               "Tolerates Being Alone"           "Tolerates Cold Weather"         
 [7] "Tolerates Hot Weather"           " All Around Friendliness"        "Affectionate With Family"       
[10] "Kid-Friendly"                    "Dog Friendly"                    "Friendly Toward Strangers"      
[13] " Health And Grooming Needs"      "Amount Of Shedding"              "Drooling Potential"             
[16] "Easy To Groom"                   "General Health"                  "Potential For Weight Gain"      
[19] "Size"                            " Trainability"                   "Easy To Train"                  
[22] "Intelligence"                    "Potential For Mouthiness"        "Prey Drive"                     
[25] "Tendency To Bark Or Howl"        "Wanderlust Potential"            " Physical Needs"                
[28] "Energy Level"                    "Intensity"                       "Exercise Needs"                 
[31] "Potential For Playfulness"  



# Parse the page at `url` (assigned earlier) and print the text of each
# `.star` node nested inside a `.characteristic-star-block` node
star_blocks <- html_nodes(read_html(url), ".characteristic-star-block")
html_text(html_nodes(star_blocks, ".star"))

[1] ""  "2" "3" "5" "1" "3" "3" ""  "5" "5" "5" "5" ""  "5" "4" "2" "2" "5" "3" ""  "5" "5" "5" "3" "3" "2" ""  "5" "2" "5" "5"
  • Related