I am new to R and I am hoping to get some help. I am trying to scrape data from a website about dog breeds.
The link to the list of breeds is here: https://dogtime.com/dog-breeds/profiles
The URL for each individual breed profile has a base of https://dogtime.com/dog-breeds/ with the breed name then added on (e.g., https://dogtime.com/dog-breeds/golden-retriever).
I have successfully scraped the data for one breed using the code below, but I would now like to collect the data for all 392 breeds on the site and store the results in a data frame.
library(rvest)
library(dplyr)
library(purrr)
# Scrape the characteristic ratings and vital stats of a single breed profile
# (golden retriever) and collect them in a one-breed data frame.

# Create a vector of URLs
# The breed links live on the index page, and `page` is only created further
# down (for a single breed), so the index page is read into its own object.
# NOTE(review): assumes the ".list-item-title" nodes themselves carry an href
# attribute -- confirm against the live page.
profiles_page <- read_html("https://dogtime.com/dog-breeds/profiles")
dog_links <- profiles_page %>%
  html_nodes(".list-item-title") %>%
  html_attr("href")

# Create a new variable for the website link
link <- "https://dogtime.com/dog-breeds/golden-retriever"

# Get HTML code from this website
page <- read_html(link)

# Create variables for each of the attributes
breed <- page %>%
  html_nodes("h1") %>%
  html_text()

# Adaptability section
adaptability <- page %>%
  html_nodes(".title-box .paws .parent-characteristic .characteristic-star-block") %>%
  html_text()
apartment_living <- page %>%
  html_nodes(".title-box .paws .parent-characteristic .child-characteristic .characteristic-star-block") %>%
  html_text()
novice_owners <- page %>%
  html_nodes(".title-box .paws .child-characteristic:nth-child(3) .characteristic-star-block") %>%
  html_text()
sensitivity_level <- page %>%
  html_nodes(".title-box .paws .child-characteristic:nth-child(4) .characteristic-star-block") %>%
  html_text()
tolerates_alone <- page %>%
  html_nodes(".title-box .paws .child-characteristic:nth-child(5) .characteristic-star-block") %>%
  html_text()
tolerates_cold <- page %>%
  html_nodes(".title-box .paws .child-characteristic:nth-child(6) .characteristic-star-block") %>%
  html_text()
tolerates_hot <- page %>%
  html_nodes(".title-box .paws .child-characteristic:nth-child(7) .characteristic-star-block") %>%
  html_text()

# Friendliness section
friendliness <- page %>%
  html_nodes(".paws:nth-child(3) .parent-characteristic .characteristic-star-block") %>%
  html_text()
affectionate <- page %>%
  html_nodes(".paws:nth-child(3) .parent-characteristic .child-characteristic .characteristic-star-block") %>%
  html_text()
kid_friendly <- page %>%
  html_nodes(".paws:nth-child(3) .child-characteristic:nth-child(3) .characteristic-star-block") %>%
  html_text()
dog_friendly <- page %>%
  html_nodes(".paws:nth-child(3) .child-characteristic:nth-child(4) .characteristic-star-block") %>%
  html_text()
stranger_friendly <- page %>%
  html_nodes(".paws:nth-child(3) .child-characteristic:nth-child(5) .characteristic-star-block") %>%
  html_text()

# Health and grooming section
health_grooming <- page %>%
  html_nodes(".paws:nth-child(4) .parent-characteristic .characteristic-star-block") %>%
  html_text()
shedding <- page %>%
  html_nodes(".paws:nth-child(4) .parent-characteristic .child-characteristic .characteristic-star-block") %>%
  html_text()
drooling <- page %>%
  html_nodes(".paws:nth-child(4) .child-characteristic:nth-child(3) .characteristic-star-block") %>%
  html_text()
easy_groom <- page %>%
  html_nodes(".paws:nth-child(4) .child-characteristic:nth-child(4) .characteristic-star-block") %>%
  html_text()
general_health <- page %>%
  html_nodes(".paws:nth-child(4) .child-characteristic:nth-child(5) .characteristic-star-block") %>%
  html_text()
weight_gain <- page %>%
  html_nodes(".paws:nth-child(4) .child-characteristic:nth-child(6) .characteristic-star-block") %>%
  html_text()
size <- page %>%
  html_nodes(".paws:nth-child(4) .child-characteristic:nth-child(7) .characteristic-star-block") %>%
  html_text()

# Trainability section
trainability <- page %>%
  html_nodes("#cf_hagn .paws .parent-characteristic .characteristic-star-block") %>%
  html_text()
easy_train <- page %>%
  html_nodes("#cf_hagn .paws .parent-characteristic .child-characteristic .characteristic-star-block") %>%
  html_text()
intelligence <- page %>%
  html_nodes("#cf_hagn .paws .child-characteristic:nth-child(3) .characteristic-star-block") %>%
  html_text()
mouthiness <- page %>%
  html_nodes("#cf_hagn .paws .child-characteristic:nth-child(4) .characteristic-star-block") %>%
  html_text()
prey_drive <- page %>%
  html_nodes("#cf_hagn .paws .child-characteristic:nth-child(5) .characteristic-star-block") %>%
  html_text()
barking <- page %>%
  html_nodes("#cf_hagn .paws .child-characteristic:nth-child(6) .characteristic-star-block") %>%
  html_text()
wanderlust <- page %>%
  html_nodes("#cf_hagn .paws .child-characteristic:nth-child(7) .characteristic-star-block") %>%
  html_text()

# Physical needs section
physical_needs <- page %>%
  html_nodes("#cf_hagn~ .paws .paws .parent-characteristic .characteristic-star-block") %>%
  html_text()
energy_level <- page %>%
  html_nodes("#cf_hagn~ .paws .paws .parent-characteristic .child-characteristic .characteristic-star-block") %>%
  html_text()
intensity <- page %>%
  html_nodes("#cf_hagn~ .paws .paws .child-characteristic:nth-child(3) .characteristic-star-block") %>%
  html_text()
exercise_needs <- page %>%
  html_nodes("#cf_hagn~ .paws .paws .child-characteristic:nth-child(4) .characteristic-star-block") %>%
  html_text()
playfulness <- page %>%
  html_nodes("#cf_hagn~ .paws .paws .child-characteristic:nth-child(5) .characteristic-star-block") %>%
  html_text()

# Vital stats
breed_group <- page %>%
  html_nodes(".vital-stat-box:nth-child(1)") %>%
  html_text()
height <- page %>%
  html_nodes(".vital-stat-box:nth-child(2)") %>%
  html_text()
weight <- page %>%
  html_nodes(".vital-stat-box:nth-child(3)") %>%
  html_text()
life_span <- page %>%
  html_nodes(".vital-stat-box:nth-child(4)") %>%
  html_text()

# Create a data frame
dogs <- data.frame(
  breed, adaptability, apartment_living, novice_owners, sensitivity_level,
  tolerates_alone, tolerates_cold, tolerates_hot, friendliness, affectionate,
  kid_friendly, dog_friendly, stranger_friendly, health_grooming, shedding,
  drooling, easy_groom, general_health, weight_gain, size, trainability,
  easy_train, intelligence, mouthiness, prey_drive, barking, wanderlust,
  physical_needs, energy_level, intensity, exercise_needs, playfulness,
  breed_group, height, weight, life_span,
  stringsAsFactors = FALSE
)

# view data frame
View(dogs)
Sorry, there are quite a few variables to store in the code. I imagine that I will need to use a for loop to run through each different url for the individual breeds but I am not sure how I would write this given the 'i' values are characters and not numbers.
Can anyone advise whether this is the best method and, if so, how I would achieve this?
Many thanks in advance for your help,
James
CodePudding user response:
If you are happy with the code for the golden retriever, this will give you a character vector of all dogs:
# Collect the text of every ".list-item" node on `page` as a character vector.
# NOTE(review): presumably `page` should hold the breed index page here --
# confirm before running.
dogs <- html_text(html_nodes(page, ".list-item"))
which you can then paste as follows:
dog_urls <- paste0("https://dogtime.com/dog-breeds/" , dogs)
You can then use your existing code in a loop to recover all of the dogs.
CodePudding user response:
We can just use html_attr('href')
to get the links for all the dog breeds,
library(rvest)
library(dplyr)
# Read the breed index page and extract the href attribute of every
# ".list-item-img" node, which yields the profile URL of each breed.
url <- "https://dogtime.com/dog-breeds/profiles"
url %>%
  read_html() %>%
  html_nodes(".list-item-img") %>%
  html_attr("href")
Output
[1] "https://dogtime.com/dog-breeds/afador"
[2] "https://dogtime.com/dog-breeds/affenhuahua"
[3] "https://dogtime.com/dog-breeds/affenpinscher"
[4] "https://dogtime.com/dog-breeds/afghan-hound"
[5] "https://dogtime.com/dog-breeds/airedale-terrier"
You can loop over the links with your code.
Furthermore, I would suggest selecting nodes by their shared class
to get the data, as it reduces a large chunk of code to a small one:
# Fetch one breed profile and print the text of every characteristic title.
url <- "https://dogtime.com/dog-breeds/golden-retriever"
html_text(
  html_nodes(read_html(url), ".characteristic-title")
)
[1] " Adaptability" "Adapts Well To Apartment Living" "Good For Novice Owners"
[4] "Sensitivity Level" "Tolerates Being Alone" "Tolerates Cold Weather"
[7] "Tolerates Hot Weather" " All Around Friendliness" "Affectionate With Family"
[10] "Kid-Friendly" "Dog Friendly" "Friendly Toward Strangers"
[13] " Health And Grooming Needs" "Amount Of Shedding" "Drooling Potential"
[16] "Easy To Groom" "General Health" "Potential For Weight Gain"
[19] "Size" " Trainability" "Easy To Train"
[22] "Intelligence" "Potential For Mouthiness" "Prey Drive"
[25] "Tendency To Bark Or Howl" "Wanderlust Potential" " Physical Needs"
[28] "Energy Level" "Intensity" "Exercise Needs"
[31] "Potential For Playfulness"
# Print the text of each ".star" node nested inside a
# ".characteristic-star-block" on the page at `url`.
star_blocks <- html_nodes(read_html(url), ".characteristic-star-block")
html_text(html_nodes(star_blocks, ".star"))
[1] "" "2" "3" "5" "1" "3" "3" "" "5" "5" "5" "5" "" "5" "4" "2" "2" "5" "3" "" "5" "5" "5" "3" "3" "2" "" "5" "2" "5" "5"