Home > database >  Webscraping between two tags
Webscraping between two tags

Time:07-27

I am trying to scrape the following page(s):

https://mywebsite.com

In particular, I would like to get the name of each entry. I noticed that the text I am interested in (MY TEXT) always sits between these two tags: <div > <a href="your text"> MY TEXT </a>

I know how to search for these tags individually:

#load libraries 
library(rvest)
library(httr)
library(XML)
library(rvest)

# Download and parse the page once; both options below reuse `page`.
url <- "https://www.mywebsite.com"
page <- read_html(url)

# Option 1: text of every <title> node, split on the regex "\\n".
b <- html_nodes(page, "title")

option1 <- strsplit(html_text(b), "\\n")

# Option 2: text of every <a> node, split on the regex "\\n".
b <- html_nodes(page, "a")

option2 <- strsplit(html_text(b), "\\n")

Is there some way I could specify the "html_nodes" argument so that it picks up "MY TEXT", i.e. scrapes the text between <div > and </a>:

 <div > <a href="your text"> MY TEXT </a>

CodePudding user response:

Scraping of pages 1:10

library(tidyverse)
library(rvest)

# Scrape one results page of the dentistsearch.ca doctor search.
#
# page_n: page number spliced into the search URL.
#
# Returns a tibble with three columns:
#   title  - text of the elements matching ".title a"
#   adress - text of the elements matching ".marker" (column name kept as is)
#   page   - the page number that was passed in
get_dental <- function(page_n) {
  # Progress message so a multi-page run shows which page is being fetched.
  cat("Scraping page ", page_n, "\n")

  search_url <- paste0(
    "https://www.dentistsearch.ca/search-doctor/",
    page_n,
    "?category=0&services=0&province=55&city=&k="
  )
  doc <- read_html(search_url)

  titles <- html_text2(html_elements(doc, ".title a"))
  addresses <- html_text2(html_elements(doc, ".marker"))

  tibble(title = titles, adress = addresses, page = page_n)
}

# Scrape pages 1 to 10 and row-bind the per-page tibbles into one.
df <- map_dfr(seq_len(10), get_dental)

# A tibble: 90 x 3
   title                              adress                page
   <chr>                              <chr>                <int>
 1 Altima Dental Centres              201-535 Bloor St W,~     1
 2 Dawson Dental Centre               100A-120 Bloor St E~     1
 3 Pape Dental Centre                 759 Pape Ave, Toron~     1
 4 Pape Finch Dental                  207-717 Pape Ave, T~     1
 5 Bloor SMile Dental - Dr Irma Kacur 2436A Bloor St W, T~     1
 6 Midtown Dental Centre              20 Bloor St E, Toro~     1
 7 Bloor West Smiles                  505-2 Jane St, Toro~     1
 8 DentalHouse                        28 Joe Shuster Way,~     1
 9 The Village Dentist                750 Annette St, Tor~     1
10 Commerce Court Dental Centre       199 Bay St, Toronto~     2
# ... with 80 more rows

CodePudding user response:

You can use the xpath argument of html_elements to locate each <a> tag that sits directly inside a div with class "title".

Here's a complete reproducible example.

library(rvest)

# Build the URL of results page 2, parse it, and return the text of every
# <a> that is a direct child of a <div> whose class attribute is "title".
paste0(
  "https://www.dentistsearch.ca/search-doctor/",
  "2?category=0&services=0&province=55&city=&k="
) %>%
  read_html() %>%
  html_elements(xpath = "//div[@class='title']/a") %>%
  html_text()
#> [1] "Commerce Court Dental Centre"                     
#> [2] "Dental Emergency Services"                        
#> [3] "DentalHouse"                                      
#> [4] "Dr Jeff Shnall Beech Dental"                      
#> [5] "Dr Marvin Obar"                                   
#> [6] "Dr Louis Vandersluis"                             
#> [7] "Orthodontics on Danforth"                         
#> [8] "Toronto Beaches Dentist"                          
#> [9] "Smilecare Midtown Dentistry by Dr Koo & Dr Newman"

Or to get all entries on the first 10 pages:

library(rvest)

# Apply the same XPath to results pages 1 to 10 and flatten the per-page
# character vectors into a single vector.
unlist(lapply(1:10, function(page) {
  page_url <- paste0(
    "https://www.dentistsearch.ca/search-doctor/",
    page,
    "?category=0&services=0&province=55&city=&k="
  )

  page_url %>%
    read_html() %>%
    html_elements(xpath = "//div[@class='title']/a") %>%
    html_text()
}))
#>  [1] "Altima Dental Centres"                            
#>  [2] "Dawson Dental Centre"                             
#>  [3] "Pape Dental Centre"                               
#>  [4] "Pape Finch Dental"                                
#>  [5] "Bloor SMile Dental - Dr Irma Kacur"               
#>  [6] "Midtown Dental Centre"                            
#>  [7] "Bloor West Smiles"                                
#>  [8] "DentalHouse"                                      
#>  [9] "The Village Dentist"                              
#> [10] "Commerce Court Dental Centre"                     
#> [11] "Dental Emergency Services"                        
#> [12] "DentalHouse"                                      
#> [13] "Dr Jeff Shnall Beech Dental"                      
#> [14] "Dr Marvin Obar"                                   
#> [15] "Dr Louis Vandersluis"                             
#> [16] "Orthodontics on Danforth"                         
#> [17] "Toronto Beaches Dentist"                          
#> [18] "Smilecare Midtown Dentistry by Dr Koo & Dr Newman"
#> [19] "Dr Mohsen F Assaad"                               
#> [20] "Dr Sterling M"                                    
#> [21] "Strugurescu M Dr"                                 
#> [22] "Eglinton West Dental"                             
#> [23] "The Art of Dentistry"                             
#> [24] "Royal Bank Plaza Dental Centre"                   
#> [25] "Sturm Judy Dr"                                    
#> [26] "City Oasis Dental"                                
#> [27] "Dr Monica Cobzac"                                 
#> [28] "Solomon Jeff Dr"                                  
#> [29] "Dr Charles Goldberg"                              
#> [30] "Train Leslie Dr"                                  
#> [31] "Dr Richard Styka"                                 
#> [32] "Dr. M. L. Bygrave"                                
#> [33] "Karim Minaz Dr"                                   
#> [34] "Kodama Randall Dr"                                
#> [35] "St-Clair Perio"                                   
#> [36] "Magus Teresa Dr"                                  
#> [37] "Dr. Simeon R. Minkov DDS"                         
#> [38] "Dentistry on Danforth"                            
#> [39] "Anthony Lee"                                      
#> [40] "Curity Dental Care"                               
#> [41] "Granowska & Bailey Dental"                        
#> [42] "Train Leslie Dr"                                  
#> [43] "Steven Stern"                                     
#> [44] "Dr Mark J Litvack"                                
#> [45] "Dr. Angelos Metaxas"                              
#> [46] "Dr Paul M Piccininni"                             
#> [47] "Endodontic Specialists"                           
#> [48] "Bathurst Walk In Dental Centre"                   
#> [49] "Cabbagetown Dental Centre"                        
#> [50] "The Plaza Dental Centre"                          
#> [51] "Yonge Eglinton Periodontics & Implant Surgery"    
#> [52] "Finkelstein Dr Norman"                            
#> [53] "Endodontic Specialists"                           
#> [54] "Dagys Arlene P Dr"                                
#> [55] "Dr Robert Elia"                                   
#> [56] "Royal Bank Plaza Dental Centre"                   
#> [57] "Bob Adler Dentistry Professional Corp Dr"         
#> [58] "Nguyen Thuy Dr"                                   
#> [59] "Dr Mark Nusinoff"                                 
#> [60] "Dentistry @1881"                                  
#> [61] "Speers R D Dr"                                    
#> [62] "Dr Brian Friedman"                                
#> [63] "Dr Paul Wright"                                   
#> [64] "Cedarbrae & Danforth Orthodontic Centres"         
#> [65] "Forest Hill Village Orthodontics"                 
#> [66] "Dr A R Banack Orthodontist"                       
#> [67] "Greenbaum N R Dr"                                 
#> [68] "St. Clair Dental Associates"                      
#> [69] "Kostiner Gary Dr"                                 
#> [70] "Dentistry On Wellington"                          
#> [71] "Dr Jennifer Kotzer"                               
#> [72] "Lombard dental Dr. Kerry Lim"                     
#> [73] "Mazzatto Anthony Dr"                              
#> [74] "Dr Violeta Danciu"                                
#> [75] "Monczka Paul S Dr"                                
#> [76] "Dr. S. B. Kazim"                                  
#> [77] "Jarvis Krystyna Dr"                               
#> [78] "City Oasis Dental"                                
#> [79] "Arch Liberty Village Dental"                      
#> [80] "Dr Raffi Aynaciyan"                               
#> [81] "Pulec B & Associates Dr"                          
#> [82] "Bond Street Dental Implants Toronto"              
#> [83] "Grushka Miriam Dr"                                
#> [84] "Dr David Farkouh & Friends"                       
#> [85] "Brian S Friedman DDS"                             
#> [86] "Balsam Dental"                                    
#> [87] "Niwong Susanne Dr Orthodontist"                   
#> [88] "Dr. Green & Dr. Solomon"                          
#> [89] "Dr Yolanda C Cruz"                                
#> [90] "Cadas Chris P Dr"

Created on 2022-07-26 by the reprex package (v2.0.1)

  • Related