I am trying to collect some information from https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId= using Python Selenium. The details are within a div tag within a p tag, and the div tag is activated only when we click on the p tag. I am getting information from the first p tag but cannot iterate through the next p tags: it only selects the first p tag and does not collect data from the others. Also, is it possible to find the number of pages so that I can iterate to the end?
import time

import requests
from bs4 import BeautifulSoup as bs
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from seleniumwire import webdriver
# Scrape artist-management details for one category with Selenium.
#
# Each artist is a <p class="expand-heading"> whose detail panel only becomes
# visible (and therefore readable through `.text`) after the heading is clicked.
url = 'https://www.classicalmusicartists.com/cma/artists.aspx'
HEADINGS_XPATH = "//div[@class='artists-by-category']/div/p[@class='expand-heading']"


def _first_text(parent, xpath):
    """Return the text of the first element matching *xpath* under *parent*, or ''.

    Uses find_elements so that a missing field (e.g. no e-mail) yields an empty
    string instead of raising NoSuchElementException.
    """
    matches = parent.find_elements("xpath", xpath)
    return matches[0].text if matches else ""


options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(executable_path = '/home/ubuntu/selenium_drivers/chromedriver', options = options)
try:
    driver.get(url)
    driver.implicitly_wait(2)

    # Choose the category in the search form and submit it.
    category_select = Select(driver.find_element(By.ID, "ctl00_cphMainContent_lstCategory"))
    category_select.select_by_index(6)
    driver.find_element(By.ID, "ctl00_cphMainContent_btnSearch").click()

    previous_first = None
    while True:
        time.sleep(1)  # give the result page time to render
        headings = driver.find_elements("xpath", HEADINGS_XPATH)
        first_name = headings[0].text if headings else None
        # Stop on an empty page, or when "Next" did not actually advance
        # (guards against looping forever on the last page).
        if not headings or first_name == previous_first:
            break
        previous_first = first_name

        for x in headings:
            # Click THIS heading; the original always clicked the first one on the page.
            x.click()
            time.sleep(0.5)  # let the detail panel expand before reading text
            name = x.text
            # Look up the panel relative to the current heading. A leading "//"
            # searches the whole document even when called on an element.
            # NOTE(review): assumes the panel is the next sibling
            # <div class="expand"> of the heading - confirm against the page.
            panels = x.find_elements("xpath", "./following-sibling::div[@class='expand'][1]")
            if not panels:
                continue
            details = panels[0]
            title = details.text  # full panel text, as in the original script
            manager_name = _first_text(details, ".//div[@class='artist-management-manager']")
            country = _first_text(details, ".//div[@class='artist-management-countries']")
            category = _first_text(details, ".//div[@class='artist-management-categories']")
            contact_num = _first_text(details, ".//div[@class='artist-management-telephone']")
            email = _first_text(details, ".//div[@class='artist-management-email']")
            website = _first_text(details, ".//div[@class='artist-management-website']")
            # The slices drop the fixed label prefix in front of each value.
            print(name, "\n",title,"\n", manager_name,"\n", country[9:],"\n", category[10:],"\n",
                  contact_num[3:],"\n", email[3:],"\n", website[3:])

        # Follow the pager until there is no "Next" link left.
        next_links = driver.find_elements(By.LINK_TEXT, "Next")
        if not next_links:
            break
        next_links[0].click()
finally:
    # Always release the browser, even if scraping fails part-way.
    driver.quit()
CodePudding user response:
A solution using Scrapy, in a more elegant way.
The webpage isn't dynamic, meaning all the required data is already in the static HTML DOM.
I've built the pagination into the start URLs using
the range function and a for loop.
Working code as an example:
import scrapy
class MusicSpider(scrapy.Spider):
    """Scrape artist management contacts from the paginated category listing.

    Every result page is requested directly via its ``page_num`` query
    parameter, so no browser or clicking is needed.
    """

    name = 'music'
    # One start URL per result page (pages 1..23 for this category).
    start_urls = [
        'https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=' + str(x)
        for x in range(1, 24)
    ]

    def parse(self, response):
        """Yield one dict per artist detail card found on the page.

        NOTE(review): the attribute tests in the XPath expressions were lost
        from the original snippet (they appeared as ``[@]``). The class names
        below are reconstructed from the page's other known class names;
        verify them against the live HTML.
        """
        for item in response.xpath('//*[@class="expand-heading"]'):
            # The heading text is the artist name; strip non-breaking spaces.
            name = ''.join(item.xpath('.//text()').getall()).strip().replace('\xa0', '')
            for card in item.xpath('.//following-sibling::*'):
                # The next <p> is the next artist's heading: stop here.
                if card.root.tag == "p":
                    break
                # presumably the first <div> of an artist-item holds the manager title - TODO confirm
                title = card.xpath('.//*[@class="artist-item"]/div[1]/text()').get()
                phone = card.xpath('.//*[@class="artist-management-telephone"]/text()').get()
                email = card.xpath('.//*[@class="artist-management-email"]/a/text()').get()
                website = card.xpath('.//*[@class="artist-management-website"]/a/text()').get()
                # Skip siblings that carry none of the wanted fields.
                if title or phone or email or website:
                    yield {
                        'Name': name,
                        'title': title,
                        'phonr': phone,
                        'email': email,
                        'website': website,
                    }
Output:
{'Name': 'STOUT,David(Baritone, Bass, Bass-baritone)', 'title': 'General Manager', 'phonr': ' 44 20 3176 5500', 'email': '[email protected]', 'website': 'www.rayfieldallied.com'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'STOYANOV,Vladimir(Baritone)', 'title': 'General Manager', 'phonr': ' 39 051 455 395', 'email': None, 'website': 'http://www.melosopera.com/en/'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'STRUCKMANN,Falk(Baritone)', 'title': 'General Manager', 'phonr': None, 'email': '[email protected]', 'website': 'www.arsis-artists.com'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'SUART,Richard(Baritone, Bass, Bass-baritone)', 'title': 'General Manager', 'phonr': ' 44 1825 840437', 'email': '[email protected]', 'website': 'www.musichall.uk.com'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'SULIMSKY,Vladislav(Baritone)', 'title': 'General Manager', 'phonr': ' 33 1 4431 0010', 'email': '[email protected]', 'website': 'www.imgartists.com'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'SUMEGI,Daniel(Baritone, Bass, Bass-baritone)', 'title': 'Local Manager', 'phonr': ' 61 411 129 690', 'email': '[email protected]', 'website': 'www.patricktogher.com'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'SUMUEL,Michael(Baritone, Bass, Bass-baritone)', 'title': 'General Manager', 'phonr': ' 1 212 994 3500', 'email': '[email protected]', 'website': 'www.imgartists.com'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'SZOT,Paulo(Baritone)', 'title': 'Local Manager', 'phonr': ' 33 (0) 9 77 80 22 43', 'email': None, 'website': 'https://backstage-opera.eu/'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'TANOVITSKI,Alexeï(Baritone, Bass, Bass-baritone)', 'title': 'Regional Manager', 'phonr': ' 33 1 4234 5347', 'email': '[email protected]', 'website': 'www.musicaglotz.com'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'TERFEL,Bryn(Baritone, Bass, Bass-baritone)', 'title': 'General Manager', 'phonr': ' 44 29 2075 0821', 'email': '[email protected]', 'website': 'www.harlequin-agency.co.uk'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'TÉZIER,Ludovic (Baritone)', 'title': 'General Manager', 'phonr': ' 49 89 290 7470', 'email': '[email protected]', 'website': 'www.hilbert.de'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'TÉZIER,Ludovic(Baritone, Bass-baritone)', 'title': 'General Manager', 'phonr': ' 49 89 290 7470', 'email': '[email protected]', 'website': 'www.hilbert.de'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'THATCHER,Harry(Baritone)', 'title': 'General Manager', 'phonr': ' 07720773910', 'email': None, 'website':
'www.stevephillipsmanagement.co.uk'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'THIRION,Ivan(Baritone)', 'title': 'General Manager', 'phonr': ' 32 9 330 3990', 'email': '[email protected]', 'website': 'www.arien-artists.com'}
2022-09-24 17:43:53 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.classicalmusicartists.com/cma/artists.aspx?Artist=&lstCategory=151&selectedArtistId=&page_num=21>
{'Name': 'TIBBETTS,John(Baritone)', 'title': 'General Manager', 'phonr': ' 1 617 651 4600', 'email': None, 'website': 'www.athloneartists.com'}
2022-09-24 17:43:53 [scrapy.core.engine] INFO: Closing spider (finished)
2022-09-24 17:43:53 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 7232,
'downloader/request_count': 23,
'downloader/request_method_count/GET': 23,
'downloader/response_bytes': 1831629,
'downloader/response_count': 23,
'downloader/response_status_count/200': 23,
'elapsed_time_seconds': 13.257796,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 9, 24, 11, 43, 53, 287520),
'item_scraped_count': 457,
CodePudding user response:
Selenium
is not needed, nor is the expanding, because the content is already available in the HTML, just not displayed.
Example
Note: For demonstration purposes I sliced the categories; simply remove the slice to get more results. To also iterate over the pages within each category, simply adapt the approach.
from bs4 import BeautifulSoup
import requests
# Collect artist entries per category straight from the static HTML.
# The detail panels are present in the markup (just hidden), so no browser is needed.
url = 'https://www.classicalmusicartists.com/cma/artists.aspx'
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever


def _text_or_none(node, selector):
    """Return the stripped text of the first *selector* match under *node*, or None."""
    found = node.select_one(selector)
    return found.get_text(strip=True) if found else None


result = requests.get(url, timeout=REQUEST_TIMEOUT)
# Name the parser explicitly so results do not depend on which parsers are installed.
soup = BeautifulSoup(result.text, 'html.parser')
data = []
# Skip the first <option>; the [1:5] slice limits the demo to four categories.
for c in soup.select('#ctl00_cphMainContent_lstCategory option')[1:5]:
    category_page = requests.get(f'{url}?lstCategory={c.get("value")}', timeout=REQUEST_TIMEOUT)
    category_soup = BeautifulSoup(category_page.content, 'html.parser')
    for a in category_soup.select('.artist-item'):
        # The artist name lives in the <p> heading preceding the item.
        heading = a.find_previous('p')
        data.append({
            'name': heading.get_text(strip=True).replace('\xa0','') if heading else None,
            'title': _text_or_none(a, '.artist-management-general-manager-title'),
            'manager': _text_or_none(a, '.artist-management-manager'),
            'and':'whatever is needed'
        })
print(data)
Example output
[{'name': 'AL-SHUBBAK,Tarek(Accompanist, Piano)', 'title': 'General Manager', 'manager': 'Gunnar Strømsholm, Manager', 'and': 'whatever is needed'}, {'name': 'ASTI,Eugene(Accompanist)', 'title': 'General Manager', 'manager': 'Sioned Jones, Artist Manager', 'and': 'whatever is needed'}, {'name': 'BAILLIEU,James(Accompanist)', 'title': 'General Manager', 'manager': 'Isabella Pitman, Associate Artist Manager', 'and': 'whatever is needed'}, {'name': 'BURNSIDE,Iain(Accompanist)', 'title': 'General Manager', 'manager': 'Hannah Bishay, Assistant Artist Manager', 'and': 'whatever is needed'}, {'name': 'BUSHAKEVITZ,Ammiel(Accompanist, Piano)', 'title': None, 'manager': 'John Owen, Director', 'and': 'whatever is needed'}, {'name': 'DOIDGE,David(Accompanist)', 'title': 'General Manager', 'manager': 'Sioned Jones, Artist Manager', 'and': 'whatever is needed'}, {'name': 'DRAKE,Julius(Accompanist)', 'title': 'General Manager', 'manager': 'Ashley Thouret, Artist Manager', 'and': 'whatever is needed'}, {'name': 'GERGIEVA,Larissa(Accompanist)', 'title': 'General Manager', 'manager': 'Mark Hildrew -- Merged Into -- Hildrew, Executive Director', 'and': 'whatever is needed'}, {'name': 'GLYNN,Christopher(Accompanist)', 'title': 'General Manager', 'manager': 'Helen Hogh, Artist Manager', 'and': 'whatever is needed'}, {'name': 'GODIN,Olivier(Accompanist, Piano)', 'title': 'General Manager', 'manager': 'Marie-Catherine LaPointe, President', 'and': 'whatever is needed'}, {'name': 'HOWARD,Jeff(Accompanist)', 'title': 'General Manager', 'manager': 'Rhian Williams, Artist Manager', 'and': 'whatever is needed'}, {'name': 'HUBER,Gerold(Accompanist)', 'title': 'General Manager', 'manager': 'Verena Vetter, Director', 'and': 'whatever is needed'}, {'name': 'ILJA,Ivari(Accompanist)', 'title': 'General Manager', 'manager': 'Paul Meyer zu Schwabedissen, Artist Manager', 'and': 'whatever is needed'}, {'name': 'JOHNSON,Graham(Accompanist)', 'title': 'General Manager', 'manager': 'Hannah Bishay, 
Assistant Artist Manager', 'and': 'whatever is needed'},...]