I do not know how to scrape AJAX pages. There is no pagination on the website; more results are loaded by clicking the "Load more" button.
This is the page link: https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false
import scrapy
from scrapy.http import Request
from selenium import webdriver
from scrapy_selenium import SeleniumRequest
import pandas as pd
class TestSpider(scrapy.Spider):
    """Scrape exhibitor details starting from the Selenium-rendered gallery page.

    The gallery page is fetched through ``SeleniumRequest``; every exhibitor
    link found in that response is then followed with a plain ``Request`` and
    parsed by :meth:`parse_book`.

    NOTE(review): nothing here interacts with the page after it loads, so
    only the links present in the initial rendered response are followed.
    """

    name = 'test'

    def start_requests(self):
        """Yield the single Selenium-backed request for the gallery page."""
        yield SeleniumRequest(
            url="https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false",
            wait_time=3,
            screenshot=True,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        """Follow every exhibitor link found in the gallery response."""
        books = response.xpath("//h3[@class='card-Title\nbreak-word\nf3\nmb1\nmt0']//a//@href").extract()
        for book in books:
            # Links may be relative, so resolve them against the page URL.
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        """Extract title, address, website and phone from an exhibitor page.

        Yields one dict per page. A field whose selector matches nothing is
        emitted as ``None`` instead of raising.
        """
        title = response.css(".mr3-m::text").get()
        address = response.css(".showcase-address::text").get()
        website = response.xpath("//li[@class='dib ml3 mr3']//a[starts-with(@href, 'http')]/@href").get()
        phone = response.xpath("//li[@class='dib ml3 mr3'] //span[contains(text(), 'Phone:')]/following-sibling::text()").get()

        # ``.get()`` returns None when nothing matches; calling ``.strip()``
        # on it unconditionally would raise AttributeError and drop the item.
        if address is not None:
            address = address.strip()
        if website is not None:
            website = website.strip()
        if phone is not None:
            phone = phone.strip().replace("-", "")

        yield {
            'title': title,
            'address': address,
            'website': website,
            'phone': phone
        }
CodePudding user response:
Okay, try the following script to get all the fields you wish to grab from there, traversing the whole exhibitor list:
import scrapy
from scrapy.selector import Selector
class MapYourShowSpider(scrapy.Spider):
    """Scrape exhibitor details through the site's AJAX endpoint.

    One search request is sent to ``content_url``; each hit's ``exhid_l``
    value is used to build a detail URL, and the detail response's
    ``DATA.BODYHTML`` field is parsed as HTML.
    """

    name = "mapyourshow"
    content_url = 'https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm'
    inner_base = 'https://aaos22.mapyourshow.com/8_0/exhibitor/exhibitor-details.cfm?exhid={}'

    # Sent with both the search request and every detail request.
    headers = {
        'x-requested-with': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }

    # Query parameters for the search request.
    # NOTE(review): 'searchsize' is presumably the total number of exhibitors
    # to return in one response -- confirm against the site.
    params = {
        'action': 'search',
        'searchtype': 'exhibitorgallery',
        'searchsize': '557',
        'start': '0',
    }

    def start_requests(self):
        """Yield the single GET request that returns the exhibitor list."""
        yield scrapy.FormRequest(
            url=self.content_url,
            method='GET',
            headers=self.headers,
            formdata=self.params,
            callback=self.parse,
        )

    def parse(self, response):
        """Yield one detail request per exhibitor hit in the JSON response."""
        for item in response.json()['DATA']['results']['exhibitor']['hit']:
            inner_link = self.inner_base.format(item['fields']['exhid_l'])
            yield scrapy.Request(
                url=inner_link,
                headers=self.headers,
                callback=self.parse_content,
            )

    def parse_content(self, response):
        """Extract title, address, website and phone from a detail response.

        The response is read as JSON and its ``DATA.BODYHTML`` value is
        parsed with a ``Selector``. Yields one dict per exhibitor.
        """
        elem = response.json()['DATA']['BODYHTML']
        sel = Selector(text=elem)
        title = sel.css("h2::text").get()
        # ``getall()`` always returns a list of strings (empty when nothing
        # matches), so no exception handling is needed: a missing address
        # simply becomes "". Whitespace inside each text node is collapsed.
        address = ' '.join(
            ' '.join(text.split())
            for text in sel.css("p.showcase-address::text").getall()
        )
        website = sel.css("a[title*='website']::text").get()
        phone = sel.xpath("normalize-space(//*[starts-with(@class,'showcase-web-phone')]/li[./*[.='Phone:']]/span/following::text())").get()
        yield {"title": title, "address": address, "website": website, "phone": phone}
CodePudding user response:
I have not used your code and rather did it my own way (because I'm not a huge fan of Selenium). But I hope this helps anyway:
import requests
import json
import time
from bs4 import BeautifulSoup
import re
# Fetch the exhibitor list from the AJAX endpoint, then visit every exhibitor
# detail page and print its title, address, website and phone number.

# Seconds to wait for a server response before giving up, so a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT = 30

headers = {
    'x-requested-with': 'XMLHttpRequest',
}
params = {
    'action': 'search',
    'searchtype': 'exhibitorgallery',
    'searchsize': '200',  # don't increase this too much (increase the start parameter instead and send a new request after some delay)
    'start': '0',
}

response = requests.get(
    'https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm',
    params=params,
    headers=headers,
    timeout=REQUEST_TIMEOUT,
)
# Fail with a clear HTTP error instead of an obscure JSON decoding error.
response.raise_for_status()
data = response.json()

# Build the detail-page URL for every exhibitor hit.
all_sites = [
    f"https://aaos22.mapyourshow.com/8_0/exhibitor/exhibitor-details.cfm?exhid={hit['fields']['exhid_l']}"
    for hit in data["DATA"]["results"]["exhibitor"]["hit"]
]

for site in all_sites:
    response = requests.get(site, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, "html.parser")
    info_box = soup.find("div", {"id": "showroomContentDiv"})

    # First line of the description section, minus its first six characters.
    title = info_box.find("section", {"id": "scroll-description"}).text.strip().split("\n")[0][6:]
    # Collapse all runs of whitespace in the address into single spaces.
    address = " ".join(info_box.find("p", {"class": "showcase-address"}).text.strip().split())

    # Look the contact list up once; tolerate exhibitors whose list is
    # missing or has fewer than two entries instead of crashing mid-crawl.
    contact_list = info_box.find("ul", {"class": "showcase-web-phone"})
    contact_items = contact_list.find_all("li") if contact_list else []
    website = contact_items[0].text.strip() if contact_items else ""
    # The first seven characters of the second entry are dropped.
    # NOTE(review): presumably this removes a "Phone:" label -- confirm.
    phone = contact_items[1].text[7:].strip() if len(contact_items) > 1 else ""

    print(title)
    print(address)
    print(website)
    print(phone)

    # delay so you don't create too much traffic
    time.sleep(1)