I'm new to web scraping, and I want to scrape the information for all the products on a website.
I've written some sample code to scrape the data, which goes as follows:
import scrapy
import re
import json
import datetime
from bs4 import BeautifulSoup

# These methods live inside my scrapy.Spider subclass;
# TeknosaItem is the project's scrapy.Item class.
def start_requests(self):
    urls = [
        'https://www.trendyol.com/camasir-deterjani-x-c108713',
        'https://www.trendyol.com/yumusaticilar-x-c103814',
        'https://www.trendyol.com/camasir-suyu-x-c103812',
        'https://www.trendyol.com/camasir-leke-cikaricilar-x-c103810',
        'https://www.trendyol.com/camasir-yan-urun-x-c105534',
        'https://www.trendyol.com/kirec-onleyici-x-c103806',
        'https://www.trendyol.com/makine-kirec-onleyici-ve-temizleyici-x-c144512'
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

def parse(self, response):
    soup = BeautifulSoup(response.text, 'lxml')
    # the product data is embedded in the page as a JSON blob
    data = re.search(r"__SEARCH_APP_INITIAL_STATE__=(.*?});", response.text)
    data = json.loads(data.group(1))
    for p in data["products"]:
        item = TeknosaItem()
        item['rowid'] = hash(str(datetime.datetime.now()) + str(p["id"]))
        item['date'] = str(datetime.datetime.now())
        item['listing_id'] = p["id"]
        item['product_id'] = p["id"]
        item['product_name'] = p["name"]
        item['price'] = p["price"]["sellingPrice"]
        item['url'] = p["url"]
        yield item
The code I've written is able to scrape the data for all the products listed on the first page, but as you scroll down, the page loads more data dynamically via Ajax GET requests, and my spider can't reach that data. I've watched some videos and read some articles too, but I wasn't able to figure out how to scrape the data that gets generated dynamically on scrolling. Any help with this will be appreciated.
I found an infinite-page example on the target site:
CodePudding user response:
I don't use Scrapy, but you can adapt the following example, which gets all products from a category using the site's Ajax API:
import requests

categories = [
    "camasir-deterjani-x-c108713",
    "yumusaticilar-x-c103814",
    "camasir-suyu-x-c103812",
    "camasir-leke-cikaricilar-x-c103810",
    "camasir-yan-urun-x-c105534",
    "kirec-onleyici-x-c103806",
    "makine-kirec-onleyici-ve-temizleyici-x-c144512",
]

# Iterate over categories to construct api_url.
# Here I will only get products from the first category
# (see the sketch after the sample output for looping over all of them):
api_url = (
    "https://public.trendyol.com/discovery-web-searchgw-service/v2/api/infinite-scroll/"
    + categories[0]
)

payload = {
    "pi": 1,  # page index
    "culture": "tr-TR",
    "userGenderId": "1",
    "pId": "0",
    "scoringAlgorithmId": "2",
    "categoryRelevancyEnabled": "false",
    "isLegalRequirementConfirmed": "false",
    "searchStrategyType": "DEFAULT",
    "productStampType": "TypeA",
    "fixSlotProductAdsIncluded": "false",
}

page = 1
while True:
    payload["pi"] = page
    data = requests.get(api_url, params=payload).json()

    # an empty product list means we've run out of pages
    if not data["result"]["products"]:
        break

    for p in data["result"]["products"]:
        name = p["name"]
        id_ = p["id"]
        price = p["price"]["sellingPrice"]
        u = p["url"]
        print("{:<10} {:<50} {:<10} {}".format(id_, name[:49], price, u[:60]))

    page += 1
This will get all products from the category:
...
237119563 Organik Sertifikalı Çamaşır Deterjanı 63 /eya-clean/organik-sertifikali-camasir-deterjani-p-237119563
90066873 Toz Deterjan Sık Yıkananlar 179 /bingo/toz-deterjan-sik-yikananlar-p-90066873
89751820 Sıvı Çamaşır Deterjanı 2 x3L (100 Yıkama) Renkli 144.9 /perwoll/sivi-camasir-deterjani-2-x3l-100-yikama-renkli-siya
112627101 Sıvı Çamaşır Deterjanı (95 Yıkama) 3L Renkli 2, 144.9 /perwoll/sivi-camasir-deterjani-95-yikama-3l-renkli-2-7l-cic
95398460 Toz Çamaşır Deterjanı Active Beyazlar Ve Renklile 180.99 /omo/toz-camasir-deterjani-active-beyazlar-ve-renkliler-10-k
...
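If you want every category rather than just the first, you can wrap the pagination loop in an outer loop over the category slugs. Here is a minimal sketch using the same endpoint and response structure as above; the trimmed payload assumes the API accepts the remaining parameters as defaults:

import requests

BASE = "https://public.trendyol.com/discovery-web-searchgw-service/v2/api/infinite-scroll/"

categories = [
    "camasir-deterjani-x-c108713",
    "yumusaticilar-x-c103814",
    # ... the remaining category slugs from the list above
]

for category in categories:
    api_url = BASE + category
    # start from page 1; keep the other query parameters from the example above
    payload = {"pi": 1, "culture": "tr-TR"}
    page = 1
    while True:
        payload["pi"] = page
        data = requests.get(api_url, params=payload).json()
        products = data["result"]["products"]
        if not products:
            break  # an empty page means this category is exhausted
        for p in products:
            print(category, p["id"], p["name"][:49], p["price"]["sellingPrice"])
        page += 1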
CodePudding user response:
So honestly, I think the best way is to get the info from the API, but I wanted to answer your question about pagination.
You can see that the URL changes when you scroll (?pi=pagenumber), so we can loop through the pages, and when we get to a page that doesn't exist (404 status), we handle the status code and break out of the loop.
import scrapy
import logging
import json
import datetime


class ExampleSpider(scrapy.Spider):
    name = 'ExampleSpider'

    start_urls = [
        'https://www.trendyol.com/camasir-deterjani-x-c108713',
        'https://www.trendyol.com/yumusaticilar-x-c103814',
        'https://www.trendyol.com/camasir-suyu-x-c103812',
        'https://www.trendyol.com/camasir-leke-cikaricilar-x-c103810',
        'https://www.trendyol.com/camasir-yan-urun-x-c105534',
        'https://www.trendyol.com/kirec-onleyici-x-c103806',
        'https://www.trendyol.com/makine-kirec-onleyici-ve-temizleyici-x-c144512'
    ]

    # let 404 responses reach the callback instead of being dropped
    handle_httpstatus_list = [404]
    custom_settings = {'DOWNLOAD_DELAY': 0.4}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url, cb_kwargs={'base_url': url, 'page_number': 0}, callback=self.parse_page)

    def parse_page(self, response, base_url, page_number):
        # last page
        if response.status == 404:
            logging.log(logging.INFO, f'Finished scraping {base_url}')
            return

        # You don't need to use BeautifulSoup; you can apply the regex directly
        all_data = response.xpath('//script[@type="application/javascript"]/text()').re(r'__SEARCH_APP_INITIAL_STATE__=(.*?});')

        for data in all_data:  # supposed to be only one element, but still...
            data = json.loads(data)
            for p in data["products"]:
                # item = TeknosaItem()
                item = dict()
                item['rowid'] = hash(str(datetime.datetime.now()) + str(p["id"]))
                item['date'] = str(datetime.datetime.now())
                item['listing_id'] = p["id"]
                item['product_id'] = p["id"]
                item['product_name'] = p["name"]
                item['price'] = p["price"]["sellingPrice"]
                item['url'] = p["url"]
                yield item

        # go to the next page
        page_number += 1
        yield scrapy.Request(url=base_url + f'?pi={page_number}', cb_kwargs={'base_url': base_url, 'page_number': page_number}, callback=self.parse_page)
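If you save this as example_spider.py (the filename is just an example), you can run it and export the scraped items with Scrapy's built-in feed export, e.g.:

scrapy runspider example_spider.py -o products.json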