Hello everyone, I'm a beginner at web scraping and I am trying to scrape all the iPhones listed on https://www.electroplanet.ma/. This is the script I wrote:
import scrapy
from ..items import EpItem
class ep(scrapy.Spider):
    """Spider that scrapes the iPhone listings of electroplanet.ma.

    ``parse`` walks a listing page and follows every product link;
    ``parse_item`` fills one ``EpItem`` per product page.

    The listing pages are enumerated by hand in ``start_urls`` via the
    ``?p=N`` query parameter.
    """

    name = "ep"
    start_urls = [
        "https://www.electroplanet.ma/smartphone-tablette-gps/smartphone/iphone?p=1",
        "https://www.electroplanet.ma/smartphone-tablette-gps/smartphone/iphone?p=2",
    ]

    # Item field -> XPath of the text node that holds its value on a
    # product page. Kept in the order the fields are written to the item.
    _TEXT_FIELDS = {
        "Catégorie": "//h1/a/text()",
        "Marque": "//*[@data-th='Marque']/text()",
        "RAM": "//*[@data-th='MÉMOIRE RAM']/text()",
        "ROM": "//*[@data-th='MÉMOIRE DE STOCKAGE']/text()",
        "Couleur": "//*[@data-th='COULEUR']/text()",
    }

    def parse(self, response):
        """Yield one request per product found on a listing page.

        Args:
            response: the downloaded listing page.

        Yields:
            Requests for each product page, handled by ``parse_item``.
        """
        for product in response.css("ol li"):
            lien = product.css("a.product-item-link::attr(href)").get()
            if not lien:
                # "ol li" also matches list items that are not product
                # tiles; they have no product link, so skip them instead
                # of letting response.follow() raise.
                continue
            # NOTE(review): this reads the href of the photo anchor, which
            # may be the product URL rather than the picture itself; the
            # image URL is presumably on a nested <img> -- confirm against
            # the page markup.
            image = product.css("a.product-item-photo::attr(href)").get()
            # The image is taken from the listing page and handed to the
            # item callback through cb_kwargs.
            yield response.follow(
                lien,
                callback=self.parse_item,
                cb_kwargs={"image": image},
            )

    def parse_item(self, response, image):
        """Build the item for a single product page.

        Args:
            response: the downloaded product page.
            image: value extracted for this product on the listing page.

        Yields:
            One ``EpItem``. Text fields that are absent from the page
            (or contain only whitespace) are left unset rather than
            aborting the whole item.
        """
        item = EpItem()
        item["Nom"] = response.css(".ref::text").get()
        for field, xpath in self._TEXT_FIELDS.items():
            value = self._clean(response.xpath(xpath).get())
            if value is not None:
                item[field] = value
        item["lien"] = response.request.url
        item["image"] = image
        item["état"] = "neuf"
        item["Market"] = "Electro Planet"
        yield item

    @staticmethod
    def _clean(text):
        """Return *text* without surrounding whitespace, or None if empty."""
        if text is None:
            return None
        return text.strip() or None
I ran into problems scraping all the pages, because the site uses JavaScript for pagination, so I wrote every page link into start_urls. I believe this is not the best practice, so I would appreciate some advice on how to improve my code.
CodePudding user response:
You can use the scrapy-playwright plugin to scrape interactive websites. For start_urls, just add the main index URL of the website if there is only one website, and check the Scrapy docs on following links to make the spider follow the page links automatically instead of writing them manually.