After adding `cb_kwargs`, the script stopped outputting any scraped data; it only outputs the normal spider debug data. I have completely no idea why it does that. It looks like the whole parseMain is just sitting there and doing nothing.
Here is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider
class RynekMainSpider(scrapy.Spider):
    """Spider for the developer listing on rynekpierwotny.pl.

    ``parse`` handles the start URL and schedules follow-up requests whose
    responses are handled by ``parseMain``; the extracted website URL is
    forwarded between the two callbacks through ``cb_kwargs``.
    """

    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        """Schedule one ``parseMain`` request per matched container.

        For every element matched by ``div.root`` the first website link and
        the first entry link are extracted; the entry link is resolved
        against the response URL and requested with ``parseMain`` as the
        callback, passing the website link as the ``websiteurl`` keyword.
        """
        # NOTE(review): 'div.root' selects by CLASS. If the page only has an
        # element with id="root", nothing matches, the loop body never runs
        # and parseMain is never called -- 'div#root' would be needed then.
        # Confirm against the live markup.
        websites = response.css('div.root')
        for websitep in websites:
            websiteurl = websitep.css('div.rp-l0pkv6 a::attr(href)').get()
            href = websitep.css('li.rp-np9kb1 a::attr(href)').get()
            if href is None:
                # No entry link in this container, so there is nothing to follow.
                continue
            url = response.urljoin(href)
            yield Request(url, cb_kwargs={'websiteurl': websiteurl}, callback=self.parseMain)

    def parseMain(self, response, websiteurl):
        """Yield one item per list entry, then follow the next-page link.

        Args:
            response: the response for the URL requested by ``parse``.
            websiteurl: value forwarded from ``parse`` via ``cb_kwargs``;
                copied unchanged into every yielded item.
        """
        # The next-page link belongs to the page, not to an entry, so it is
        # looked up once and reused for both the item field and pagination.
        next_page = response.css('a.rp-mmikj9::attr(href)').get()

        for quote in response.css('.rp-y89gny.eboilu01 ul li'):
            # No trailing commas after these expressions: a trailing comma
            # would wrap each value in a 1-tuple.
            yield {
                'address': quote.css('address.rp-o9b83y::text').get(),
                'name': quote.css('h2.rp-69f2r4::text').get(),
                'href': quote.css('li.rp-np9kb1 a::attr(href)').get(),
                'PAGETEST': next_page,
                'websiteurl': websiteurl,
            }

        if next_page is not None:
            next_page_link = response.urljoin(next_page)
            yield scrapy.Request(url=next_page_link, callback=self.parse)
if __name__ == "__main__":
    # Script entry point: register the spider with a crawler process and
    # start the crawl.
    crawler_process = CrawlerProcess()
    crawler_process.crawl(RynekMainSpider)
    crawler_process.start()
Thanks in advance for your help. EDIT: Oh shoot, I forgot to explain what my code is supposed to do. Basically, parse gets the website URL from inside sub-pages like "https://rynekpierwotny.pl/deweloperzy/dom-development-sa-955/", while parseMain gets all the data (like address and name) from the main page "https://rynekpierwotny.pl/deweloperzy/?page=1".
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider
class RynekMainSpider(scrapy.Spider):
    """Single-callback spider: scrape each listing page, then follow pagination."""

    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        """Emit one dict per list entry and request the next page, if any."""
        # The same selector feeds both the 'PAGETEST' field and pagination.
        next_href = response.css('a.rp-mmikj9::attr(href)').get()

        entries = response.css('.rp-y89gny.eboilu01 ul li')
        for entry in entries:
            item = {}
            item['address'] = entry.css('address.rp-o9b83y::text').get()
            item['name'] = entry.css('h2.rp-69f2r4::text').get()
            item['href'] = entry.css('li.rp-np9kb1 a::attr(href)').get()
            item['PAGETEST'] = next_href
            yield item

        if next_href is None:
            # No next-page link on this page: pagination ends here.
            return
        yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse)
if __name__ == "__main__":
    # Only crawl when this file is executed directly, not when it is imported.
    runner = CrawlerProcess()
    runner.crawl(RynekMainSpider)
    runner.start()
This version, without `cb_kwargs`, worked.
CodePudding user response:
Edit:
I made some further adjustments based on your notes about what you want the program to do. It should work the way you expect now.
Try this instead:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider
class RynekMainSpider(scrapy.Spider):
    """Scrape the developer listing, then visit each entry for its website link.

    ``parse`` collects the per-entry fields from a listing page and forwards
    them through ``cb_kwargs`` to ``parseMain``, which adds the ``website``
    field from the entry's own page and yields the finished item.
    """

    name = "RynekMain"
    start_urls = [
        'https://rynekpierwotny.pl/deweloperzy/?page=1']

    def parse(self, response):
        """Yield one detail-page request per entry plus the next listing page.

        Each detail request carries a ``params`` dict with the ``address``,
        ``name`` and ``href`` extracted from the listing entry.
        """
        roots = response.css('div#root')
        if not roots:
            # Indexing an empty selector list would raise IndexError; a page
            # without the root container has nothing to scrape.
            self.logger.warning("No div#root found on %s", response.url)
            return
        websites = roots[0]

        # Link inside the <li> that follows the one holding the
        # 'rp-173nt6g' anchor; None when no such sibling exists.
        PAGETEST = response.xpath('//a[contains(@class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()

        for website in websites.css('li.rp-np9kb1'):
            page = website.css('a::attr(href)').get()
            if page is None:
                # No detail link in this entry, so there is no page to visit.
                continue
            params = {
                'address': website.css('address.rp-o9b83y::text').get(),
                'name': website.css('h2.rp-69f2r4::text').get(),
                'href': page,
            }
            url = response.urljoin(page)
            yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)

        # Only paginate when a next-page link was actually found.
        if PAGETEST is not None:
            yield Request(url=response.urljoin(PAGETEST), callback=self.parse)

    def parseMain(self, response, params=None):
        """Add the ``website`` link from the detail page and yield the item.

        Args:
            response: the response for an entry's detail page.
            params: fields collected by ``parse``; when omitted, the item
                contains only the ``website`` field.
        """
        # Copy so the dict stored in the request's cb_kwargs is not mutated,
        # and so the declared default of None does not raise TypeError.
        item = dict(params) if params is not None else {}
        item['website'] = response.css('div.rp-l0pkv6 a::attr(href)').get()
        yield item
if __name__ == "__main__":
    # Run the spider from a plain `python` invocation of this file.
    spider_process = CrawlerProcess()
    spider_process.crawl(RynekMainSpider)
    spider_process.start()