Home > database >  scrape multiple pages data are overwritten
scrape multiple pages data are overwritten

Time:07-30

The data are overwritten, so I only get the results from the last page. How can I solve this problem? If there is a solution, kindly recommend it to me. I've seen several solutions for scraping multiple pages from a website, but I couldn't make them work in my code.

    import scrapy
    from scrapy import FormRequest
    from scrapy.crawler import CrawlerProcess
    from scrapy.http import Request
    
    class TestSpider(scrapy.Spider):
        """Scrape the lawyer register at advpalata.vrn.ru.

        Fix for the "only last page" bug: the original defined
        ``start_requests``/``parse_item``/``parse_book`` inside a class-level
        ``for`` loop, so every iteration re-bound the same method names and
        only the final page's ``payload`` survived. The pagination loop
        belongs inside ``start_requests``, yielding one POST request per page.
        The original ``payload='p=' str(x) ...`` was also a syntax error
        (missing ``+`` concatenation operators).
        """
        name = 'test'
        url = 'https://advpalata.vrn.ru/registers/reestr_lawyers/'

        # Browser-like headers captured from a real session; the site expects
        # a form-encoded POST body (content-type below).
        headers = {
            'authority': 'advpalata.vrn.ru',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-language': 'en-US,en;q=0.9',
            'cache-control': 'max-age=0',
            'content-type': 'application/x-www-form-urlencoded',
            'cookie': 'PHPSESSID=546743b283bb9e3b2e78dabbfb894220; ie=yes; stat_id=546743b283bb9e3b2e78dabbfb894220; _ym_uid=1658939896610936176; _ym_d=1658939896; _ym_isad=2; PHPSESSID=546743b283bb9e3b2e78dabbfb894220',
            'origin': 'https://advpalata.vrn.ru',
            'referer': 'https://advpalata.vrn.ru/registers/reestr_lawyers/',
            'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
        }

        def start_requests(self):
            """Yield one POST request per result page (pages 0..4)."""
            for page in range(0, 5):
                yield scrapy.FormRequest(
                    url=self.url,
                    method='POST',
                    # Form body selects the page number and the letter filter.
                    body='p=' + str(page) + '&letterfilter=А',
                    headers=self.headers,
                    callback=self.parse_item,
                )

        def parse_item(self, response):
            """Extract each lawyer's detail-page link and follow it."""
            books = response.xpath("//td[@class='name']//a//@href").extract()
            for book in books:
                absolute_url = response.urljoin(book)
                yield Request(absolute_url, callback=self.parse_book)

        def parse_book(self, response):
            """Extract the lawyer's name from the detail page."""
            title = response.css("h3::text").get()
            yield {
                'title': title,
            }

CodePudding user response:

The key is that you need to yield one request per page inside the loop, so each iteration produces its own request.

This code works

Sorry, `words` and `w_pages` are hard-coded.

import scrapy
from scrapy.http import Request
import urllib.parse

class TestSpider(scrapy.Spider):
    """Scrape every lawyer name from the advpalata.vrn.ru register.

    Fixes relative to the posted answer:
    - ``index = index   1`` was a syntax error (the ``+`` was stripped by
      the page formatting); the manual index plus the redundant
      ``iter``/``next`` dance is replaced by ``zip(words, w_pages)`` and a
      direct ``range`` loop, which yields the exact same requests.
    - restored the stripped ``+`` in ``application/xhtml+xml``.
    """
    name = 'test'

    url = 'https://advpalata.vrn.ru/registers/reestr_lawyers'
    # Browser-like headers; the site expects a form-encoded POST body.
    headers = {
        'authority': 'advpalata.vrn.ru',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'content-type': 'application/x-www-form-urlencoded',
        'cookie': 'PHPSESSID=546743b283bb9e3b2e78dabbfb894220; ie=yes; stat_id=546743b283bb9e3b2e78dabbfb894220; _ym_uid=1658939896610936176; _ym_d=1658939896; _ym_isad=2; PHPSESSID=546743b283bb9e3b2e78dabbfb894220',
        'origin': 'https://advpalata.vrn.ru',
        'referer': 'https://advpalata.vrn.ru/registers/reestr_lawyers/',
        'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }

    def start_requests(self):
        """Yield one POST request per (letter, page) combination.

        ``words`` are the register's letter filters and ``w_pages[i]`` is the
        hard-coded number of result pages for ``words[i]``.
        """
        words = ['А','Б','В','Г','Д','Е','Ж','З','И','К','Л', 'М','Н','О','Р','С','Т','У','Ф','Х','Ц','Ч','Ш','Щ','Э','Ю','Я','Все']
        w_pages = [14, 24, 9, 19, 11, 0, 5, 9, 5, 39, 12, 21, 10, 4, 10, 28, 12, 2, 5, 4, 1, 7, 11, 1, 1, 2, 3, 315]
        for word, page_count in zip(words, w_pages):
            for page in range(page_count):
                yield scrapy.Request(
                    url=self.url,
                    method='POST',
                    # Cyrillic letters must be percent-encoded for the form body.
                    body="p={0}&letterfilter={1}".format(page, urllib.parse.quote_plus(word)),
                    headers=self.headers,
                    callback=self.parse
                )

    def parse(self, response):
        """Follow each lawyer's detail-page link from the result table."""
        books = response.xpath("//td[@class='name']//a//@href").extract()
        for book in books:
            absolute_url = response.urljoin(book)
            yield Request(absolute_url, callback=self.parse_book)

    def parse_book(self, response):
        """Extract the lawyer's name from the detail page."""
        title = response.css("h3::text").get()
        yield {
            'title': title
        }

If you want the titles saved correctly into a JSON file, add this line to settings.py:

FEED_EXPORT_ENCODING = 'utf-8'

I got 3151 titles by this code. I copy here first 10 and last 10 titles

{"title": "Алфёрова Людмила Григорьевна"},
{"title": "Александров Юрий Иванович"},
{"title": "Агрба Платон Заурович"},
{"title": "Аветисов Сергей Эдуардович"},
{"title": "Аксенов Дмитрий Михайлович"},
{"title": "Акиньшина Галина Сергеевна"},
{"title": "Ануфриенко Андрей Иванович"},
{"title": "Анищенко Игорь Сергеевич"},
{"title": "Арженых Оксана Сергеевна"},
{"title": "Астанин Владимир Егорович"},
...
{"title": "Щербакова Татьяна Николаевна"},
{"title": "Щербакова Лилия Ивановна"},
{"title": "Щурков Алексей Михайлович"},
{"title": "Щетинина Татьяна Николаевна"},
{"title": "Щукина Валентина Васильевна"},
{"title": "Щетинина Галина Александровна"},
{"title": "Щетинина Ирина Юрьевна"},
{"title": "Ююкин Александр Викторович"},
{"title": "Щербинин Андрей Викторович"},
{"title": "Щербакова Лилия Викторовна"}

I referenced two links: one on yield and one on encoding Unicode.

  • Related