I think I have a pagination problem when I do web scraping


I want to get every listing URL (urll = base_url + a_['href']) and I do pagination over the search pages with URL + str(page), but I think there is a problem: when I scrape 10 pages (for page in range(1,11):) it only gives me 55 rows, while it should be 260 rows. I do not know what the problem is.

import requests
from bs4 import BeautifulSoup as bs
import bs4
import pandas as pd

URL = 'https://yeniemlak.az/elan/axtar?emlak=1&elan_nov=1&seher[]=0&metro[]=0&qiymet=&qiymet2=&mertebe=&mertebe2=&otaq=&otaq2=&sahe_m=&sahe_m2=&sahe_s=&sahe_s2=&page='

base_url = 'https://yeniemlak.az/'

urla =[]
featuress = []

for page in range(6,11):
    result = requests.get(URL + str(page))
    soup = bs(result.text, 'html.parser')
    case = soup.find_all('table', class_ = 'list')
    for fix in case:
        a_ = fix.find('a')
        urll = base_url + a_['href']
        URLL = requests.get(urll)
        soup = bs(URLL.text, 'html.parser')
        aa = soup.find_all('div', class_ = 'box')
        for iss in aa:
            feature = (aa[0].text)
            if 'Təmirli' in feature:
                Təmiri = 1
            else:
                Təmiri = 0    
            urla.append(urll)
            featuress.append(Təmiri)            
            df = pd.DataFrame({'URL':urla,'Təmiri':featuress})
            df = df.drop_duplicates() 
            df.to_excel('jdjd.xlsx', index = False)


CodePudding user response:

The site has DDoS protection, so when the server receives a lot of traffic from one IP it blocks service to that IP, which makes plain requests unreliable here. The alternative is to use selenium for scraping the data, as it also works against sites like https://www.askgamblers.com/online-casinos/reviews/casino-friday that sit behind Cloudflare DDoS protection. Hope this helps. Happy coding :)
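
A minimal sketch of that approach, assuming chromedriver is installed and that the markup matches the question (each table.list holds one advert with an anchor to its detail page):

from selenium import webdriver
from selenium.webdriver.common.by import By

URL = 'https://yeniemlak.az/elan/axtar?emlak=1&elan_nov=1&seher[]=0&metro[]=0&qiymet=&qiymet2=&mertebe=&mertebe2=&otaq=&otaq2=&sahe_m=&sahe_m2=&sahe_s=&sahe_s2=&page='

driver = webdriver.Chrome()  # assumes a matching chromedriver is on PATH
links = []
for page in range(1, 11):
    driver.get(URL + str(page))
    # each result table on the search page holds one advert link
    for table in driver.find_elements(By.CSS_SELECTOR, 'table.list'):
        a = table.find_element(By.TAG_NAME, 'a')
        links.append(a.get_attribute('href'))
driver.quit()
print(len(links))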

CodePudding user response:

Your issue is with requests itself. You have to use a client that supports HTTP/2, because that site uses it.
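
A quick way to confirm this is to open a client with http2=True and print the negotiated protocol (a minimal sketch; it assumes the h2 extra is installed via pip install httpx[http2] and reuses the first search page from the question):

import httpx

url = 'https://yeniemlak.az/elan/axtar?emlak=1&elan_nov=1&seher[]=0&metro[]=0&qiymet=&qiymet2=&mertebe=&mertebe2=&otaq=&otaq2=&sahe_m=&sahe_m2=&sahe_s=&sahe_s2=&page=1'

with httpx.Client(http2=True, headers={'User-Agent': 'Mozilla/5.0'}) as client:
    r = client.get(url)
    print(r.status_code, r.http_version)  # expect something like: 200 HTTP/2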

For instance, you can use httpx as below, and do not run it concurrently (threaded) unless you use a rotating proxy.

import httpx
import trio
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
from urllib.parse import urljoin

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
    "Accept": "text/html,application/xhtml xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache"
}

# allow only one request in flight at a time (see the note above about not
# running this concurrently without a rotating proxy)
limiter = trio.CapacityLimiter(1)


async def get_soup(content):
    # parse only the listing tables to keep the soup small
    return BeautifulSoup(content, 'html.parser', parse_only=SoupStrainer('table', attrs={'class': 'list'}))


async def worker(client, page, sender):
    # fetch one search-results page and send its listing links down the channel
    async with limiter, sender:
        params = {
            "elan_nov": "1",
            "emlak": "1",
            "mertebe": "",
            "mertebe2": "",
            "metro[]": "0",
            "otaq": "",
            "otaq2": "",
            "page": page,
            "qiymet": "",
            "qiymet2": "",
            "sahe_m": "",
            "sahe_m2": "",
            "sahe_s": "",
            "sahe_s2": "",
            "seher[]": "0"
        }
        # retry until the page is fetched successfully
        while True:
            try:
                r = await client.get('axtar', params=params)
                if r.is_success:
                    break
            except httpx.RequestError:
                continue
        soup = await get_soup(r.content)
        # each advert's link sits inside the td[rowspan="2"] cell of its result table
        await sender.send([urljoin(str(client.base_url), x['href'])
                           for x in soup.select('td[rowspan="2"] > a')])


async def main():
    async with httpx.AsyncClient(headers=headers, http2=True, base_url='https://yeniemlak.az/elan/') as client, trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)
        async with sender:
            for page in range(1, 11):
                nurse.start_soon(worker, client, page, sender.clone())
                await trio.sleep(1)


async def rec(receiver):
    # collect the link lists from all workers into a single DataFrame
    allin = []
    async with receiver:
        async for val in receiver:
            allin += val
    df = pd.DataFrame(allin, columns=['URL'])
    print(df)

if __name__ == "__main__":
    trio.run(main)

Output:

                                                   URL
0    https://yeniemlak.az/elan/satilir-2-otaqli-bin...
1    https://yeniemlak.az/elan/satilir-5-otaqli-bin...
2    https://yeniemlak.az/elan/satilir-3-otaqli-bin...
3    https://yeniemlak.az/elan/satilir-3-otaqli-bin...
4    https://yeniemlak.az/elan/satilir-2-otaqli-bin...
..                                                 ...
245  https://yeniemlak.az/elan/satilir-2-otaqli-bin...
246  https://yeniemlak.az/elan/satilir-2-otaqli-bin...
247  https://yeniemlak.az/elan/satilir-3-otaqli-bin...
248  https://yeniemlak.az/elan/satilir-3-otaqli-bin...
249  https://yeniemlak.az/elan/satilir-3-otaqli-bin...

[250 rows x 1 columns]
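
If you still need the Təmiri flag from every listing (as in your original code), one possible extension is sketched below. It is only a sketch: it assumes each detail page keeps the feature text inside div.box, as in the question, and simply feeds the collected URLs (e.g. the allin list from rec) through a small helper:

import httpx
from bs4 import BeautifulSoup
import pandas as pd

def temir_flag(url, client):
    # 1 if the listing mentions 'Təmirli' inside its div.box, else 0
    r = client.get(url)
    box = BeautifulSoup(r.text, 'html.parser').find('div', class_='box')
    return 1 if box and 'Təmirli' in box.text else 0

def build_frame(urls):
    with httpx.Client(http2=True, headers={'User-Agent': 'Mozilla/5.0'}) as client:
        rows = [{'URL': u, 'Təmiri': temir_flag(u, client)} for u in urls]
    return pd.DataFrame(rows).drop_duplicates()

# e.g. call build_frame(allin) inside rec() and write it out:
# build_frame(allin).to_excel('listings.xlsx', index=False)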