I want to get all the inner URLs (`base_url + a_['href']`), and I paginate the listing pages with `URL + str(page)` for the first-level links, but I think there is a problem: when I scrape 10 pages (`for page in range(1, 11):`) it only gives me 55 rows, but it should be 260 rows. I do not know what the problem is.
import requests
from bs4 import BeautifulSoup as bs
import bs4
import pandas as pd

URL = 'https://yeniemlak.az/elan/axtar?emlak=1&elan_nov=1&seher[]=0&metro[]=0&qiymet=&qiymet2=&mertebe=&mertebe2=&otaq=&otaq2=&sahe_m=&sahe_m2=&sahe_s=&sahe_s2=&page='
base_url = 'https://yeniemlak.az/'

urla = []
featuress = []
for page in range(6, 11):
    result = requests.get(URL + str(page))
    soup = bs(result.text, 'html.parser')
    case = soup.find_all('table', class_='list')
    for fix in case:
        a_ = fix.find('a')
        urll = base_url + a_['href']
        URLL = requests.get(urll)
        soup = bs(URLL.text, 'html.parser')
        aa = soup.find_all('div', class_='box')
        for iss in aa:
            feature = (aa[0].text)
            if 'Təmirli' in feature:
                Təmiri = 1
            else:
                Təmiri = 0
            urla.append(urll)
            featuress.append(Təmiri)

df = pd.DataFrame({'URL': urla, 'Təmiri': featuress})
df = df.drop_duplicates()
df.to_excel('jdjd.xlsx', index=False)
CodePudding user response:
The site has DDoS protection, so when the server receives a lot of traffic from one IP it blocks service to that IP; plain requests is therefore not a viable method. The alternative is to use Selenium for scraping, as it works against sites like https://www.askgamblers.com/online-casinos/reviews/casino-friday, which has Cloudflare DDoS protection. Hope this helps. Happy Coding :)
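A minimal sketch of that Selenium approach, reusing the question's search URL and `table.list` selector; the headless flag, page range, and driver setup are my assumptions, not part of the original answer:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs

URL = 'https://yeniemlak.az/elan/axtar?emlak=1&elan_nov=1&seher[]=0&metro[]=0&qiymet=&qiymet2=&mertebe=&mertebe2=&otaq=&otaq2=&sahe_m=&sahe_m2=&sahe_s=&sahe_s2=&page='
base_url = 'https://yeniemlak.az/'

options = Options()
options.add_argument('--headless=new')  # assumption: headless Chrome is acceptable here
driver = webdriver.Chrome(options=options)

urls = []
for page in range(1, 11):
    driver.get(URL + str(page))                   # a real browser request, so the
    soup = bs(driver.page_source, 'html.parser')  # protocol/bot checks are handled for us
    for table in soup.find_all('table', class_='list'):
        a_ = table.find('a')
        if a_ and a_.get('href'):
            urls.append(base_url + a_['href'])
driver.quit()
print(len(urls))  # should approach 10 pages x ~25 listings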
CodePudding user response:
Your issue is with requests itself: you have to use a client that supports HTTP/2, as that site is using it. For instance, you can use httpx as below, and don't thread it unless you use a rotating proxy.
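As a quick sanity check (my addition, not part of the original answer), you can confirm which protocol httpx negotiates with the site; this needs the HTTP/2 extra (`pip install httpx[http2]`):

import httpx

# http2=True lets httpx negotiate HTTP/2 via ALPN, which requests cannot do
with httpx.Client(http2=True) as client:
    r = client.get('https://yeniemlak.az/')
    print(r.http_version)  # 'HTTP/2' if the server negotiated it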
import httpx
import trio
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
from urllib.parse import urljoin

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache"
}

limiter = trio.CapacityLimiter(1)  # serialize the workers: one request in flight at a time


async def get_soup(content):
    # parse only the listing table to keep the soup small
    return BeautifulSoup(content, 'html.parser', parse_only=SoupStrainer('table', attrs={'class': 'list'}))


async def worker(client, page, sender):
    async with limiter, sender:
        params = {
            "elan_nov": "1",
            "emlak": "1",
            "mertebe": "",
            "mertebe2": "",
            "metro[]": "0",
            "otaq": "",
            "otaq2": "",
            "page": page,
            "qiymet": "",
            "qiymet2": "",
            "sahe_m": "",
            "sahe_m2": "",
            "sahe_s": "",
            "sahe_s2": "",
            "seher[]": "0"
        }
        while True:
            # retry until this page is fetched successfully
            try:
                r = await client.get('axtar', params=params)
                if r.is_success:
                    break
            except httpx.RequestError:
                continue
        soup = await get_soup(r.content)
        await sender.send([urljoin(str(client.base_url), x['href'])
                           for x in soup.select('td[rowspan="2"] > a')])


async def main():
    async with httpx.AsyncClient(headers=headers, http2=True, base_url='https://yeniemlak.az/elan/') as client, trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)
        async with sender:
            for page in range(1, 11):
                nurse.start_soon(worker, client, page, sender.clone())
                await trio.sleep(1)


async def rec(receiver):
    # collect the link batches from all workers; the channel closes
    # once every worker (and the original sender) is done
    allin = []
    async with receiver:
        async for val in receiver:
            allin += val
    df = pd.DataFrame(allin, columns=['URL'])
    print(df)


if __name__ == "__main__":
    trio.run(main)
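A note on the design: `trio.CapacityLimiter(1)` together with the one-second `trio.sleep` between `start_soon` calls effectively serializes the ten page fetches, which matches the answer's warning about not threading; if you do add a rotating proxy, raising the limiter's capacity would let pages be fetched concurrently.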
Output:
URL
0 https://yeniemlak.az/elan/satilir-2-otaqli-bin...
1 https://yeniemlak.az/elan/satilir-5-otaqli-bin...
2 https://yeniemlak.az/elan/satilir-3-otaqli-bin...
3 https://yeniemlak.az/elan/satilir-3-otaqli-bin...
4 https://yeniemlak.az/elan/satilir-2-otaqli-bin...
.. ...
245 https://yeniemlak.az/elan/satilir-2-otaqli-bin...
246 https://yeniemlak.az/elan/satilir-2-otaqli-bin...
247 https://yeniemlak.az/elan/satilir-3-otaqli-bin...
248 https://yeniemlak.az/elan/satilir-3-otaqli-bin...
249 https://yeniemlak.az/elan/satilir-3-otaqli-bin...
[250 rows x 1 columns]