Home > Software design > How can I scrape img src with select in Python?
How can I scrape img src with select in Python?

Time:07-20

I can't extract the images' src values. The class used in the code is the class of the img tag. When I tried to use another tag's class instead, I got a KeyError. How can I get the src of each image?

from base64 import decode
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
from time import sleep
from random import randint
import numpy as np

# Request headers: identify as a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    ),
}
# Accumulators filled while scraping: image data, book titles and author text.
images, titles, authors = [], [], []
# Page numbers to visit; arange(1, 2, 1) yields only page 1.
pages = np.arange(1, 2, 1)
for page in pages:
    # The page number is appended to the listing URL as the ?Page= value.
    url = "https://www.dr.com.tr/kategori/Kitap/Cocuk-ve-Genclik/grupno=00884?Page=" + str(page)
    results = requests.get(url, headers=headers)
    soup = BeautifulSoup(results.text, "html.parser")
    # Each product on the listing page sits in its own div.prd-main-wrapper.
    book_div = soup.find_all("div", class_="prd-main-wrapper")
    # Random 2-10 second pause between page requests.
    sleep(randint(2, 10))
    for bookSection in book_div:
        # NOTE: this is the lookup the question is about. It searches the
        # whole page (soup), not the current bookSection, for elements with
        # the "lazyloaded" class and reads their "src"; it is reported to
        # produce an empty list.
        all_imgs = [img["src"] for img in soup.select(".lazyloaded")]
        images.append(all_imgs)
        print(all_imgs)

        name = bookSection.find("a", class_="prd-name").get('title')
        titles.append(name)

        author = bookSection.find("div", class_="prd-row").text.strip()
        authors.append(author)
        
  
# Assemble the three scraped lists into a table, one column per list, and
# write it out as CSV (utf-8-sig adds a BOM to the file).
books = pd.DataFrame(dict(Image=images, Book=titles, Author=authors))
books.to_csv("dr_child.csv", index=False, header=True, encoding="utf-8-sig")

As a result, only an empty list ([]) is returned.

CodePudding user response:

import httpx
import pandas as pd
import trio
from bs4 import BeautifulSoup, SoupStrainer

# Browser-like User-Agent handed to the HTTP client.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) '
        'Gecko/20100101 Firefox/102.0'
    ),
}


async def get_soup(content):
    """Parse *content* with lxml, restricted to ``div.prd-main-wrapper`` elements."""
    wrappers_only = SoupStrainer('div', class_='prd-main-wrapper')
    return BeautifulSoup(content, 'lxml', parse_only=wrappers_only)

# Rows gathered by all workers, one tuple per product:
# (text of the div.prd-row link, title attribute of a.prd-name, img data-src).
allin = []


async def worker(rec, max_attempts=5):
    """Consume ``(client, page)`` jobs from *rec* and append scraped rows to ``allin``.

    Args:
        rec: Receive end of the memory channel; it is closed when the worker
            exits.
        max_attempts: Number of times a page is requested before it is
            skipped. Bounding this keeps a page that always fails (transport
            error or non-success status) from being retried forever.
    """
    async with rec:
        async for client, page in rec:
            print(f'Extracting Page# {page}')
            params = {
                'Page': page
            }
            for attempt in range(1, max_attempts + 1):
                if attempt > 1:
                    # Wait a little longer before each retry rather than
                    # re-requesting the page immediately.
                    await trio.sleep(attempt - 1)
                try:
                    r = await client.get('grupno=00884', params=params)
                except httpx.RequestError:
                    continue
                if r.is_success:
                    break
            else:
                # No attempt produced a successful response: skip this page
                # and move on to the next job.
                print(f'Giving up on Page# {page} after {max_attempts} attempts')
                continue
            soup = await get_soup(r.content)
            allin.extend(
                (
                    x.select_one('div.prd-row a').get_text(strip=True),
                    x.select_one('a.prd-name')['title'],
                    x.select_one('img')['data-src'],
                )
                for x in soup.select('div.prd-main-wrapper')
            )


async def main():
    """Feed pages 1 and 2 to ten workers over a memory channel and print the result.

    The rows the workers stored in ``allin`` are turned into a DataFrame once
    the nursery (and therefore every worker) has finished.
    """
    async with httpx.AsyncClient(
        timeout=10,
        headers=headers,
        base_url='https://www.dr.com.tr/kategori/Kitap/Cocuk-ve-Genclik/',
    ) as client:
        async with trio.open_nursery() as nurse:
            send_channel, receive_channel = trio.open_memory_channel(0)
            # Every worker gets its own clone; the original receive end is
            # closed as soon as the workers have been started.
            async with receive_channel:
                for _worker_no in range(10):
                    nurse.start_soon(worker, receive_channel.clone())

            # Closing the send end after the last page lets the workers'
            # ``async for`` loops finish.
            async with send_channel:
                for page in range(1, 3):
                    await send_channel.send([client, page])
    df = pd.DataFrame(allin, columns=['Title', 'Name', 'Image'])
    print(df)
    # df.to_csv('data.csv',index=False,encoding='utf-8-sig')

if __name__ == "__main__":
    trio.run(main)

Output:

Extracting Page# 1
Extracting Page# 2
                    Title  ...                                              Image
0          Stephanie Moss  ...  https://i.dr.com.tr/cache/154x170-0/originals/...   
1            Nikola Bozic  ...  https://i.dr.com.tr/cache/154x170-0/originals/...   
2           Ferenc Molnar  ...  https://i.dr.com.tr/cache/154x170-0/originals/...   
3         Liane Schneider  ...  https://i.dr.com.tr/cache/154x170-0/originals/...   
4    Helen Stratton Would  ...  https://i.dr.com.tr/cache/154x170-0/originals/...   
..                    ...  ...                                                ...   
85  Constanze Von Kitzing  ...  https://i.dr.com.tr/cache/154x170-0/originals/...   
86       Aziz Sivaslıoğlu  ...  https://i.dr.com.tr/cache/154x170-0/originals/...   
87          J. K. Rowling  ...  https://i.dr.com.tr/cache/154x170-0/originals/...   
88         Susanna Tamaro  ...  https://i.dr.com.tr/cache/154x170-0/originals/...   
89            Sinan Yaşar  ...  https://i.dr.com.tr/cache/154x170-0/originals/...   

[90 rows x 3 columns]

CodePudding user response:

Just change this line:

all_imgs = [img["src"] for img in soup.select(".lazyloaded")]

To this line:

all_imgs = [img["data-src"] for img in soup.select(".prd img")]

CodePudding user response:

I just noticed that this is a different site from the one in your previous question.

This will get all the images from a page:

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0'
}

# Send the browser-like headers with the request and parse the body of that
# same response object.
r = requests.get(
    'https://www.dr.com.tr/kategori/Kitap/Cocuk-ve-Genclik/grupno=00884?Page=1',
    headers=headers,
)
soup = BeautifulSoup(r.text, "html.parser")
# The image URL is read from data-src on every img.lazy element; .get()
# gives None instead of raising when the attribute is missing.
all_imgs = [img.get('data-src') for img in soup.select('img.lazy')]
print(all_imgs)

Result:

['https://cdn.bkmkitap.com/cok-bekledim-seni-yaz-tatili-kral-sakir-11-11384602-72-K.jpg', 'https://cdn.bkmkitap.com/dragon-ball-1-ve-2-9437721-50-K.jpg', 'https://cdn.bkmkitap.com/lo-10742711-71-K.jpg', 'https://cdn.bkmkitap.com/gunesi-uyandiralim-358512-11575004-35-K.jpg', 'https://cdn.bkmkitap.com/monster-cilt-2-11691276-71-K.jpg', 'https://cdn.bkmkitap.com/haikyu-4-9464025-44-K.jpg',...]
  • Related