How to get the src of an image using its class? - CodePudding

Hi, I am trying to get the src attribute of an image on a website. I locate the image by its class, since the class is unique. The code below is able to locate the image, but it cannot save the image to MongoDB (the value shows up as null), so I want to find the src and save the link instead.

P.S. The code works for other classes, but I am not sure how to locate the src and save it into "findImage".

https://myaeon2go.com/products/category/6236298/vegetable

postal code is : 56000


# Category listing pages to scrape, one URL per category.
cate_list = [
    "https://myaeon2go.com/products/category/1208101/fresh-foods",
    "https://myaeon2go.com/products/category/8630656/ready-to-eat",
    "https://myaeon2go.com/products/category/6528959/grocery",
    "https://myaeon2go.com/products/category/6758871/snacks",
    "https://myaeon2go.com/products/category/8124135/chill-&-frozen",
    "https://myaeon2go.com/products/category/4995043/beverage",
    "https://myaeon2go.com/products/category/3405538/household",
    "https://myaeon2go.com/products/category/493239/baby-&-kids",
]


# Cookies sent with every request. Presumably they pre-select the delivery
# location (Kuala Lumpur, postal code 56000) so the site serves the product
# listing directly - verify against the site.
cookies = {
    "hideLocationOverlay": "true",
    "selectedShippingState": "Kuala Lumpur",
    "selectedPostalCode": "56000",
}

# Visit each category page in turn.
for x in range(len(cate_list)):

    url = cate_list[x]

    # Download the page and parse its HTML into a BeautifulSoup tree.
    result = requests.get(url, cookies=cookies)
    doc = BeautifulSoup(result.text, "html.parser")

# a for loop located here to loop through all the products

                # <span >myAEON2go Signature Taman Maluri</span>
                findImage = j.find("img", {"class": "pgJEkulRiYnxQNzO8njV shown"})

CodePudding user response：

To extract the value of the src attribute, simply call .get('src') on your element.

Try to change your strategy for selecting elements and avoid using classes, which are often generated dynamically - I recommend using more static identifiers as well as the HTML structure.

# Print the src of the first image inside every product list item of each
# category page.
for url in cate_list:
    result = requests.get(url, cookies=cookies, headers={'User-Agent': 'Mozilla/5.0'})
    doc = BeautifulSoup(result.text, "html.parser")
    for e in doc.select('.g-product-list li'):
        # e.img is None when the list item contains no <img>; calling
        # .get() on it would raise AttributeError, so skip such items.
        if e.img is None:
            continue
        print(e.img.get('src'))

Note: Iterating over your list does not need the range(len()) construct.

Example

import requests
from bs4 import BeautifulSoup

# Category listing pages to scrape, one URL per category.
cate_list = [
    "https://myaeon2go.com/products/category/1208101/fresh-foods",
    "https://myaeon2go.com/products/category/8630656/ready-to-eat",
    "https://myaeon2go.com/products/category/6528959/grocery",
    "https://myaeon2go.com/products/category/6758871/snacks",
    "https://myaeon2go.com/products/category/8124135/chill-&-frozen",
    "https://myaeon2go.com/products/category/4995043/beverage",
    "https://myaeon2go.com/products/category/3405538/household",
    "https://myaeon2go.com/products/category/493239/baby-&-kids",
]


# Cookies sent with every request.
cookies = {
    "hideLocationOverlay": "true",
    "selectedShippingState": "Kuala Lumpur",
    "selectedPostalCode": "56000",
}

for url in cate_list:
    result = requests.get(url, cookies=cookies, headers={'User-Agent': 'Mozilla/5.0'})
    doc = BeautifulSoup(result.text, "html.parser")
    for e in doc.select('.g-product-list li'):
        # e.img is None when the list item has no <img>, and .get('src')
        # returns None when the attribute is absent; either case would
        # raise AttributeError below, so skip those items.
        src = e.img.get('src') if e.img is not None else None
        if not src:
            continue
        # Keep only the part of the src value after the last ')/'.
        print(src.split(')/')[-1])

Output

https://assets.myboxed.com.my/1659400060229.jpg
https://assets.myboxed.com.my/1662502067580.jpg
https://assets.myboxed.com.my/1658448744726.jpg
https://assets.myboxed.com.my/1627880003755.jpg
https://assets.myboxed.com.my/1662507451284.jpg
https://assets.myboxed.com.my/1662501936757.jpg
https://assets.myboxed.com.my/1659400602324.jpg
https://assets.myboxed.com.my/1627880346297.jpg
https://assets.myboxed.com.my/1662501743853.jpg
...

CodePudding user response：

import requests
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed

# Cookies copied onto the shared session in main() before any request is made.
cookies = {
    "hideLocationOverlay": "true",
    "selectedShippingState": "Kuala Lumpur",
    "selectedPostalCode": "56000",
}

# Category path suffixes; main() submits one worker() call per entry, which
# combines the entry with the base URL passed to main().
links = [
    "8630656/ready-to-eat",
    "1208101/fresh-foods",
    "6528959/grocery",
    "6758871/snacks",
    "8124135/chill-&-frozen",
    "4995043/beverage",
    "3405538/household",
    "493239/baby-&-kids",
]
# Module-level accumulator: main() extends it with each worker's result list.
allin = []


def get_soup(content):
    """Parse *content* with the lxml parser, restricted to selected <img> tags.

    A SoupStrainer limits parsing to <img> elements carrying the class
    ``pgJEkulRiYnxQNzO8njV``, so the returned soup contains only those tags.
    """
    image_filter = SoupStrainer('img', class_="pgJEkulRiYnxQNzO8njV")
    return BeautifulSoup(content, 'lxml', parse_only=image_filter)


def worker(req, url, link):
    """Fetch one category page and return the URLs of its matching images.

    Args:
        req: Object exposing ``get(url)`` whose response has a ``content``
            attribute (main() passes a ``requests.Session``).
        url: Base URL; ``link`` is appended to it to form the page address.
        link: Category path suffix.

    Returns:
        A list with the ``src`` of every <img> kept by ``get_soup``, each
        resolved against ``url`` with ``urljoin``.
    """
    # The base and the suffix must be concatenated explicitly; without the
    # operator this call is a syntax error.
    r = req.get(url + link)
    soup = get_soup(r.content)
    # Skip images that carry no src attribute instead of raising KeyError.
    return [urljoin(url, x['src']) for x in soup.select('img') if x.get('src')]


def main(url):
    """Scrape every category in ``links`` concurrently and print all image URLs.

    One shared session (carrying ``cookies``) is used for all requests. Each
    category is handled by ``worker`` on a pool of up to 10 threads, and the
    results are appended to ``allin`` in completion order, then printed.
    """
    with requests.Session() as session, ThreadPoolExecutor(max_workers=10) as pool:
        session.cookies.update(cookies)
        pending = [pool.submit(worker, session, url, path) for path in links]
        for finished in as_completed(pending):
            allin.extend(finished.result())
        print(allin)


if __name__ == "__main__":
    main('https://myaeon2go.com/products/category/')