I am trying to create python dictionary keys dynamically in order to write the data into a csv file, but I'm not getting anywhere so far. Here is my code:
import requests
from bs4 import BeautifulSoup
import csv


class ZiwiScraper:
    results = []

    headers = {
        'authority': '99petshops.com.au',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en,ru;q=0.9',
        'cache-control': 'max-age=0',
        # Requests sorts cookies= alphabetically
        # 'cookie': 'TrackerGuid=f5419f8d-632a-46b1-aa04-eed027d03e89; _ga=GA1.3.1385392550.1666770065; _gid=GA1.3.1560927430.1666770065',
        'referer': 'https://www.upwork.com/',
        'sec-ch-ua': '"Chromium";v="104", " Not A;Brand";v="99", "Yandex";v="22"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Linux"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'cross-site',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.114 YaBrowser/22.9.1.1110 (beta) Yowser/2.5 Safari/537.36',
    }

    def fetch(self, url):
        print(f'HTTP GET request to URL: {url}', end='')
        res = requests.get(url, headers=self.headers)
        print(f' | Status Code: {res.status_code}')
        return res

    def parse(self, html):
        soup = BeautifulSoup(html, 'lxml')
        titles = [title.text.strip() for title in soup.find_all('h2')]
        low_prices = [low_price.text.split(' ')[-1] for low_price in soup.find_all('span', {'class': 'hilighted'})]
        store_names = []
        stores = soup.find_all('p')
        for store in stores:
            store_name = store.find('img')
            if store_name:
                store_names.append(store_name['alt'])
        shipping_prices = [shipping.text.strip() for shipping in soup.find_all('p', {'class': 'shipping'})]
        price_per_hundered_kg = [unit_per_kg.text.strip() for unit_per_kg in soup.find_all('p', {'class': 'unit-price'})]
        other_details = soup.find_all('div', {'class': 'pd-details'})
        for index in range(0, len(titles)):
            try:
                price_per_100_kg = price_per_hundered_kg[index]
            except:
                price_per_100_kg = ''
            try:
                lowest_prices = low_prices[index]
            except:
                lowest_prices = ''
            for detail in other_details:
                detail_1 = [pr.text.strip() for pr in detail.find_all('span', {'class': 'sp-price'})]
                for idx, price in enumerate(detail_1):
                    self.results.append({
                        'title': titles[index],
                        'lowest_prices': lowest_prices,
                        f'lowest_price_{idx}': detail_1[idx],
                        'store_names': store_names[index],
                        'shipping_prices': shipping_prices[index],
                        'price_per_100_kg': price_per_100_kg,
                    })
        # json_object = json.dumps(self.results, indent=4)
        # with open("ziwi_pets_2.json", "w") as outfile:
        #     outfile.write(json_object)

    def to_csv(self):
        with open('ziwi_pets_2.csv', 'w') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            for row in self.results:
                writer.writerow(row)
        print('Stored results to "ziwi_pets_2.csv"')

    def run(self):
        for page in range(1):
            url = f'https://99petshops.com.au/Search?brandName=Ziwi Peak&animalCode=DOG&storeId=89/&page={page}'
            response = self.fetch(url)
            self.parse(response.text)
        self.to_csv()


if __name__ == '__main__':
    scraper = ZiwiScraper()
    scraper.run()
Every time I run the script it gives me ValueError: dict contains fields not in fieldnames: 'lowest_price_1'. The csv file is generated, but with only one entry:
title,lowest_prices,lowest_price_0,store_names,shipping_prices,price_per_100_kg
Ziwi Peak Dog Air-Dried Free Range Chicken Recipe 1Kg,$57.75,$64.60,Woofers World, $9.95 shipping,$5.78 per 100g
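If it helps, here is a minimal stand-alone script (with made-up rows and a hypothetical demo.csv file name) that seems to reproduce the same behaviour: DictWriter takes its fieldnames from the first dict only, writes that row, and then raises as soon as a later dict has a key that isn't in fieldnames - which would explain why exactly one entry ends up in the file:

import csv

rows = [
    {'title': 'A', 'lowest_price_0': '$1.00'},
    {'title': 'A', 'lowest_price_1': '$2.00'},  # key not present in the first row
]

with open('demo.csv', 'w') as f:
    writer = csv.DictWriter(f, fieldnames=rows[0].keys())
    writer.writeheader()
    for row in rows:
        writer.writerow(row)  # raises ValueError on the second row, after the first was written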
I tried to output it as json just to see how the data is formed, and it was also not what I expected:
[
    {
        "title": "Ziwi Peak Dog Air-Dried Free Range Chicken Recipe 1Kg",
        "lowest_prices": "$57.75",
        "lowest_price_0": "$64.60",
        "store_names": "Woofers World",
        "shipping_prices": " $9.95 shipping",
        "price_per_100_kg": "$5.78 per 100g"
    },
    {
        "title": "Ziwi Peak Dog Air-Dried Free Range Chicken Recipe 1Kg",
        "lowest_prices": "$57.75",
        "lowest_price_1": "$64.60",
        "store_names": "Woofers World",
        "shipping_prices": " $9.95 shipping",
        "price_per_100_kg": "$5.78 per 100g"
    },
    {
        "title": "Ziwi Peak Dog Air-Dried Free Range Chicken Recipe 1Kg",
        "lowest_prices": "$57.75",
        "lowest_price_2": "$64.95",
        "store_names": "Woofers World",
        "shipping_prices": " $9.95 shipping",
        "price_per_100_kg": "$5.78 per 100g"
    },
]
I expected something like this:
[
    {
        "title": "Ziwi Peak Dog Air-Dried Free Range Chicken Recipe 1Kg",
        "lowest_prices": "$57.75",
        "lowest_price_0": "$64.60",
        "lowest_price_1": "$64.60",
        "lowest_price_2": "$64.95",
        "store_names": "Woofers World",
        "shipping_prices": " $9.95 shipping",
        "price_per_100_kg": "$5.78 per 100g"
    },
]
Can anyone please help me out here? Thanks.
CodePudding user response:
You were creating the csv header based only upon the first result, which didn't contain the whole set of unique keys.
Change the to_csv function to this to resolve your problem:
def to_csv(self):
    key_list = list()
    for key_list_for_one_element in [list(x.keys()) for x in self.results]:
        key_list.extend(key_list_for_one_element)
    key_list = set(key_list)
    with open('ziwi_pets_2.csv', 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=key_list)
        writer.writeheader()
        for row in self.results:
            writer.writerow(row)
    print('Stored results to "ziwi_pets_2.csv"')
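As a side note, DictWriter has two parameters that are handy with ragged rows like these: restval (what to write into a column a row has no value for, default '') and extrasaction (what to do when a row has a key that is not in fieldnames, default 'raise'). A small sketch, reusing key_list from above, of how you might set them:

writer = csv.DictWriter(
    csv_file,
    fieldnames=key_list,
    restval='',              # filled into any column a row is missing
    extrasaction='ignore',   # silently drop keys not in fieldnames instead of raising
)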
CodePudding user response:
You need to pass as fieldnames all the keys that any item in self.results might have. The error should go away if you just add that into your to_csv function:
def to_csv(self):
    fNames = list(set([k for r in self.results for k in r.keys()]))
    fNames.sort(key=lambda r: (
        -1 * (int(r.replace('lowest_price_', '')) + 1) if 'lowest_price_' in r else 0, r
    ), reverse=True)
    with open('ziwi_pets_2.csv', 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fNames)
        writer.writeheader()
        for row in self.results:
            writer.writerow(row)
    print('Stored results to "ziwi_pets_2.csv"')
(The sort statement is optional - it's just to make sure the columns are in a certain order.)
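For illustration (assuming the keys shown in your JSON output), that sort key puts the fixed columns first, in reverse alphabetical order, and then the lowest_price_N columns in ascending numeric order:

fNames = ['lowest_price_1', 'title', 'lowest_price_0', 'price_per_100_kg',
          'lowest_prices', 'store_names', 'shipping_prices', 'lowest_price_2']
fNames.sort(key=lambda r: (
    -1 * (int(r.replace('lowest_price_', '')) + 1) if 'lowest_price_' in r else 0, r
), reverse=True)
print(fNames)
# ['title', 'store_names', 'shipping_prices', 'price_per_100_kg', 'lowest_prices',
#  'lowest_price_0', 'lowest_price_1', 'lowest_price_2']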
As for this bit ("I expected something like this"): the for idx, price in enumerate(detail_1) loop in the parse function is appending a new row for every lowest_price_idx value of each detail. To get them all in one row, change the outer loop to:
for detail in other_details:
    result_i = {
        'title': titles[index],
        'lowest_prices': lowest_prices,
        'store_names': store_names[index],
        'shipping_prices': shipping_prices[index],
        'price_per_100_kg': price_per_100_kg,
    }
    detail_1 = [pr.text.strip() for pr in detail.find_all('span', {'class': 'sp-price'})]
    for idx, price in enumerate(detail_1):
        result_i[f'lowest_price_{idx}'] = detail_1[idx]
    self.results.append(result_i)
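(If you prefer, the inner for loop can be collapsed into a single dict comprehension passed to update - the behaviour should be the same, it's purely a style choice:)

detail_1 = [pr.text.strip() for pr in detail.find_all('span', {'class': 'sp-price'})]
result_i.update({f'lowest_price_{idx}': price for idx, price in enumerate(detail_1)})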
Then your csv output will have all the lowest_price_N values in a single row, and your json will look as expected:
{
    "title": "Ziwi Peak Dog Air-Dried Free Range Chicken Recipe 1Kg",
    "lowest_prices": "$57.75",
    "store_names": "Woofers World",
    "shipping_prices": " $9.95 shipping",
    "price_per_100_kg": "$5.78 per 100g",
    "lowest_price_0": "$3.87",
    "lowest_price_1": "$4.49",
    "lowest_price_2": "$5.09",
    "lowest_price_3": "$5.95",
    "lowest_price_4": "$5.99"
}
(printed using print(json.dumps([r for r in scraper.results if len(r) == 10][0], indent=4)))