Home > Mobile >  Could not scrap some values from json file in python
Could not scrap some values from json file in python

Time:01-02

I would like to scrape data from a JSON file, but I cannot scrape the availability (the "available" key in the JSON) of each item. The other values are scraped successfully.

It shows up blank in the column.

varavailability= "" if i >= len(variants) else variants[i].get('available', '')
import asyncio
import os
import random
import time
import openpyxl
import aiohttp
from urllib import request

# path="C:/Users/pengoul/Downloads/dl" 
# Images are stored in a "download" directory under the current working
# directory.
path = os.getcwd()
print(f"CWD is {path}")
path = os.path.join(path, "download")
# exist_ok avoids the check-then-create race of a separate os.path.exists()
# test and makes re-running the script a no-op here.
os.makedirs(path, exist_ok=True)

# picpath= os.makedirs('picture')
def _product_rows(product):
    """Build the spreadsheet rows for one product dict.

    Images and variants are paired by position, so a product yields
    ``max(len(images), len(variants))`` rows.  When one of the two lists is
    shorter, its columns are written as ``""`` on the remaining rows.  This
    is why the SKU, price and availability cells are blank on the extra image
    rows of a product that has more images than variants.

    Args:
        product: One entry of the ``products`` list in the JSON response.

    Returns:
        A ``(rows, images)`` tuple: the list of row lists to append to the
        sheet, and the list of image URLs of the product.
    """
    title = product.get('title')
    product_id = product.get('id')
    product_type = product.get('product_type')
    images = [img.get('src', '') for img in product.get('images') or []]
    variants = product.get('variants') or []

    rows = []
    for idx in range(max(len(images), len(variants))):
        imgsrc = images[idx] if idx < len(images) else ""
        has_variant = idx < len(variants)
        varsku = variants[idx].get('sku', '') if has_variant else ""
        varprice = variants[idx].get('price', '') if has_variant else ""
        varavailability = variants[idx].get('available', '') if has_variant else ""
        # The id is written with a leading apostrophe, as in the original
        # "'" + str(id) expression.
        rows.append([title, f"'{product_id}", product_type, imgsrc,
                     varsku, varprice, varavailability])
    return rows, images


async def request():
    """Fetch the product list, write it to an Excel file and download images.

    Downloads ``products.json`` from the shop, writes one row per
    image/variant pair to ``result230102.xlsx`` and then saves every product
    image into the module-level ``path`` directory as ``<id>-<n>.jpg``.
    """
    # This coroutine's own name rebinds the module-level ``request`` imported
    # from urllib, so ``request.urlretrieve`` would look the attribute up on
    # this function.  Import the downloader locally under its own name.
    from urllib.request import urlretrieve

    async with aiohttp.ClientSession() as session:
        async with session.get(url='https://hiutdenim.co.uk/products.json?limit=500') as resp:
            html = await resp.json()

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    # NOTE(review): the last column holds the variant's "available" flag even
    # though its header says 'Product Tags' -- confirm the intended header.
    sheet.append(['Name', 'Barcode', 'Product Category', 'Image', 'Internal Reference', 'Sales Price','Product Tags'])

    downloads = []

    print("Saving to excel ...")
    for product in html['products']:
        rows, images = _product_rows(product)
        downloads.append((product.get('id'), images))
        for row in rows:
            sheet.append(row)
    # Save once after all rows are in place instead of once per product.
    workbook.save("result230102.xlsx")

    print("Downloading images ...")
    for product_id, images in downloads:
        for seq, imgurl in enumerate(images, start=1):
            if not imgurl:
                # An image entry without a 'src' cannot be fetched.
                continue
            print(f"Downloading img for {product_id} ({seq}/{len(images)})")
            # NOTE: urlretrieve is a blocking call inside a coroutine.
            urlretrieve(imgurl, os.path.join(path, f"{product_id}-{seq}.jpg"))

async def download(url):
    """Download a single image into the module-level ``path`` directory.

    Args:
        url: An indexable pair; ``url[0]`` is the image URL and ``url[1]`` is
            the file name stem, to which ``.jpg`` is appended.
    """
    image = url[0]
    file_name = f'{url[1]}.jpg'
    print(f'picpath/{file_name}')
    async with aiohttp.ClientSession() as session:
        # Random sub-second pause before the request.  It is awaited rather
        # than done with time.sleep() so the event loop is not blocked.
        await asyncio.sleep(random.random())
        async with session.get(image) as resp:
            payload = await resp.content.read()
    # os.path.join inserts the separator that plain concatenation of
    # ``path`` and ``file_name`` would omit.
    with open(os.path.join(path, file_name), mode='wb') as f:
        f.write(payload)

#     print(f'picpath/{file_name}')


async def main():
    """Ensure the download directory exists, then run ``request()``."""
    # exist_ok makes this safe whether or not the directory already exists.
    os.makedirs(path, exist_ok=True)
    await request()
    # Fanning out per-image download() tasks is not wired up here;
    # request() fetches the images itself.


if __name__ == '__main__':
    # Script entry point: print the process id, run main() and report the
    # elapsed wall-clock time in seconds.
    print(os.getpid())
    t1 = time.time()
    urls = []
    # asyncio.run() creates the event loop, runs main() to completion and
    # closes the loop afterwards.
    asyncio.run(main())
    t2 = time.time()
    print('total:', t2 - t1)

enter image description here

It shows up blank in this column.

I would like to scrape the values of "available" from the JSON.

enter image description here

CodePudding user response:

I ran your code in my debugger, putting a breakpoint at the line in question. This breakpoint is hit many times during execution. In some cases, it produces a True value for varavailability as you're expecting.

At some point, this line ends up executing when the value of i is 1 and the length of variants is also 1. In this case, per the if condition if i >= len(variants), the variable varavailability is set to "". i is allowed to have a value of 1 because the length of images in this case is 5. In this case, your loop for i in range(max(len(images), len(variants))): will iterate over i == 0 to i == 4. For each i value greater than 0, varavailability will be set to "". I can't be sure if this is the case you're wondering about, but it makes good sense that it is.

  • Related