I would like to scrape data from a JSON file; however, I could not scrape the availability (the "available" key in the JSON) of each variant. The other values are scraped successfully.
The availability column is blank in the output. The line in question is:
varavailability= "" if i >= len(variants) else variants[i].get('available', '')
import asyncio
import os
import random
import time
import openpyxl
import aiohttp
from urllib import request
# path="C:/Users/pengoul/Downloads/dl"
# Images are stored in a "download" folder under the current working directory.
path = os.getcwd()
print(f"CWD is {path}")
path = os.path.join(path, "download")
# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)
# picpath= os.makedirs('picture')
async def request():
    """Fetch the product catalogue, write it to Excel, then download images.

    Steps:
        1. GET the products JSON feed.
        2. Append one spreadsheet row per image/variant position of every
           product and save the workbook as ``result230102.xlsx``.
        3. Download every product image into the module-level ``path``
           directory as ``<product id>-<n>.jpg``.
    """
    # This coroutine is itself named ``request``, which shadows the
    # module-level ``from urllib import request``.  Import the downloader
    # under its own name so the call below cannot resolve to this coroutine.
    from urllib.request import urlretrieve

    async with aiohttp.ClientSession() as session:
        async with session.get(url='https://hiutdenim.co.uk/products.json?limit=500') as resp:
            html = await resp.json()

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(['Name', 'Barcode', 'Product Category', 'Image', 'Internal Reference', 'Sales Price', 'Product Tags'])

    products = []
    print("Saving to excel ...")
    for product in html['products']:
        rows, images = _product_rows(product)
        for row in rows:
            sheet.append(row)
        products.append((product.get('id'), images))
    workbook.save("result230102.xlsx")

    print("Downloading images ...")
    for id1, images in products:
        for seq, imgurl in enumerate(images, start=1):
            if not imgurl:
                # An image entry without a 'src' yields ''; nothing to fetch.
                continue
            print(f"Downloading img for {id1} ({seq}/{len(images)})")
            # NOTE: urlretrieve is a blocking call; acceptable here because
            # no other task is scheduled while this coroutine runs.
            urlretrieve(imgurl, os.path.join(path, f"{id1}-{seq}.jpg"))


def _product_rows(product):
    """Build the spreadsheet rows for one product dict from the JSON feed.

    Images and variants are paired by position.  A product with more images
    than variants produces image-only rows whose SKU, price and availability
    cells are '' -- this is why the availability column looks blank on those
    rows.  Likewise, variant-only rows get an empty image cell.

    Returns:
        A ``(rows, images)`` tuple: the list of row lists, and the list of
        image URLs extracted from the product.
    """
    from itertools import zip_longest

    title = product.get('title')
    product_id = product.get('id')
    product_type = product.get('product_type')
    # ``or []`` tolerates a missing key as well as an explicit null.
    images = [img.get('src', '') for img in product.get('images') or []]
    variants = product.get('variants') or []

    rows = []
    for imgsrc, variant in zip_longest(images, variants):
        variant = variant or {}
        rows.append([
            title,
            # Leading apostrophe keeps the long numeric id as text.
            "'" + str(product_id),
            product_type,
            imgsrc or "",
            variant.get('sku', ''),
            variant.get('price', ''),
            variant.get('available', ''),
        ])
    return rows, images
async def download(url):
    """Download a single image into the module-level ``path`` directory.

    Args:
        url: Indexable pair; item 0 is the image URL and item 1 is the
            file stem, which is saved as ``<stem>.jpg``.
    """
    image = url[0]
    file_name = f'{url[1]}.jpg'
    # Join with the OS separator; plain concatenation would glue the file
    # name directly onto the directory name.
    target = os.path.join(path, file_name)
    print(f'picpath/{file_name}')
    async with aiohttp.ClientSession() as session:
        # Random sub-second delay to spread requests out.  asyncio.sleep
        # yields to the event loop, whereas time.sleep would block it.
        await asyncio.sleep(random.random())
        async with session.get(image) as resp:
            payload = await resp.content.read()
    with open(target, mode='wb') as f:
        f.write(payload)
# print(f'picpath/{file_name}')
async def main():
    """Make sure the download directory exists, then run the scrape."""
    # exist_ok avoids the check-then-create race of a separate exists() test,
    # and makedirs also creates any missing parent directories.
    os.makedirs(path, exist_ok=True)
    # request() writes the spreadsheet and downloads the images itself; the
    # concurrent per-image fan-out through download() is not wired in.
    await request()
if __name__ == '__main__':
    print(os.getpid())
    t1 = time.time()
    # Kept for compatibility; not read by any active code path.
    urls = []
    # asyncio.run creates and closes the event loop; the older
    # get_event_loop()/run_until_complete pairing is deprecated.
    asyncio.run(main())
    t2 = time.time()
    print('total:', t2 - t1)
The availability column is blank in the output.
I would like to scrape the values of "available" from the JSON.
CodePudding user response:
I ran your code in my debugger, putting a breakpoint at the line in question. This breakpoint is hit many times during execution. In some cases, it produces a `True` value for `varavailability`, as you're expecting.
At some point, this line ends up executing when the value of `i` is `1` and the length of `variants` is also `1`. In this case, per the condition `if i >= len(variants)`, the variable `varavailability` is set to `""`.
`i` is allowed to have a value of `1` because the length of `images` in this case is `5`. Your loop `for i in range(max(len(images), len(variants))):` will therefore iterate from `i == 0` to `i == 4`. For each `i` value greater than `0`, `varavailability` will be set to `""`.
I can't be sure if this is the case you're wondering about, but it makes good sense that it is.