I try to parse site and have faiced with the next problem. I`m sure that maximum ammount of images of each goods is 7. Each image link write to list. And then save it in excell. So each link have the column like in file 1.xlsx. But there are some goods that have 3 or 5 images. So if the ammount of images less then 7, I want to fill the another field with empty string. But I get the result like in file 2.xlsx. Please, help me to fix that problem.
from datetime import datetime, timedelta
from time import sleep
import time, csv
from csv import reader
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import requests, json
def get_html(url):
headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
r = requests.get(url, headers=headers).content
return r
goods_link = ['https://www.johnlewis.com/a-a-k-s-hana-raffia-cross-body-bag-navy-multi/p5559710']
Images1 = []
Images2 = []
Images3 = []
Images4 = []
Images5 = []
Images6 = []
Images7 = []
Img = []
for i in goods_link:
soup = BeautifulSoup(get_html(i), 'html.parser')
imgContainer = soup.find('div', {'class':'ProductImages_productImagesContainer__1v2kP'})
imgAll = imgContainer.find_all('div', {'class':'ImageMagnifier_zoomable-image-container__db7jH'})
for j in imgAll:
imgSrc = j.find('img').get('src').split('?$rsp')[0]
Img.append(imgSrc)
[x.append(y) for x,y in zip([Images1, Images2, Images3, Images4, Images5, Images6, Images7], Img)]
info = {}
for ii in Images1:
info.setdefault('Images1',[])
info['Images1'].append(ii)
for ii in Images2:
info.setdefault('Images2',[])
info['Images2'].append(ii)
for ii in Images3:
info.setdefault('Images3',[])
info['Images3'].append(ii)
for ii in Images4:
info.setdefault('Images4',[])
info['Images4'].append(ii)
for ii in Images5:
info.setdefault('Images5',[])
info['Images5'].append(ii)
for ii in Images6:
info.setdefault('Images6',[])
info['Images6'].append(ii)
for ii in Images7:
info.setdefault('Images7',[])
info['Images7'].append(ii)
df = pd.DataFrame.from_dict(info)
df.to_excel('./output.xlsx')
print('Finish')
CodePudding user response:
IIUC you want to fill all 7 columns for each row, even if this row has less than 7 images.
The step of creating a dictionary is superflous. You can list all you images in lists that you append in a list of lists and create your DataFrame from it.
You can specify the headers with columns=
:
def get_html(url):
headers={
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
r = requests.get(url, headers=headers).content
return r
goods_link = ['https://www.johnlewis.com/a-a-k-s-hana-raffia-cross-body-bag-navy-multi/p5559710']
headers = ["Images1", "Images2", "Images3", "Images4", "Images5", "Images6", "Images7"]
img_table = []
for link in goods_link:
img_row = [None]*7
soup = BeautifulSoup(get_html(link), 'html.parser')
imgContainer = soup.find('div', {'class':'ProductImages_productImagesContainer__1v2kP'})
imgAll = imgContainer.find_all('div', {'class':'ImageMagnifier_zoomable-image-container__db7jH'})
for j, div_obj in enumerate(imgAll):
imgSrc = div_obj.find('img').get('src').split('?$rsp')[0]
img_row[j]=imgSrc
img_table.append(img_row)
df = pd.DataFrame(img_table, columns=headers)
df.to_excel('./output.xlsx')
print('Finish')
What was missing was to create a list of None
with length 7, then using enumerate
to replace the element at index j
with the corresponding link.
Please try to name your variables in a way that makes the code easier to understand next time.