I'm building a web scraper and I'm able to print all the data I need, but I'm struggling to add the data to my CSV file. I feel like I need to add another for loop or even a function. Currently I'm able to get it to write one row of scraped data values to the CSV, but it skips the 64 other rows of data values.
So far I've tried to put in another for loop and break up each variable into its own function, but it just breaks my code. Here's what I have so far; I feel like I'm just missing something.
# Question snippet: find every listing card, print its URL, image, title and
# price, then export the collected variables to raw_data.csv.
# NOTE(review): indentation was lost in this paste, and the `try:` that pairs
# with the `finally:` further down is not shown here.
#Gets listing box
listingBox = searchGrid.find_elements(By.CLASS_NAME, 'v2-listing-card')
#Loops through each listing box
for listingBoxes in listingBox:
# These four names are re-created as empty lists for each listing box and are
# then immediately rebound below to a single find_element result, so nothing
# accumulates across iterations.
listingUrl = []
listingImg = []
listingTitle = []
listingPrice = []
#Gets listing url
listingUrl = listingBoxes.find_element(By.CSS_SELECTOR, 'a.listing-link')
print("LISTING URL:", listingUrl.get_attribute('href'))
#Gets listing image
listingImg = listingBoxes.find_element(By.CSS_SELECTOR, 'img.wt-position-absolute')
print("IMAGE:", listingImg.get_attribute('src'))
#Gets listing title
listingTitle = listingBoxes.find_element(By.CLASS_NAME, 'wt-text-caption')
print("TITLE:", listingTitle.text)
#Gets price
listingPrice = listingBoxes.find_element(By.CLASS_NAME, 'currency-value')
print("ITEM PRICE: $", listingPrice.get_attribute("innerHTML"))
#Gets seller name
# listingSellerName = listingBoxes.find_element(By.XPATH, '/html/body/main/div/div[1]/div/div[3]/div[8]/div[2]/div[10]/div[1]/div/div/ol/li/div/div/a[1]/div[2]/div[2]/span[3]')
# print("SELLER NAME:", listingSellerName.get_attribute("innerHTML"))
print("---------------")
finally:
# Closes the browser session before the data is exported.
driver.quit()
# Each value here is whatever the name was last bound to in the loop: the
# find_element result for the final listing box only, not a list covering
# every listing. This is why only one row ends up in the CSV.
data = {'Listing URL': listingUrl, 'Listing Thumbnail': listingImg,'Listing Title': listingTitle, 'Listing Price': listingPrice}
# Build the frame with the dict keys as rows, then transpose so the keys
# become the column headers.
df = pd.DataFrame.from_dict(data, orient='index')
df = df.transpose()
df.to_csv('raw_data.csv')
print('Data has been scrapped and added.')
CodePudding user response:
In your code, each loop iteration resets the lists `listingUrl`, `listingImg`, etc. That's why `df` contains only one row of scraped data, corresponding to the last iteration executed. If you want to add elements to a list, you have to define the list BEFORE the loop and then use the `.append()` method inside the loop.
Then, instead of doing `listingUrl.get_attribute('href')` you will do `listingUrl[-1].get_attribute('href')`, where `[-1]` means that you are taking the last element of the list.
# Accumulators are created once, BEFORE the loop, so each listing box adds a
# new entry instead of overwriting the previous one.
# NOTE(review): these lists hold the element objects returned by find_element,
# not strings. Before building the DataFrame (and before driver.quit()),
# convert them to plain values, e.g. [e.get_attribute('href') for e in listingUrl].
listingUrl = []
listingImg = []
listingTitle = []
listingPrice = []

for listingBoxes in listingBox:
    # Gets listing url; [-1] is the element that was just appended
    listingUrl.append(listingBoxes.find_element(By.CSS_SELECTOR, 'a.listing-link'))
    print("LISTING URL:", listingUrl[-1].get_attribute('href'))

    # Gets listing image
    listingImg.append(listingBoxes.find_element(By.CSS_SELECTOR, 'img.wt-position-absolute'))
    print("IMAGE:", listingImg[-1].get_attribute('src'))

    # Gets listing title
    listingTitle.append(listingBoxes.find_element(By.CLASS_NAME, 'wt-text-caption'))
    print("TITLE:", listingTitle[-1].text)

    # Gets price
    listingPrice.append(listingBoxes.find_element(By.CLASS_NAME, 'currency-value'))
    print("ITEM PRICE: $", listingPrice[-1].get_attribute("innerHTML"))