I'm trying to save scraped data
to a csv file. However, I get this following error
TypeError: list indices must be integers or slices, not str. I think the error is coming from this piece of code.
csv_writer.writerow(str(row['url']), str(row['img']), str(row['text']))
Following is the entire code..
import requests
from bs4 import BeautifulSoup
import csv
page_url = 'https://alansimpson.me/python/scrape_sample.html'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4514.131 Safari/537.36'}
rawpage = requests.get(page_url, headers=headers)
soup = BeautifulSoup(rawpage.content, 'html5lib')
content = soup.article
link_list = []
for link in content.find_all('a'):
try:
url = link.get('href')
img = link.get('src')
text = link.span.text
link_list.append([{'url':url, 'img':img, 'text':text}])
except AttributeError:
pass
with open('links.csv', 'w', encoding='utf-8', newline='') as csv_out:
csv_writer = csv.writer(csv_out)
csv_writer.writerow(['url', 'img', 'text'])
for row in link_list:
csv_writer.writerow(str(row['url']), str(row['img']), str(row['text']))
print('All done')
Please Note: The following piece of code creates a file and write's the row
with open('links.csv', 'w', encoding='utf-8', newline='') as csv_out:
csv_writer = csv.writer(csv_out)
csv_writer.writerow(['url', 'img', 'text'])
CodePudding user response:
UPDATE
Using the csv.DictWriter()
:
with open('links.csv', 'w', encoding='utf-8', newline='') as csv_out:
i = csv.DictWriter(csv_out, fieldnames = set().union(*(d.keys() for d in link_list)))
i.writeheader()
i.writerows(link_list)
You could use set().union(*(d.keys() for d in link_list))
to fetch a list of keys from your dicts
or simply pass ['url', 'img', 'text']
as fieldnames
Example
import requests
from bs4 import BeautifulSoup
import csv
page_url = 'https://alansimpson.me/python/scrape_sample.html'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4514.131 Safari/537.36'}
rawpage = requests.get(page_url, headers=headers)
soup = BeautifulSoup(rawpage.content, 'html5lib')
content = soup.article
link_list = []
for link in content.find_all('a'):
try:
url = link.get('href')
img = link.img.get('src')
text = link.span.text
link_list.append({'url':url, 'img':img, 'text':text})
except AttributeError:
pass
with open('links.csv', 'w', encoding='utf-8', newline='') as csv_out:
i = csv.DictWriter(csv_out, fieldnames = set().union(*(d.keys() for d in link_list)))
i.writeheader()
i.writerows(link_list)
print('All done')
Alternativ approaches:
Store your data simply as dict
and not as list
with dict
in list
:
link_list.append({'url':url, 'img':img, 'text':text})
and write it like:
csv_writer.writerow([row['url'], row['img'], row['text']])
Or even simpler save it directly as list
:
link_list.append([url,img,text])
and write it as list:
csv_writer.writerow(row)
Example
import requests
from bs4 import BeautifulSoup
import csv
page_url = 'https://alansimpson.me/python/scrape_sample.html'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4514.131 Safari/537.36'}
rawpage = requests.get(page_url, headers=headers)
soup = BeautifulSoup(rawpage.content, 'html5lib')
content = soup.article
link_list = []
for link in content.find_all('a'):
try:
url = link.get('href')
img = link.img.get('src')
text = link.span.text
link_list.append({'url':url, 'img':img, 'text':text})
except AttributeError:
pass
with open('links.csv', 'w', encoding='utf-8', newline='') as csv_out:
csv_writer = csv.writer(csv_out)
csv_writer.writerow(['url', 'img', 'text'])
for row in link_list:
csv_writer.writerow([row['url'], row['img'], row['text']])
print('All done')
CodePudding user response:
Bug Fix :
replace this line img = link.get('src')
to img = link.img.get('src')
Updated Code :
import requests
from bs4 import BeautifulSoup
import pandas as pd
page_url = 'https://alansimpson.me/python/scrape_sample.html'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4514.131 Safari/537.36'}
rawpage = requests.get(page_url, headers=headers)
soup = BeautifulSoup(rawpage.content, 'html5lib')
content = soup.article
link_list = []
for link in content.find_all('a'):
try:
url = link.get('href')
img = link.img.get('src')
text = link.span.text
link_list.append({
'url':url,
'img':img,
'text':text,
})
except AttributeError:
pass
df = pd.DataFrame(link_list)
print(df)