I have written a function for parsing news articles feeds.
def save_new_articles(feed, source_id, category_id):
channel_feed_title = feed.channel.title.title()
channel_feed_link = feed.channel.link
channel_feed_desc = feed.channel.description
official_source_id = source_id
post_category_id = category_id
for item in feed.entries:
parsed_summary = item.summary
soup = BeautifulSoup(parsed_summary, 'lxml')
images = soup.findAll('img')
for image in images:
image_url_link = (image['src'])
if image_url_link is not None:
image_link = image_url_link
else:
image_link = "https://www.publicdomainpictures.net/pictures/280000/velka/not-found-image-15383864787lu.jpg"
parsed_title = item.title
formatted = re.sub("<.*?>", "", parsed_title)
post_title = formatted
post_link = item.link
description = item.description
output_summary = re.sub("<.*?>", "", description)
title = item.title
capital = title.title()
tags = capital.split()
date_published = parser.parse(item.published)
if not Posts.objects.filter(guid=item.guid).exists():
post = Posts(
title = post_title,
link = post_link,
summary = output_summary,
image_url = image_link,
tags = tags,
pub_date = date_published,
guid = item.guid,
feed_title = channel_feed_title,
feed_link = channel_feed_link,
feed_description = channel_feed_desc,
source_id = official_source_id,
category_id = post_category_id
)
post.save()
else:
logger.info("Duplicate Post Detected! Skipping...")
But upon running the code I get:
image_url = image_link,
UnboundLocalError: local variable 'image_link' referenced before assignment
I don't understand where the error is coming from seeing as I had defined image_link
in the image for loop statement
above. I have checked similar answers on SO but I don't seem to find a suitable answer. Please help me debug this.
CodePudding user response:
This error happens when images
(which you initialise with soup.findAll('img')
) are empty.
Possibly you don't only have item.summary
without img
tags, but it also could be just empty due to some previous error.
Thereby to fix your code you should just init image_link
before this:
if not Posts.objects.filter(guid=item.guid).exists():
with your, as I assume, default value:
image_link = "https://www.publicdomainpictures.net/pictures/280000/velka/not-found-image-15383864787lu.jpg"
For example, like this, instead of:
for image in images:
image_url_link = (image['src'])
if image_url_link is not None:
image_link = image_url_link
else:
image_link = "https://www.publicdomainpictures.net/pictures/280000/velka/not-found-image-15383864787lu.jpg"
you can go this way:
image_link = "https://www.publicdomainpictures.net/pictures/280000/velka/not-found-image-15383864787lu.jpg"
for image in images:
image_url_link = (image['src'])
if image_url_link is not None:
image_link = image_url_link