import requests
from bs4 import BeautifulSoup
# Scrape one page and bucket every hyperlink on it by keyword.
#
# The script downloads `url`, collects the href of every <a> tag in the
# document, picks out the hrefs containing "coupon" and "review", prints a
# summary and stores it as a dict in `productlist`.
#
# Module-level names left behind for later use:
#   all_links  - every href on the page (document order, duplicates kept)
#   all_coupon - hrefs containing the substring "coupon"
#   all_review - hrefs containing the substring "review"
#   *_counter  - number of DISTINCT URLs in the matching list
url = 'https://www.isitwp.com/hosting-reviews/'
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, timeout=30)
soup = BeautifulSoup(r.content, features='lxml')
items = soup.find_all('div', class_='entry-content')

# NOTE(review): the original indentation was lost; everything below is assumed
# to be the body of this loop -- confirm against the intended structure.
for item in items:
    productlist = []

    # NOTE(review): the scan runs on the whole `soup`, not on `item`, so each
    # iteration produces the same page-wide result -- confirm this is intended.

    # FIND ALL LINKS ON PAGE (single pass over the document)
    all_links = [link['href'] for link in soup.find_all('a', href=True)]
    # FIND URL CONTAINING: coupon
    all_coupon = [href for href in all_links if 'coupon' in href]
    # FIND URL CONTAINING: review
    all_review = [href for href in all_links if 'review' in href]

    # Counters report distinct URLs only; the lists themselves keep duplicates.
    all_links_counter = len(set(all_links))
    coupon_counter = len(set(all_coupon))
    review_counter = len(set(all_review))

    #print('All Links Found:', all_links_counter, ' times with Urls:', all_links)
    print('coupon Found:', coupon_counter, ' times with Urls:', all_coupon)
    print('review Found:', review_counter, ' times with Urls:', all_review)
    print()

    product = {
        'All Links Counter': all_links_counter,
        'All Links': ', '.join(all_links),
        'coupon Counter': coupon_counter,
        'coupon Links': ', '.join(all_coupon),
        'review Counter': review_counter,
        'review Links': ', '.join(all_review),
    }
    productlist.append(product)
    print('Product List:\n', product)
The above script scrapes the embedded page for URLs and saves the results as follows:
- all_links - all links on the page
- all_coupon - all links that contain the word coupon
- all_review - all links that contain the word review
I now need a way to identify the balance URLs as below:
balance_urls = all_links - (all_coupon + all_review)
Any help would be greatly appreciated.
CodePudding user response:
Use set difference:
balance_urls = list(set(all_links).difference(set(all_coupon + all_review)))
Or list comprehension if you want to preserve duplicates:
balance_urls = [l for l in all_links if l not in (all_coupon + all_review)]