Here is my code. It is very slow even though I am not scraping a lot of data; my input file is only 188 KB. I think the problem is that I collect all the internal URL links from each website and scrape every one of them to find certain keywords and count them. Each internal URL needs its own request, so I end up fetching far more pages than there are rows in the file.
Is the solution to deploy it to a virtual machine such as AWS, or is there anything I could do in my code to make it more efficient?
import re
# import xlwt
import time
import string
import requests
import pandas as pd
from bs4 import BeautifulSoup
from boilerpy3 import extractors
from urllib.parse import urlparse
# from urllib.parse import urljoin
# from sklearn.feature_extraction import text
# from sklearn.feature_extraction.text import CountVectorizer
# downloads web page content
def get_web_page_content(page_url:str) -> object:
    response = None
    connection_status = True
    try:
        response = requests.get(page_url)
        time.sleep(5)
        if response.status_code != 200:
            raise Exception('Web page response code is not 200.')
    except:
        connection_status = False
    return response, connection_status
# transforms HTML content into a BeautifulSoup object
def create_beautiful_soup_object(response:object) -> object:
    # '''
    # Takes a response object and returns a BeautifulSoup object
    # '''
    bs = BeautifulSoup(response.content, 'html.parser')
    return bs
# get page title
def get_page_title(bs:object) -> str:
    # '''
    # Takes a beautiful soup object and returns the page title
    # '''
    try:
        title = bs.title.text
    except AttributeError as e:
        return None
    return title
# the subdirectory describes the document type
def extract_subdirectory(page_url:str) -> str:
    # '''
    # Takes a page URL and returns the subdirectory in the URL
    # '''
    # Defines the regex to match the subdirectory in a URL
    regex = re.compile(fr"^(?:https:\/\/{page_url}\/|\/)(?P<subdirectory>(\w+-?)*)\/")
    match_obj = re.search(regex, page_url)
    if match_obj:
        subdirectory = match_obj.group('subdirectory')
        # replace - with a space
        rtn_value = subdirectory.replace('-', ' ')
    else:
        rtn_value = 'n/a'
    return rtn_value
def extract_text_data_from_web_page(web_content:str) -> str:
    # print("web_content", web_content)
    # '''
    # Takes web content as an argument.
    # Returns a text string without clutter.
    # '''
    try:
        extractor = extractors.ArticleExtractor()
        return extractor.get_content(web_content)
    except:
        pass
# cleans text data. references: https://medium.com/nwamaka-imasogie/stand-up-comedy-and-nlp-c7d64002520c
def clean_text_data(raw_text:str) -> str:
    # print("raw_text", raw_text)
    rtn_value = ''
    try:
        rtn_value = raw_text
        # 1. Make text all lower case
        rtn_value = rtn_value.lower()
        # 2. Remove punctuation
        rtn_value = re.sub(r'[{}©]'.format(re.escape(string.punctuation)), '', rtn_value)
        # 3. Remove numerical values
        rtn_value = re.sub(r'\w*\d+\w*', '', rtn_value)
        # 4. Collapse newlines and surrounding spaces into single spaces
        rtn_value = re.sub(r' *\n *', ' ', rtn_value)
    except:
        pass
    return rtn_value
def is_absolute(url):
    return bool(urlparse(url).netloc)
# counts keyword frequencies in the cleaned text
def count_word_frequencies(clean_text, name, turnover, size, employees, address, town, postcode, sic_code, directors, telephone, email, website):
    # print('clean_text', clean_text)
    word_frequency_dict = dict()
    words = clean_text.split()  # individual words from the cleaned text
    print('words', words)
    word_frequency_dict["name"] = name
    word_frequency_dict["turnover"] = turnover
    word_frequency_dict["size"] = size
    word_frequency_dict["employees"] = employees
    word_frequency_dict["address"] = address
    word_frequency_dict["town"] = town
    word_frequency_dict["postcode"] = postcode
    word_frequency_dict["sic_code"] = sic_code
    word_frequency_dict["directors"] = directors
    word_frequency_dict["telephone"] = telephone
    word_frequency_dict["email"] = email
    word_frequency_dict["website"] = website
    # single-word keywords are counted against the word list;
    # multi-word phrases are counted as substrings of the lower-cased text,
    # since list.count() can never match a phrase that spans several words
    word_frequency_dict["sustainability"] = words.count('sustainability')
    word_frequency_dict["sustainable"] = words.count('sustainable')
    word_frequency_dict["buffet"] = words.count('buffet')
    word_frequency_dict["kitchen"] = words.count('kitchen')
    word_frequency_dict["ISO 14001"] = clean_text.count('iso 14001')
    word_frequency_dict["b corp"] = clean_text.count('b corp')
    word_frequency_dict["brasserie"] = words.count('brasserie')
    word_frequency_dict["community"] = words.count('community')
    word_frequency_dict["social"] = words.count('social')
    word_frequency_dict["green credentials"] = clean_text.count('green credentials')
    word_frequency_dict["environment"] = words.count('environment')
    word_frequency_dict["environmental"] = words.count('environmental')
    word_frequency_dict["food"] = words.count('food')
    word_frequency_dict["book a table"] = clean_text.count('book a table')
    word_frequency_dict["planet"] = words.count('planet')
    word_frequency_dict["planet earth"] = clean_text.count('planet earth')
    word_frequency_dict["compostable"] = words.count('compostable')
    word_frequency_dict["recyclable"] = words.count('recyclable')
    word_frequency_dict["eco friendly"] = clean_text.count('eco friendly')
    word_frequency_dict["restaurant"] = words.count('restaurant')
    word_frequency_dict["bistro"] = words.count('bistro')
    word_frequency_dict["take away"] = clean_text.count('take away')
    word_frequency_dict["climate emissions"] = clean_text.count('climate emissions')
    word_frequency_dict["coffee"] = words.count('coffee')
    word_frequency_dict["tea"] = words.count('tea')
    word_frequency_dict["menu"] = words.count('menu')
    word_frequency_dict["just eat"] = clean_text.count('just eat')
    word_frequency_dict["uber eat"] = clean_text.count('uber eat')
    word_frequency_dict["deliveroo"] = words.count('deliveroo')
    word_frequency_dict["reservations"] = words.count('reservations')
    word_frequency_dict["corporate social responsibility"] = clean_text.count('corporate social responsibility')
    word_frequency_dict["freshly squeezed juice"] = clean_text.count('freshly squeezed juice')
    word_frequency_dict["breakfast"] = words.count('breakfast')
    word_frequency_dict["lunch"] = words.count('lunch')
    word_frequency_dict["dinner"] = words.count('dinner')
    word_frequency_dict["hotel"] = words.count('hotel')
    word_frequency_dict["csr"] = words.count('csr')
    word_frequency_dict["net zero"] = clean_text.count('net zero')
    word_frequency_dict["carbon offsetting"] = clean_text.count('carbon offsetting')
    word_frequency_dict["type"] = 'hospitality business'
    return word_frequency_dict
def get_response_text(internal_links):
    content = []
    for link in internal_links:
        try:
            response, status = get_web_page_content(link)
            time.sleep(5)
            print("response", response)
            print("status", status)
            if status:
                text_data = extract_text_data_from_web_page(response.text)
                clean_text = clean_text_data(text_data)
                print("clean_text", clean_text)
                content.append(clean_text)
        except:
            pass
    return content
# counts how often the keywords appear across the scraped pages
def get_frequently_words(internal_links, name, turnover, size, employees, address, town, postcode, sic_code, directors, telephone, email, website):
    contents = get_response_text(internal_links)
    print("contents", contents)
    all_strings = list(map(str, contents))
    one_single_text = ' '.join(all_strings)
    dict_word_frequency = count_word_frequencies(one_single_text, name, turnover, size, employees, address, town, postcode, sic_code, directors, telephone, email, website)
    return dict_word_frequency
# finds a list of all internal links
def get_internal_links(soup, page_url):
    # '''
    # Takes a beautiful soup object and returns a list of all internal links
    # '''
    # Initialize the return value
    rtn_value = list()
    # Defines the regex to find internal links, anchored on the base URL
    regex = re.compile(fr'^((https:\/\/)?{page_url}\/|\/).+')
    links = soup.find_all('a', {'href': regex})
    if len(links) < 1:
        rtn_value.append(page_url)
        return rtn_value
    else:
        for link in links:
            try:
                href = link['href']
                if is_absolute(href):
                    rtn_value.append(href)
                else:
                    # relative link: prepend the base URL
                    rtn_value.append(page_url + href)
            except (KeyError, AttributeError):
                pass
        if page_url not in rtn_value:
            rtn_value.append(page_url)
        return rtn_value
if __name__ == '__main__':
    df = pd.read_csv('./Endole.csv')
    name = df['Name'].to_list()
    turnover = df['Turnover'].to_list()
    size = df['Size'].to_list()
    employees = df['No. Of Employees'].to_list()
    address = df['Address'].to_list()
    town = df['Post Town'].to_list()
    postcode = df['Postcode'].to_list()
    sic_code = df['SIC Code'].to_list()
    directors = df['Directors'].to_list()
    telephone = df['Telephone'].to_list()
    email = df['Email Address'].to_list()
    website = df['Website'].to_list()
    page_urls = df['Website'].to_list()
    # print("NAMES", NAMES)
    # print("LINKS", LINKS)
    for i, page_url in enumerate(page_urls):
        # page_url = 'https://www.hotelanacapri.co.uk'
        response, status = get_web_page_content(page_url)
        print("response", response)
        print("status", status)
        if status:
            soup = create_beautiful_soup_object(response)
            page_title = get_page_title(soup)
            document_type = extract_subdirectory(page_url)
            internal_links = get_internal_links(soup, page_url)
            print("internal_links", internal_links)
            high_frequency_words = get_frequently_words(internal_links, name[i], turnover[i], size[i], employees[i], address[i], town[i], postcode[i], sic_code[i], directors[i], telephone[i], email[i], website[i])
            print("high_frequency_words", high_frequency_words)
            new_df = pd.DataFrame(high_frequency_words, index=[0])
            new_df.to_csv('testing.csv', mode='a', index=False, header=False)
CodePudding user response:
You can use the standard Python package asyncio. Most of the time is spent waiting for responses, and because you make a separate request for every internal link, those waits add up quickly. asyncio lets you run coroutines that wait for many responses concurrently. Note that you cannot use the requests module with asyncio, since that library does not support asynchronous I/O; look towards aiohttp instead.
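Here is a rough sketch of that approach, reusing your existing extract_text_data_from_web_page and clean_text_data helpers; the fetch/fetch_all/get_response_text_async names and the 30-second timeout are just illustrative choices, not part of your code:

    import asyncio
    import aiohttp

    async def fetch(session, url):
        # one coroutine per URL; while this one waits on the network,
        # the event loop can make progress on the other downloads
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
                if response.status != 200:
                    return None
                return await response.text()
        except Exception:
            return None

    async def fetch_all(urls):
        # one shared session so connections are reused across requests
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(*(fetch(session, url) for url in urls))

    def get_response_text_async(internal_links):
        # drop-in alternative to get_response_text: download everything
        # concurrently, then clean each page with the existing helpers
        pages = asyncio.run(fetch_all(internal_links))
        content = []
        for html in pages:
            if html:
                content.append(clean_text_data(extract_text_data_from_web_page(html)))
        return content

asyncio.gather starts all the downloads at once, so the total wait is roughly the time of the slowest page rather than the sum of every page.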
Another way is to use multithreading. In Python it does not work quite the way you might expect because of the GIL (see: What is the global interpreter lock (GIL) in CPython?), but for I/O-bound work like this the GIL is released while a thread waits on the network, so threads still let your requests overlap and should help as well.
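For example, a ThreadPoolExecutor from the standard library's concurrent.futures module can fan your existing requests-based helpers out over a few worker threads; the function names and the pool size of 8 below are arbitrary illustrative choices:

    from concurrent.futures import ThreadPoolExecutor

    def get_response_text_threaded(internal_links, max_workers=8):
        content = []

        def download_and_clean(link):
            # reuses the existing helpers; while a thread waits on the
            # network the GIL is released, so the downloads overlap
            response, status = get_web_page_content(link)
            if status:
                text_data = extract_text_data_from_web_page(response.text)
                return clean_text_data(text_data)
            return None

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for result in executor.map(download_and_clean, internal_links):
                if result:
                    content.append(result)
        return content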
Also you can use multiprocessing and hand each request off to a worker process, although processes are heavier than threads and mainly pay off when the per-page processing itself is CPU-bound.
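A minimal sketch with a process pool (rather than one process per request) might look like the following; the fetch_and_clean/get_response_text_multiprocess names and the pool size of 4 are illustrative only. The worker has to be a module-level function so it can be pickled, and your existing if __name__ == '__main__': guard is needed for this to work on Windows:

    from multiprocessing import Pool

    def fetch_and_clean(link):
        # module-level worker so it can be pickled and sent to a child process
        response, status = get_web_page_content(link)
        if status:
            return clean_text_data(extract_text_data_from_web_page(response.text))
        return None

    def get_response_text_multiprocess(internal_links, processes=4):
        # each worker process downloads and cleans one page at a time
        with Pool(processes=processes) as pool:
            results = pool.map(fetch_and_clean, internal_links)
        return [text for text in results if text]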