Web scraper too slow (what can I do to speed up my scraper?)

Time:08-26

Here is my code. It is very slow even though I am not scraping a lot of data; my input file is only 188 KB. I think the problem is that I get all the internal URL links from each website and scrape every one of them to find certain keywords and count them. For each internal URL I have to make a request call, so I end up scraping many more pages than there are websites in the file.

I am thinking the solution might be to deploy it to a virtual machine on something like AWS, or is there anything I could change in my code to make it more efficient?


import re
# import xlwt
import time
import string
import requests
import pandas as pd
from bs4 import BeautifulSoup
from boilerpy3 import extractors
from urllib.parse import urlparse
# from urllib.parse import urljoin
# from sklearn.feature_extraction import text
# from sklearn.feature_extraction.text import CountVectorizer

# downloads web page content
def get_web_page_content(page_url:str) -> object:
    response = None
    connection_status = True
    try:
        response = requests.get(page_url)
        time.sleep(5)
        if response.status_code != 200:
            raise Exception('Web page response code is not 200.')
    except:
        connection_status = False
    return response, connection_status

# transforms HTML content into a BeautifulSoup object
def create_beautiful_soup_object(response:object) -> object:
    # '''
    # Takes a response object and returns a BeautifulSoup object
    # '''
    bs = BeautifulSoup(response.content, 'html.parser')
    return bs

# get page title
def get_page_title(bs:object) -> str:
    # '''
    # Takes a beautiful soup object and returns the page title
    # '''
    try:
        title = bs.title.text
    except AttributeError as e:
        return None
    return title

# the subdirectory describes the document type
def extract_subdirectory(page_url:str) -> str:
    # '''
    # Takes a page URL and returns the subdirectory in the URL
    # '''
    # Defines the regex to match subdirectory in an URL
    regex = re.compile(fr"^(?:https:\/\/{page_url}\/|\/)(?P<subdirectory>(\w -?)*)\/")
    match_obj = re.search(regex, page_url)
    if match_obj:
        subdirectory = match_obj.group('subdirectory')
        # replace - with a space
        rtn_value = subdirectory.replace('-', ' ')
    else:
        rtn_value = 'n/a'
    return rtn_value

def extract_text_data_from_web_page(web_content:str) -> str:
    # print("web_content", web_content)
    # '''
    # Takes a web content as an argument.
    # Returns a text string without clutter.
    # '''
    try:
        extractor = extractors.ArticleExtractor()
        return extractor.get_content(web_content)
    except:
        pass


# cleans text data. references: https://medium.com/nwamaka-imasogie/stand-up-comedy-and-nlp-c7d64002520c
def clean_text_data(raw_text:str) -> str:
    # print("str", str)
    rtn_value = ''

    try:
        rtn_value = raw_text
        # 1.    Make text all lower case
        rtn_value = rtn_value.lower()
        # 2.    Remove punctuation
        rtn_value = re.sub(r'[{}©]'.format(re.escape(string.punctuation)), '', rtn_value)
        # 3.    Remove numerical values
        rtn_value = re.sub(r'\w*\d+\w*', '', rtn_value)
        # 4.    Remove common non-sensical text
        rtn_value = re.sub(r' *\n *', ' ', rtn_value)
    except:
        pass

    return rtn_value


def is_absolute(url):
    return bool(urlparse(url).netloc)

# counts keyword frequencies in the cleaned text
def count_word_frequencies(clean_text, name, turnover, size, employees, address, town, postcode, sic_code, directors, telephone, email, website):
    # print('clean_text', clean_text)

    word_frequency_dict = dict()

    list1 = clean_text.split()  # this is your original list of words
    print('list1', list1)
    word_frequency_dict["name"] = name
    word_frequency_dict["turnover"] = turnover
    word_frequency_dict["size"] = size
    word_frequency_dict["employees"] = employees
    word_frequency_dict["address"] = address
    word_frequency_dict["town"] = town
    word_frequency_dict["postcode"] = postcode
    word_frequency_dict["sic_code"] = sic_code
    word_frequency_dict["directors"] = directors
    word_frequency_dict["telephone"] = telephone
    word_frequency_dict["email"] = email
    word_frequency_dict["website"] = website

    word_frequency_dict["sustainability"] = list1.count('sustainability')
    word_frequency_dict["sustainable"] = list1.count('sustainable')
    word_frequency_dict["buffet"] = list1.count('buffet')
    word_frequency_dict["kitchen"] = list1.count('kitchen')
    word_frequency_dict["ISO 14001"] = list1.count('ISO 14001')
    word_frequency_dict["b corp"] = list1.count('b corp')
    word_frequency_dict["brasserie"] = list1.count('brasserie')
    word_frequency_dict["community"] = list1.count('community')
    word_frequency_dict["social"] = list1.count('social')
    word_frequency_dict["green credentials"] = list1.count('green credentials')
    word_frequency_dict["environment"] = list1.count('environment')
    word_frequency_dict["environmental"] = list1.count('environmental')
    word_frequency_dict["food"] = list1.count('food')
    word_frequency_dict["book a table"] = list1.count('book a table')
    word_frequency_dict["planet"] = list1.count('planet')
    word_frequency_dict["planet earth"] = list1.count('planet earth')
    word_frequency_dict["compostable"] = list1.count('compostable')
    word_frequency_dict["recyclable"] = list1.count('recyclable')
    word_frequency_dict["eco friendly"] = list1.count('eco friendly')
    word_frequency_dict["restaurant"] = list1.count('restaurant')
    word_frequency_dict["bistro"] = list1.count('bistro')
    word_frequency_dict["take away"] = list1.count('take away')
    word_frequency_dict["climate emissions"] = list1.count('climate emissions')
    word_frequency_dict["coffee"] = list1.count('coffee')
    word_frequency_dict["tea"] = list1.count('tea')
    word_frequency_dict["menu"] = list1.count('menu')
    word_frequency_dict["just eat"] = list1.count('just eat')
    word_frequency_dict["uber eat"] = list1.count('uber eat')
    word_frequency_dict["deliveroo"] = list1.count('deliveroo')
    word_frequency_dict["reservations"] = list1.count('reservations')
    word_frequency_dict["corporate social responsibility"] = list1.count('corporate social responsibility')
    word_frequency_dict["freshly squeezed juice"] = list1.count('freshly squeezed juice')
    word_frequency_dict["breakfast"] = list1.count('breakfast')
    word_frequency_dict["lunch"] = list1.count('lunch')
    word_frequency_dict["dinner"] = list1.count('dinner')
    word_frequency_dict["hotel"] = list1.count('hotel')
    word_frequency_dict["csr"] = list1.count('csr')
    word_frequency_dict["net zero"] = list1.count('net zero')
    word_frequency_dict["carbon offsetting"] = list1.count('carbon offsetting')
    word_frequency_dict["type"] = 'hospitality business'

    return word_frequency_dict


def get_response_text(internal_links):
    content = []

    for link in internal_links:
        try:
            response, status = get_web_page_content(link)
            time.sleep(5)
            print("response", response)
            print("status", status)
            if status == True:
                text_data = extract_text_data_from_web_page(response.text)
                clean_text = clean_text_data(text_data)
                print("clean_text", clean_text)

                content.append(clean_text)
        except:
            pass
    return content


# counts how frequently the key words appear across the web pages
def get_frequently_words(internal_links, name, turnover, size, employees, address, town, postcode, sic_code, directors, telephone, email, website):
    contents = get_response_text(internal_links)
    print("contents", contents)
    all_strings = list(map(str, contents))
    one_single_text = ' '.join(all_strings)
    dict_word_frequency = count_word_frequencies(one_single_text, name, turnover, size, employees, address, town, postcode, sic_code, directors, telephone, email, website)
    return dict_word_frequency

# finds a list of all internal links
def get_internal_links(soup, page_url):
    # '''
    # Takes a beautiful soup object and returns a list of all Internal links
    # '''
    # Initialize the return value
    rtn_value = list()
    # Defines the regex to find internal links. Hardcode the base URL
    regex = re.compile(fr'^((https:\/\/)?{page_url}\/|\/).+')
    links = soup.find_all('a', {'href': regex})

    if len(links) < 1:
        rtn_value.append(page_url)
        return rtn_value
    else:
        for link in links:
            try:
                href = link['href']

                if is_absolute(href):
                    rtn_value.append(href)
                else:
                    rtn_value.append(page_url + href)

            except AttributeError as e:
                pass

        if page_url not in rtn_value:
            rtn_value.append(page_url)

        return rtn_value


if __name__ == '__main__':
    df = pd.read_csv('./Endole.csv')

    name = df['Name'].to_list()
    turnover = df['Turnover'].to_list()
    size = df['Size'].to_list()
    employees = df['No. Of Employees'].to_list()
    address = df['Address'].to_list()
    town = df['Post Town'].to_list()
    postcode = df['Postcode'].to_list()
    sic_code = df['SIC Code'].to_list()
    directors = df['Directors'].to_list()
    telephone = df['Telephone'].to_list()
    email = df['Email Address'].to_list()
    website = df['Website'].to_list()

    page_urls = df['Website'].to_list()
    # print("NAMES", NAMES)
    # print("LINKS", LINKS)
    for i, page_url in enumerate(page_urls):
        # page_url = 'https://www.hotelanacapri.co.uk'
        response, status = get_web_page_content(page_url)
        print("response", response)
        print("status", status)
        if status == True:
            soup = create_beautiful_soup_object(response)
            page_title = get_page_title(soup)
            document_type = extract_subdirectory(page_url)
            internal_links = get_internal_links(soup, page_url)
            print("internal_links", internal_links)
            high_frequency_words = get_frequently_words(internal_links, name[i], turnover[i], size[i], employees[i], address[i], town[i], postcode[i], sic_code[i], directors[i], telephone[i], email[i], website[i])
            print("high_frequency_words", high_frequency_words)
            new_df = pd.DataFrame(high_frequency_words, index=[0])
            new_df.to_csv('testing.csv', mode='a', index=False, header=False)

CodePudding user response:

You can use the standard Python package asyncio. Most of the time is spent waiting for responses, and since you make one request per internal link, those waits add up quickly. asyncio lets you run coroutines that wait for responses concurrently instead of one after another. Note that you cannot use the requests module with asyncio, since that library does not support asynchrony; look at aiohttp instead.
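As a rough illustration, here is a minimal sketch of how the fetching step could be made concurrent with aiohttp. The names fetch and gather_pages, the 30-second timeout, and the choice to return page text rather than full response objects are all assumptions for the example, not part of your code:

import asyncio
import aiohttp

# Minimal sketch (assumed names, not from the original code):
# download every internal link concurrently and return the HTML text,
# or None for pages that fail or do not return HTTP 200.
async def fetch(session, url):
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response:
            if response.status == 200:
                return await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError):
        pass
    return None

async def gather_pages(urls):
    async with aiohttp.ClientSession() as session:
        # asyncio.gather schedules all fetches at once and returns the
        # results in the same order as the input URLs
        return await asyncio.gather(*(fetch(session, url) for url in urls))

# usage, e.g. inside get_response_text:
# html_pages = asyncio.run(gather_pages(internal_links))

Independently of concurrency, the two time.sleep(5) calls (one in get_web_page_content and one in get_response_text) add at least ten seconds of pure waiting per internal link, so reducing or removing them will speed things up on their own.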

Another way is to use multithreading. In Python it doesn't work quite as you might expect because of the GIL (see: What is the global interpreter lock (GIL) in CPython?), but since your workload is mostly waiting on network I/O, threads still let the requests run in parallel, so this should help you too.
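For example, a minimal sketch with concurrent.futures that reuses your existing get_web_page_content function; the worker count of 10 is just an arbitrary starting value to tune:

from concurrent.futures import ThreadPoolExecutor

# Minimal sketch: run the existing blocking requests in a thread pool.
# max_workers=10 is an assumed starting point, not a recommendation.
def fetch_all_threaded(internal_links, max_workers=10):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map keeps the results in the same order as internal_links
        results = list(executor.map(get_web_page_content, internal_links))
    # each item is the (response, connection_status) tuple your function returns
    return results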

You can also use multiprocessing: run the requests in a pool of worker processes, with each request handled in a separate process.
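A sketch along the same lines with multiprocessing.Pool (again reusing your get_web_page_content; the pool size of 4 and the fetch_text helper are just assumptions for the example):

from multiprocessing import Pool

# Minimal sketch: fetch pages in worker processes. Returning plain text
# instead of Response objects keeps the data simple to pass back from
# the workers. Pool(processes=4) is an arbitrary example value.
def fetch_text(url):
    response, status = get_web_page_content(url)
    return response.text if status and response is not None else None

def fetch_all_processes(internal_links):
    with Pool(processes=4) as pool:
        return pool.map(fetch_text, internal_links)

# multiprocessing needs the __main__ guard your script already has:
# if __name__ == '__main__':
#     texts = fetch_all_processes(internal_links)

Since the scraper spends its time waiting on the network rather than on the CPU, threads or asyncio are usually the lighter-weight option; processes make more sense if the text extraction and cleaning steps themselves become the bottleneck.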
