Home > Enterprise >  Remove URLs from search results based on a word list and a visited-URLs text file, and write the result to 2 files
Remove URLs from search results based on a word list and a visited-URLs text file, and write the result to 2 files

Time:01-03

I am doing a Google search using `from googlesearch import search`. I get 50 URLs for my keyword and store them in a variable. I then have to filter those URLs in two ways: drop any URL that contains one of a list of words, and drop any URL that already appears in a text file of URLs visited during previous Google searches.

    
    # Load the URLs recorded by earlier searches. Each line is stripped because
    # readlines() keeps the trailing "\n", which prevents any stored URL from
    # ever matching a fresh search result; blank lines are skipped.
    # NOTE(review): this file is read from "visited_urls\\..." but appended to
    # under "URL\\visited_urls\\..." further down -- confirm both paths are
    # meant to point at the same file.
    with open("visited_urls\\visited_urls.txt", "r") as fw:
        visited_urls = {line.strip() for line in fw if line.strip()}
    # Remove URLs by words
    remove_urls_by_words = ['www.amazon.in', 'facebook.com', 'pepperfry.com', 'wikipedia.org', 'flipkart.com', 'myntra.com', 'pinterest.com', 'google.com']
    # `search_results` stands for the URLs returned by the google search;
    # `urls` is one result URL per iteration.
    for urls in search_results:
        # Skip results that contain any of the unwanted words.
        if any(remove_urls_by_word in urls for remove_urls_by_word in remove_urls_by_words):
            continue
        # Skip results that were already visited (in a previous run or
        # earlier in this same run).
        if urls in visited_urls:
            continue
        visited_urls.add(urls)
        # writing in New.txt
        with open("URL\\" + FolderName + '.txt', "a") as fw:
            fw.write("%s\n" % urls)
        # writing in visited.txt
        with open("URL\\visited_urls\\visited_urls.txt", "a") as f:
            f.write("%s\n" % urls)

Visited URLs text file example:

https://twitter.com/amazon?ref_src=twsrc^google|twcamp^serp|twgr^author
https://twitter.com/amazon/status/1476606565868359687?
https://www.primevideo.com/
https://www.aajtak.in/technology/tech-news/story/amazon-app-quiz-december-31-2021-get-answers-to-these-five-questions-to-win-rs-20000-ttec-1384039-2021-12-31
https://timesofindia.indiatimes.com/gadgets-news/amazon-app-quiz-december-31-2021-get-answers-to-these-five-questions-to-win-rs-30000-in-amazon-pay-balance/articleshow/88600235.cms
https://aws.amazon.com/
https://www.aboutamazon.com/
https://in.linkedin.com/company/amazon
https://www.youtube.com/user/amazon
https://www.crunchbase.com/organization/amazon

Thank you in advance.

CodePudding user response:

Took a stab at it. Hope it helps or points you to a better solution.

from googlesearch import search


class UniqueGoolgeUrlSearch:
    """Run a Google search and merge the result URLs into a text file.

    Constructing an instance does all the work immediately: it runs the
    search for ``query``, merges the returned URLs into ``file_name`` (one
    URL per line, each URL stored once) and prints a confirmation message.
    """

    def __init__(self, query, file_name):
        """Search for *query* and store the resulting URLs in *file_name*."""
        self.query = query
        self.file_name = file_name
        self.new_urls = set()
        self.search_urls()
        self.print_urls()

    def search_urls(self):
        """Collect the URLs returned for ``self.query`` and save them."""
        for url in search(self.query, num_results=3):
            self.new_urls.add(url)
        self.save_urls()

    def save_urls(self):
        """Merge ``self.new_urls`` into ``self.file_name`` without duplicates.

        URLs already in the file keep their existing order; URLs that are
        new are appended in sorted order so the output is deterministic.
        A missing file is treated as empty and is created by the write.
        """
        try:
            with open(self.file_name, 'r') as f:
                # Ignore blank lines so they are not stored as "URLs".
                stored = [line for line in f.read().splitlines() if line]
        except FileNotFoundError:
            stored = []
        # A dict keeps insertion order and drops duplicates in one pass;
        # update() leaves keys that already exist in their original position.
        merged = dict.fromkeys(stored)
        merged.update(dict.fromkeys(sorted(self.new_urls)))
        with open(self.file_name, 'w') as f:
            f.writelines(url + '\n' for url in merged)

    def print_urls(self):
        """Print the name of the file the URLs were saved to."""
        print("Saved urls to file: " + self.file_name)


if __name__ == '__main__':
    # Run one search per term; every search merges its URLs into the same file.
    search_terms = ['cats', 'dogs']
    file_name = 'urls.txt'
    for term in search_terms:
        print("Searching for: " + term)
        # Instantiating the class performs the search and saves the URLs.
        UniqueGoolgeUrlSearch(term, file_name)
        print("\n")
    print("Done")
  • Related