I am doing a Google search using `from googlesearch import search`. I get 50 URLs for my keyword and store them in a variable. I then need to filter those URLs in two ways: drop any URL containing a blocked keyword, and drop any URL already recorded in a text file of visited URLs from previous searches.
# Load previously visited URLs; strip the trailing newline from each line
# so the substring comparisons below match clean URLs.
# (Mode must be "r" — "r " with a trailing space is an invalid mode.)
with open("visited_urls\\visited_urls.txt", "r") as fw:
    visited_urls = [line.strip() for line in fw]

# Domains whose search results should be discarded entirely.
remove_urls_by_words = ['www.amazon.in', 'facebook.com', 'pepperfry.com',
                        'wikipedia.org', 'flipkart.com', 'myntra.com',
                        'pinterest.com', 'google.com']

# `urls` holds one URL from the Google search (assigned elsewhere).
if not any(word in urls for word in remove_urls_by_words):
    trynew = urls
    # Skip anything already seen in a previous run.
    if not any(visited in trynew for visited in visited_urls):
        # Append the new URL to the per-keyword output file.
        # String pieces must be joined with "+".
        with open("URL\\" + FolderName + ".txt", "a") as fw:
            fw.write("%s\n" % trynew)
        # Record it as visited for future runs.
        # NOTE(review): this appends under "URL\\visited_urls\\..." but the
        # read above uses "visited_urls\\..." — confirm both point at the
        # same file, otherwise the de-dup check never sees new entries.
        with open("URL\\visited_urls\\visited_urls.txt", "a") as f:
            f.write("%s\n" % trynew)
Visited URLs text file example:
https://twitter.com/amazon?ref_src=twsrc^google|twcamp^serp|twgr^author
https://twitter.com/amazon/status/1476606565868359687?
https://www.primevideo.com/
https://www.aajtak.in/technology/tech-news/story/amazon-app-quiz-december-31-2021-get-answers-to-these-five-questions-to-win-rs-20000-ttec-1384039-2021-12-31
https://timesofindia.indiatimes.com/gadgets-news/amazon-app-quiz-december-31-2021-get-answers-to-these-five-questions-to-win-rs-30000-in-amazon-pay-balance/articleshow/88600235.cms
https://aws.amazon.com/
https://www.aboutamazon.com/
https://in.linkedin.com/company/amazon
https://www.youtube.com/user/amazon
https://www.crunchbase.com/organization/amazon
Thank you in advance.
CodePudding user response:
Took a stab at it. Hope it helps or points you to a better solution.
from googlesearch import search
class UniqueGoolgeUrlSearch:
    """Run a Google search and merge the result URLs into a text file,
    keeping the file free of duplicates across runs.

    (Class name — including the "Goolge" typo — kept unchanged so
    existing callers still work.)
    """

    def __init__(self, query, file_name):
        self.query = query          # search term passed to googlesearch
        self.file_name = file_name  # path of the URL store on disk
        self.new_urls = set()       # URLs discovered in this run
        self.search_urls()
        self.print_urls()

    def search_urls(self):
        """Collect result URLs for the query, then persist them."""
        for url in search(self.query, num_results=3):
            self.new_urls.add(url)
        self.save_urls()

    def save_urls(self):
        """Union this run's URLs with those already on disk and rewrite
        the file so each URL appears exactly once."""
        # Create the file if it does not exist yet ("x" raises if it does);
        # a with-block replaces the raw open/close pair so the handle is
        # closed even on error.
        try:
            with open(self.file_name, "x"):
                pass
        except FileExistsError:
            pass
        with open(self.file_name, "r") as f:
            old_urls = set(f.read().splitlines())
        old_urls.update(self.new_urls)
        # Sort for deterministic file contents; string pieces must be
        # joined with "+" (the original was missing the operator).
        with open(self.file_name, "w") as f:
            for url in sorted(old_urls):
                f.write(url + "\n")

    def print_urls(self):
        """Report where the URLs were written."""
        print("Saved urls to file: " + self.file_name)
if __name__ == '__main__':
    # All search terms share one output file; the class de-duplicates
    # across runs, so repeated URLs are stored only once.
    search_terms = ['cats', 'dogs']
    file_name = 'urls.txt'
    for term in search_terms:
        # String pieces must be joined with "+" (operator was missing).
        print("Searching for: " + term)
        UniqueGoolgeUrlSearch(term, file_name)
        print("\n")
    print("Done")