I want to write a web crawler that adds the links found on each page to the array the pool is working through, but the pool only processes the URLs it was given initially and does not process the links I append inside the function.
from concurrent import futures
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from urllib.request import urlopen
def linksSearchAndAppend(url):
    """Fetch ``url`` and add every link found on the page to the global ``urls``.

    The ``href`` of each ``<a>`` tag is resolved against ``url`` so that
    relative links such as ``/about`` become absolute URLs.

    Args:
        url: Address of the page to download and scan.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.

    Side effects:
        Extends the module-level ``urls`` list with the links that were found
        and prints the updated list.
    """
    # Imported locally so the function does not rely on an additional
    # module-level import.
    from urllib.parse import urljoin

    req = Request(url)
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(req) as html_page:
        soup = BeautifulSoup(html_page, "lxml")

    # href=True skips anchors without an href attribute, for which
    # link.get('href') would return None.
    links = [urljoin(url, link["href"]) for link in soup.find_all("a", href=True)]

    # extend() keeps ``urls`` a flat list of URL strings; append(links) would
    # insert the whole list as a single nested element.
    urls.extend(links)
    print(urls)
# Load the seed URLs, one per line; blank lines are skipped.
with open("urlList.txt", "r") as urlListend:
    urls = [line.rstrip() for line in urlListend if line.strip()]

# Main: fetch the seed URLs in parallel threads.
# Leaving the with-block shuts the pool down and waits for every task.
with futures.ThreadPoolExecutor(max_workers=8) as e:
    # Iterate over a snapshot: the workers add to ``urls`` while tasks run.
    # Links discovered by the workers are collected in ``urls`` but are not
    # submitted by this loop.
    for url in list(urls):
        e.submit(linksSearchAndAppend, url)
CodePudding user response:
from concurrent import futures
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from urllib.request import urlopen
def linksSearchAndAppend(url):
    """Fetch ``url``, collect the links on the page and add them to ``urls``.

    The ``href`` of each ``<a>`` tag is resolved against ``url`` so that
    relative links such as ``/about`` become absolute URLs.

    Args:
        url: Address of the page to download and scan.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.

    Side effects:
        Extends the module-level ``urls`` list with the links that were found
        and prints the links of this page.
    """
    # Imported locally so the function does not rely on an additional
    # module-level import.
    from urllib.parse import urljoin

    req = Request(url)
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(req) as html_page:
        soup = BeautifulSoup(html_page, "lxml")

    # href=True skips anchors without an href attribute, for which
    # link.get('href') would return None.
    links = [urljoin(url, link["href"]) for link in soup.find_all("a", href=True)]

    # extend() keeps ``urls`` a flat list of URL strings; append(links) would
    # insert the whole list as a single nested element.
    urls.extend(links)
    print(links)
# Load the seed URLs, one per line; blank lines are skipped.
with open("urlList.txt", "r") as urlListend:
    urls = [line.rstrip() for line in urlListend if line.strip()]

# Main: fetch the seed URLs in parallel threads.
# Leaving the with-block shuts the pool down and waits for every task.
with futures.ThreadPoolExecutor(max_workers=8) as e:
    # Iterate over a snapshot: the workers add to ``urls`` while tasks run.
    # Links discovered by the workers are collected in ``urls`` but are not
    # submitted by this loop.
    for url in list(urls):
        e.submit(linksSearchAndAppend, url)
CodePudding user response:
This works, but it would still need an "alreadysearchedUrls" array so that it doesn't repeat the search of the already-searched "urls".
from concurrent import futures
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from urllib.request import urlopen
def linksSearchAndAppend(url):
    """Download ``url`` and add the links found on the page to the global ``urls``.

    The ``href`` of each ``<a>`` tag is resolved against ``url`` so that
    relative links such as ``/about`` become absolute URLs.

    Args:
        url: Address of the page to download and scan.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.

    Side effects:
        Extends the module-level ``urls`` list with the links that were found
        and prints the updated list.
    """
    # Imported locally so the function does not rely on an additional
    # module-level import.
    from urllib.parse import urljoin

    req = Request(url)
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(req) as html_page:
        soup = BeautifulSoup(html_page, "lxml")

    # href=True skips anchors without an href attribute, for which
    # link.get('href') would return None.
    links = [urljoin(url, link["href"]) for link in soup.find_all("a", href=True)]

    # extend() keeps ``urls`` a flat list of URL strings; append(links) would
    # insert the whole list as a single nested element.
    urls.extend(links)
    print(urls)
# Load the seed URLs, one per line; blank lines are skipped.
with open("urlList.txt", "r") as urlListend:
    urls = [line.rstrip() for line in urlListend if line.strip()]


def _pendingUrls(entries, alreadysearchedUrls):
    """Return the not-yet-searched URLs from ``entries``, de-duplicated, in order."""
    pending = {}
    for entry in entries:
        # An entry is either a single URL or a whole list of links that a
        # worker added in one piece.
        candidates = [entry] if isinstance(entry, str) else entry
        for candidate in candidates:
            if isinstance(candidate, str) and candidate not in alreadysearchedUrls:
                pending[candidate] = None
    return list(pending)


# Main: crawl in rounds until a round discovers no new URL.
alreadysearchedUrls = set()
nextIndex = 0  # position in ``urls`` up to which entries were already examined
with futures.ThreadPoolExecutor(max_workers=8) as e:
    while True:
        # Slicing takes a snapshot, so worker threads may keep adding to
        # ``urls`` without disturbing this round.
        newEntries = urls[nextIndex:]
        nextIndex += len(newEntries)
        batch = _pendingUrls(newEntries, alreadysearchedUrls)
        if not batch:
            break
        # Mark before submitting so that no URL is ever searched twice.
        alreadysearchedUrls.update(batch)
        jobs = {e.submit(linksSearchAndAppend, url): url for url in batch}
        # Wait for the whole round so the next one sees every link found,
        # and report failures instead of dropping them silently.
        for job in futures.as_completed(jobs):
            error = job.exception()
            if error is not None:
                print("Could not search", jobs[job], "-", error)