I've been trying for a long time to write the results to my file, but since it's a multithreaded task the writes get interleaved and the file ends up mixed up. The code that appends to the file is in the get_url function, and this function is launched via pool.submit(get_url, line).
import requests
from concurrent.futures import ThreadPoolExecutor
import fileinput
from bs4 import BeautifulSoup
import traceback
import threading
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import warnings

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

count_requests = 0
host_error = 0

def get_url(url):
    try:
        global count_requests
        result_request = requests.get(url, verify=False)
        soup = BeautifulSoup(result_request.text, 'html.parser')
        with open('outfile.txt', 'a', encoding="utf-8") as f:
            f.write(soup.title.get_text())
        count_requests = count_requests + 1
    except:
        global host_error
        host_error = host_error + 1

with ThreadPoolExecutor(max_workers=100) as pool:
    for line in fileinput.input(['urls.txt']):
        pool.submit(get_url, line)
        print(str("requests success : ") + str(count_requests) + str(" | requests error ") + str(host_error), end='\r')
This is what the output looks like:
google.com - Google
w3schools.com - W3Schools Online Web Tutorials
CodePudding user response:
You can use multiprocessing.Pool and Pool.imap_unordered to receive the processed results and write them to the file. That way the results are written only in the main process and won't be interleaved. For example:
import requests
import multiprocessing
from bs4 import BeautifulSoup

def get_url(url):
    # do your processing here:
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    return soup.title.text

if __name__ == "__main__":
    # read urls from file or other source:
    urls = ["http://google.com", "http://yahoo.com"]

    with multiprocessing.Pool() as p, open("result.txt", "a") as f_out:
        for result in p.imap_unordered(get_url, urls):
            print(result, file=f_out)
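A note on the design choice: imap_unordered yields results in completion order, so the lines in result.txt may not match the order of the urls list. If the order matters, Pool.imap preserves input order. Here is a minimal sketch of that variant (returning the url together with the title, as the second answer below also does, is an assumption on my part):

import multiprocessing

import requests
from bs4 import BeautifulSoup

def get_url(url):
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    # return the url as well, so each output line is self-describing
    return url, soup.title.text

if __name__ == "__main__":
    urls = ["http://google.com", "http://yahoo.com"]

    with multiprocessing.Pool() as p, open("result.txt", "a") as f_out:
        # imap (unlike imap_unordered) yields results in the same order as urls
        for url, title in p.imap(get_url, urls):
            print(f"{url} - {title}", file=f_out)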
CodePudding user response:
I agree with Andrej Kesely that we should not write to the file within get_url. Here is my approach:
from concurrent.futures import ThreadPoolExecutor, as_completed

def get_url(url):
    # Processing...
    title = ...
    return url, title

if __name__ == "__main__":
    with open("urls.txt") as stream:
        urls = [line.strip() for line in stream]

    with ThreadPoolExecutor() as executor:
        urls_and_titles = executor.map(get_url, urls)

    # Exiting the with block: all tasks are done
    with open("outfile.txt", "w", encoding="utf-8") as stream:
        for url, title in urls_and_titles:
            stream.write(f"{url},{title}\n")
This approach waits until all tasks have completed before writing out the results. If we want to write out each result as soon as it is available:
from concurrent.futures import ThreadPoolExecutor, as_completed

...

if __name__ == "__main__":
    with open("urls.txt") as stream:
        urls = [line.strip() for line in stream]

    with ThreadPoolExecutor() as executor, open("outfile.txt", "w", encoding="utf-8") as stream:
        futures = [
            executor.submit(get_url, url)
            for url in urls
        ]
        for future in as_completed(futures):
            url, title = future.result()
            stream.write(f"{url},{title}\n")
The as_completed() function yields the Future objects as they complete, so the ones that finish first are handled (and written out) first.
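To see that completion-order behaviour in isolation, here is a small standalone sketch (slow_echo is a made-up helper, not part of the answer's code) showing that results come back in the order the tasks finish, not the order they were submitted:

import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def slow_echo(delay):
    time.sleep(delay)  # simulate a request that takes `delay` seconds
    return delay

with ThreadPoolExecutor() as executor:
    futures = [executor.submit(slow_echo, d) for d in (3, 1, 2)]
    for future in as_completed(futures):
        print(future.result())  # prints 1, 2, 3: completion order, not submission order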
In conclusion, the key here is for the worker function get_url to return a value rather than write to the file itself; the writing is done in the main thread.