I am a beginner trying to scrape a list of URLs and search each page for a word using asynchronous programming in Python. My code is as follows:
import asyncio
import aiohttp
from bs4 import BeautifulSoup as bsoup

async def fetch(session, url):
    # Download the raw HTML for one URL.
    async with session.get(url) as response:
        return await response.text()

def parse(wd, html, url):
    # Return every "sentence" in a <p> tag that contains the search word.
    add_soup = bsoup(html, 'html.parser')
    res = []
    for para in add_soup.find_all("p"):
        para_txt = para.text
        for sent_txt in para_txt.split("."):
            if wd in sent_txt:
                res.append([sent_txt, url])
    return res

async def scrape_urls(wd, urls):
    # Fetch and parse all URLs concurrently under one shared session.
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(fetch_and_parse(wd, session, url) for url in urls)
        )

async def fetch_and_parse(wd, session, url):
    html = await fetch(session, url)
    loop = asyncio.get_event_loop()
    # parse() is blocking, so run it in the default executor with all three arguments.
    paras = await loop.run_in_executor(None, parse, wd, html, url)
    return paras
I adapted the above code from this link, but I am unclear how to retrieve the resulting list. I am trying to get the results with co = scrape_urls("agriculture", urls), and, as expected, I get a coroutine object. How do I run the coroutine object and get the results out of it?
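For what it's worth, the synchronous parse step works when I call it directly on a small sample (the HTML below is made up for testing), so my problem is only with running the coroutine:

sample_html = "<p>Modern agriculture uses sensors. Yields improve.</p>"
print(parse("agriculture", sample_html, "https://example.com"))
# prints [['Modern agriculture uses sensors', 'https://example.com']]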
CodePudding user response:
Not entirely sure what issue you're facing. Calling scrape_urls only builds a coroutine; nothing runs until an event loop executes it. Once gather has combined the per-URL coroutines into a single awaitable, hand it to an event loop to execute it and collect the results:
loop = asyncio.get_event_loop()
group = scrape_urls("agriculture", urls)   # still just a coroutine at this point
results = loop.run_until_complete(group)   # run it to completion on the loop
loop.close()
print(results)                             # one list of [sentence, url] pairs per URL
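On Python 3.7+, asyncio.run is a simpler equivalent that creates, runs, and closes the event loop for you. A minimal sketch, assuming urls is a list you already have (the addresses below are placeholders):

import asyncio

urls = ["https://example.com", "https://example.org"]  # placeholder URLs
results = asyncio.run(scrape_urls("agriculture", urls))
print(results)

Note that asyncio.run cannot be called while another event loop is already running (for example, inside a Jupyter notebook); in that case, await scrape_urls("agriculture", urls) directly.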