I have the following soup:
<a href="some_url">next</a> ... From this I want to extract the href, "some_url",
and the whole list of the pages that are listed on this page: https://www.catholic-hierarchy.org/diocese/laa.html
Note: there are a whole lot of links to sub-pages which I need to parse. At the moment I am trying to get all the data out of them: dioceses, URLs, description, contact data, etc.
The example below will grab all URLs of dioceses, get some info about each of them and creates final dataframe. To speed-up the process multiprocessing.Pool is used:
But wait: how can I get this scraper running without multiprocessing? I want to run it in Colab, so I need to get rid of the multiprocessing feature.
How to achieve this..!?
from multiprocessing import Pool

import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_dioceses_urls(section_url):
    """Collect the URLs of every diocese listed in one section of the site.

    Starting at ``section_url``, each listing page is downloaded, the diocese
    links on it are gathered, and the "[Next Page]" link is followed until a
    page no longer has one.

    Args:
        section_url: Absolute URL of the first listing page of a section.

    Returns:
        A set of absolute diocese-page URLs.

    Note:
        Reads the global ``headers`` mapping for the HTTP request.
    """
    base_url = "https://www.catholic-hierarchy.org/diocese/"
    dioceses_urls = set()

    while True:
        print(section_url)
        soup = BeautifulSoup(
            requests.get(section_url, headers=headers).content, "lxml"
        )

        # Links whose relative href begins with "d" are joined onto the
        # base URL; the set removes duplicates across pages.
        for a in soup.select('ul a[href^="d"]'):
            dioceses_urls.add(base_url + a["href"])

        # is there Next Page button?
        # NOTE: the ``:has()`` pseudo-class is rejected by older Beautiful
        # Soup releases with NotImplementedError ("Only the following
        # pseudo-classes are implemented: nth-of-type").
        next_page = soup.select_one('a:has(img[alt="[Next Page]"])')
        if not next_page:
            break
        section_url = base_url + next_page["href"]

    return dioceses_urls
def get_diocese_info(url):
    """Download one diocese page and extract its headings and detail list.

    Args:
        url: Absolute URL of the diocese page.

    Returns:
        A dict with the keys "Title 1", "Title 2", "Title 3" and "URL", plus
        one entry per "key: value" list item found in the same list as the
        "Type of Jurisdiction:" item. A missing heading is reported as "-".

    Note:
        Reads the global ``headers`` mapping for the HTTP request.
    """
    print(url)
    soup = BeautifulSoup(requests.get(url, headers=headers).content, "html5lib")

    def heading_text(tag):
        # A page without the heading yields "-" instead of AttributeError.
        return tag.get_text(strip=True) if tag is not None else "-"

    data = {
        "Title 1": heading_text(soup.h1),
        "Title 2": heading_text(soup.h2),
        "Title 3": heading_text(soup.h3),
        "URL": url,
    }

    # Anchor on the <li> that mentions the jurisdiction type and has no child
    # tags; the list it belongs to holds the other details.
    li = soup.find(
        lambda tag: tag.name == "li"
        and "type of jurisdiction:" in tag.text.lower()
        and tag.find() is None
    )

    details = li.find_previous("ul") if li is not None else None
    if details is not None:
        for item in details.find_all("li"):
            text = item.get_text(strip=True, separator=" ")
            if ":" in text:
                # Split on the first colon only: values may contain colons
                # themselves (e.g. web site URLs).
                key, value = text.split(":", maxsplit=1)
                data[key.strip()] = value.strip()

    # get other info about the diocese
    # ...
    return data
if __name__ == "__main__":
    # Read as a global by get_dioceses_urls() and get_diocese_info().
    # NOTE(review): pool workers only inherit this when they are forked from
    # this process; under the "spawn" start method the name would be
    # undefined in the workers -- confirm the target platform.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0"
    }

    # get main sections:
    url = "https://www.catholic-hierarchy.org/diocese/laa.html"
    soup = BeautifulSoup(
        requests.get(url, headers=headers).content, "html.parser"
    )

    # The start page is itself a section; the remaining sections are the
    # links on it that carry target="_parent".
    main_sections = [url]
    for a in soup.select("a[target='_parent']"):
        main_sections.append(
            "https://www.catholic-hierarchy.org/diocese/" + a["href"]
        )

    all_data, dioceses_urls = [], set()
    with Pool() as pool:
        # get all dioceses urls:
        for urls in pool.imap_unordered(get_dioceses_urls, main_sections):
            dioceses_urls.update(urls)

        # get info about all dioceses:
        for info in pool.imap_unordered(get_diocese_info, dioceses_urls):
            all_data.append(info)

    # create dataframe from the info about dioceses
    df = pd.DataFrame(all_data).sort_values("Title 1")

    # save it to csv file
    df.to_csv("data.csv", index=False)

    # NOTE: DataFrame.to_markdown() needs the optional "tabulate" package.
    print(df.head().to_markdown())
update: well see what i get back if i run the script on colab:
https://www.catholic-hierarchy.org/diocese/laa.htmlhttps://www.catholic-hierarchy.org/diocese/lab.html
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/lib/python3.7/multiprocessing/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "<ipython-input-1-f5ea34a0190f>", line 21, in get_dioceses_urls
next_page = soup.select_one('a:has(img[alt="[Next Page]"])')
File "/usr/local/lib/python3.7/dist-packages/bs4/element.py", line 1403, in select_one
value = self.select(selector, limit=1)
File "/usr/local/lib/python3.7/dist-packages/bs4/element.py", line 1528, in select
'Only the following pseudo-classes are implemented: nth-of-type.')
NotImplementedError: Only the following pseudo-classes are implemented: nth-of-type.
"""
The above exception was the direct cause of the following exception:
NotImplementedError Traceback (most recent call last)
<ipython-input-1-f5ea34a0190f> in <module>
81 with Pool() as pool:
82 # get all dioceses urls:
---> 83 for urls in pool.imap_unordered(get_dioceses_urls, main_sections):
84 dioceses_urls.update(urls)
85
/usr/lib/python3.7/multiprocessing/pool.py in next(self, timeout)
746 if success:
747 return value
--> 748 raise value
749
750 __next__ = next # XXX
NotImplementedError: Only the following pseudo-classes are implemented: nth-of-type.
CodePudding user response:
The following is one way of getting that information, in an async fashion (should work on Colab notebooks). I got the dioceses urls from a different part of the site (Structured view - World Regions). I would expect the dioceses count there to match the count from the letters list.
from httpx import Client, AsyncClient, Limits
from bs4 import BeautifulSoup as bs
import pandas as pd
import re
from datetime import datetime
import asyncio
import nest_asyncio
# Patch asyncio so that asyncio.run() can be called where an event loop is
# already running (presumably the notebook case this answer targets).
nest_asyncio.apply()

# Result rows appended by get_diocese_info(); turned into a DataFrame once
# scraping has finished.
big_df_list = []

# Browser-like request headers passed to every HTTP client created below.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
def all_dioceses():
    """Return the diocese-page URLs linked from the seven "qview" pages.

    Pages ``qview1.html`` to ``qview7.html`` are fetched synchronously with a
    single client. Every ``ul > li > a`` link left after removing the
    ``ul#menu2`` element is turned into an absolute URL.

    Returns:
        A list of absolute URLs, in page order (duplicates are not removed).
    """
    base_url = 'https://www.catholic-hierarchy.org/diocese/'
    root_links = [f'https://www.catholic-hierarchy.org/diocese/qview{x}.html' for x in range(1, 8)]
    dioceses = []

    with Client(headers=headers, timeout=60.0, follow_redirects=True) as client:
        for root_link in root_links:
            r = client.get(root_link)
            soup = bs(r.text)

            # Remove the ul#menu2 list (presumably site navigation) so its
            # links are not collected; tolerate pages that lack it.
            menu = soup.select_one('ul#menu2')
            if menu is not None:
                menu.decompose()

            for link in soup.select('ul > li > a'):
                href = link.get('href')
                # Anchors without an href cannot form a URL; skip them.
                if href:
                    dioceses.append(base_url + href)

    return dioceses
# print(all_dioceses())
async def get_diocese_info(url):
    """Fetch one diocese page and append a summary row to ``big_df_list``.

    The row is ``(name, bishops, extra info, url)``; the two middle fields are
    the list items of the first and second table cell inside ``div#d1``,
    joined with ' | '. Any exception is printed together with the URL and no
    row is added.

    Args:
        url: Absolute URL of the diocese page.
    """

    def joined_items(cell):
        # Text of every <li> in the cell, separated by ' | '.
        return ' | '.join([item.get_text(strip=True) for item in cell.select('li')])

    async with AsyncClient(headers=headers, timeout=60.0, follow_redirects=True) as client:
        try:
            response = await client.get(url)
            page = bs(response.text)
            d_name = page.select_one('h1[align="center"]').get_text(strip=True)
            cells = page.select_one('div[id="d1"] > table').select('td')
            d_bishops = joined_items(cells[0])
            d_extra_info = joined_items(cells[1])
            row = (d_name, d_bishops, d_extra_info, url)
            big_df_list.append(row)
            print('done', d_name)
        except Exception as e:
            # Best effort: report the failing page and carry on.
            print(url, e)
async def scrape_dioceses(max_workers=100):
    """Scrape every diocese page with a bounded number of concurrent workers.

    One ``get_diocese_info`` coroutine is queued per URL returned by
    ``all_dioceses()``. ``max_workers`` worker coroutines then drain the
    queue, and the elapsed wall-clock time is printed at the end.

    Args:
        max_workers: Number of worker coroutines draining the queue
            concurrently. Defaults to 100.

    Raises:
        ValueError: If ``max_workers`` is smaller than 1.
    """
    if max_workers < 1:
        # With no workers the queued coroutines would never be awaited.
        raise ValueError('max_workers must be at least 1')

    start_time = datetime.now()

    tasks = asyncio.Queue()
    for x in all_dioceses():
        tasks.put_nowait(get_diocese_info(x))

    async def worker():
        # There is no await between empty() and get_nowait(), so another
        # worker cannot empty the queue in between.
        while not tasks.empty():
            await tasks.get_nowait()

    await asyncio.gather(*[worker() for _ in range(max_workers)])

    end_time = datetime.now()
    duration = end_time - start_time
    print('diocese scraping took', duration)
# Run the scrape to completion; rows accumulate in big_df_list.
asyncio.run(scrape_dioceses())

# One DataFrame row per successfully scraped diocese.
df = pd.DataFrame(
    big_df_list,
    columns=['Name', 'Bishops', 'Info', 'Url'],
)
print(df)
Result in terminal:
done Eparchy of Mississauga (Syro-Malabar)
done Eparchy of Mar Addai of Toronto (Chaldean)
done Eparchy of Saint-Sauveur de Montr�al (Melkite Greek)
done Diocese of Calgary
done Archdiocese of Winnipeg
[...]
diocese scraping took 0:03:02.366096
Name Bishops Info Url
0 Eparchy of Mississauga (Syro-Malabar) JoseKalluvelil, Bishop Type of Jurisdiction: Eparchy | Elevated:22 December2018 | Immediately Subject to the Holy See | Syro-Malabar Catholic Church of the Chaldean Tradition | Country:Canada | Mailing Address: Syro-Malabar Apostolic Exarchate, 6630 Turner Valley Rd., Mississauga, ON L5V 2P1, Canada | Telephone: (905)858-8200 | Fax: 858-8208 https://www.catholic-hierarchy.org/diocese/dmism.html
1 Eparchy of Mar Addai of Toronto (Chaldean) Robert SaeedJarjis, Bishop | Bawai (Ashur)Soro, Bishop Emeritus Type of Jurisdiction: Eparchy | Erected:10 June2011 | Immediately Subject to the Holy See | Chaldean Catholic Church of the Chaldean Tradition | Country:Canada | Conference Region:Ontario | Mailing Address: 2 High Meadow Place, Toronto, ON M9L 2Z5, Canada | Telephone: (416)746-5816 | Fax: 746-5850 https://www.catholic-hierarchy.org/diocese/dtoch.html
2 Eparchy of Saint-Sauveur de Montr�al (Melkite Greek) MiladJawish, B.S., Bishop Type of Jurisdiction: Eparchy | Elevated:1 September1984 | Immediately Subject to the Holy See | Melkite Greek Catholic Church of the Byzantine Tradition | Country:Canada | Conference Region:Quebec | Web Site:http://www.melkite.com/ | Mailing Address: 10025 boul. de l'Arcadie, Montreal, QC H4N 2S1, Canada | Telephone: (514)272.6430 | Fax: 202.1274 https://www.catholic-hierarchy.org/diocese/dmome.html
3 Diocese of Calgary William TerrenceMcGrattan, Bishop | Frederick BernardHenry, Bishop Emeritus Type of Jurisdiction: Diocese | Erected:30 November1912 | Metropolitan: Archdiocese ofEdmonton | Rite: Latin (or Roman) | Province: Alberta | Country:Canada | Square Kilometers: 110,500 (42,680 Square Miles) | Conference Region:West (Ouest) | Catholic Directory Abbreviation: Cal | Official Web Site:http://www.calgarydiocese.ca/ | Mailing Address: Catholic Pastoral Centre, Room 290, The Iona Building, 120-17th Avenue S.W., Calgary, AB T2S 2T2, Canada | Telephone: (403)218-5528 | Fax: 264-0272 https://www.catholic-hierarchy.org/diocese/dcalg.html
4 Archdiocese of Winnipeg Richard JosephGagnon, Archbishop | James VernonWeisgerber, Archbishop Emeritus Type of Jurisdiction: Archdiocese | Erected:4 December1915 | Immediately Subject to the Holy See | Rite: Latin (or Roman) | Province: Manitoba | Country:Canada | Square Kilometers: 116,405 (44,961 Square Miles) | Conference Region:West (Ouest) | Catholic Directory Abbreviation: W | Official Web Site:http://www.archwinnipeg.ca/ | Mailing Address: Chancery Office, 1495 Pembina Highway, Winnipeg, MB R3T 2C6, Canada | Telephone: (204)452-2227 | Fax: 475-4409 https://www.catholic-hierarchy.org/diocese/dwinn.html
... ... ... ... ...
2619 Archiepiscopal Exarchate of Krym (Ukrainian) Vacant | Makariy BohdanLeniv, O.S.B.M., Apostolic Administrator | MykhayloBubniy, C.SS.R., Archiepiscopal Administrator Type of Jurisdiction: Archiepiscopal Exarchate | Split:13 February2014 | Metropolitan: Archeparchy ofKyiv-Halyč {Kiev} (Ukrainian) | Ukrainian Catholic Church of the Byzantine Tradition | Country:Ukraine | Mailing Address: vul. Schmidta 22/12, 65000 Odessa, Ukraina | Telephone: (0482)32.58.90 | Fax: 32.58.89 https://www.catholic-hierarchy.org/diocese/dkrym.html
2620 Diocese of Lutsk VitaliySkomarovskyi, Bishop | MarkijanTrofym’yak, Bishop Emeritus Type of Jurisdiction: Diocese | Split:28 October1925 | Metropolitan: Archdiocese ofLviv | Rite: Latin (or Roman) | Country:Ukraine | Square Kilometers: 40,190 (15,523 Square Miles) | Official Web Site:http://catholic.volyn.ua/ | Mailing Address: Kuria Diecezjalna, vul. Katedralna 17, 43016 Lutsk, Ukraina | Telephone: (0332)72.15.32 | Fax: (same) https://www.catholic-hierarchy.org/diocese/dluts.html
2621 Diocese of Stockholm AndersArborelius, O.C.D., Cardinal, Bishop Type of Jurisdiction: Diocese | Elevated:29 June1953 | Immediately Subject to the Holy See | Rite: Latin (or Roman) | Country:Sweden | Square Kilometers: 450,295 (173,926 Square Miles) | Official Web Site:https://www.katolskakyrkan.se | Mailing Address: Katolska Biskopsambetet, Gotgatan 68, P.O. Box 4114, S-102 62 Stockholm, Sverige | Telephone: (08)462.66.02 | Fax: 702.05.55 https://www.catholic-hierarchy.org/diocese/dstos.html
2622 Archeparchy of Diarbekir (Amida) (Chaldean) RamziGarmou, Ist. del Prado, Archbishop Type of Jurisdiction: Archeparchy | Elevated:3 January1966 | Chaldean Catholic Church of the Chaldean Tradition | Country:Turkey | Mailing Address: Archeveche Chaldeen, Hamalbasi Caddesi 20, Galatasaray, 34435 Beyoglu, Istanbul, Turkiye | Telephone: (0212)252.34.49 | Fax: (same) https://www.catholic-hierarchy.org/diocese/ddiar.html
2623 Eparchy of Kolomyia (Ukrainian) VasylIvasyuk, Bishop Type of Jurisdiction: Eparchy | Split:12 September2017 | Metropolitan: Archeparchy ofIvano-Frankivsk [Stanislaviv] (Ukrainian) | Ukrainian Catholic Church of the Byzantine Tradition | Country:Ukraine | Square Kilometers: 14,000 (5,407 Square Miles) | Official Web Site:https://kolugcc.org.ua | Mailing Address: vul. Ivana Franka 29, 78200 Kolomyia, Ukraina | Telephone: (06891)19.707 https://www.catholic-hierarchy.org/diocese/dkolo.html
2624 rows × 4 columns
As you can see, this code will pull the full info for 2.6k dioceses in approx 3 minutes, while using far less resources than multiprocessing or multithreading.
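You will need to install the following packages (install or upgrade; just run these commands one by one in a Colab notebook). Note that asyncio has been part of the Python standard library since Python 3.4, so the first command can be skipped: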
pip install -U asyncio
pip install -U nest-asyncio
pip install -U httpx
pip install -U bs4
pip install -U pandas
I also imported re, in case you will want to select the bits of information one by one (Jurisdiction, Tradition, Address, website, and so on), each of them in a try/except block, to account for missing ones, and extend the list/dataframe accordingly. All packages above can be found on https://pypi.org/, and are documented.
CodePudding user response:
The problem with running the script on Google Colab is that it currently only supports Python 3.7, which doesn't support the newest version of BeautifulSoup, so your a:has selector is not supported. I have replaced it with a loop over all a tags, which is slightly slower, but the code works on Google Colab and there is no need to remove multiprocessing. If you do need to remove multiprocessing, you should convert your functions into coroutines and run them as tasks using asyncio, as suggested by @Barry the Platipus.
def get_dioceses_urls(section_url):
    """Collect the URLs of every diocese listed in one section of the site.

    Same contract as the version in the question, but the "[Next Page]" link
    is located by walking the ``<a>`` tags instead of using the ``:has()``
    CSS pseudo-class, so it also runs on older Beautiful Soup releases.

    Args:
        section_url: Absolute URL of the first listing page of a section.

    Returns:
        A set of absolute diocese-page URLs.

    Note:
        Reads the global ``headers`` mapping for the HTTP request.
    """
    base_url = "https://www.catholic-hierarchy.org/diocese/"
    dioceses_urls = set()

    while True:
        print(section_url)
        soup = BeautifulSoup(
            requests.get(section_url, headers=headers).content, "lxml"
        )

        for a in soup.select('ul a[href^="d"]'):
            dioceses_urls.add(base_url + a["href"])

        # is there Next Page button?
        # First anchor wrapping an image whose alt text is "[Next Page]";
        # .get() tolerates images that carry no alt attribute at all.
        next_page = next(
            (
                a
                for a in soup.find_all("a")
                if a.img is not None and a.img.get("alt") == "[Next Page]"
            ),
            None,
        )
        if next_page is None:
            break
        section_url = base_url + next_page["href"]

    return dioceses_urls