I am trying to web-scrape this link in Python. The ideal output is a dataframe with 4 columns: date, author, title and text. So far, I have managed to extract the author, title and date in the following way:
from bs4 import BeautifulSoup
import requests
# Form body for the speeches listing endpoint: first page, 10 entries,
# newest first.
payload = 'from=&till=&objid=cbspeeches&page=&paging_length=10&sort_list=date_desc&theme=cbspeeches&ml=false&mlurl=&emptylisttext='
url = 'https://www.bis.org/doclist/cbspeeches.htm'
headers = {
    "content-type": "application/x-www-form-urlencoded",
    "X-Requested-With": "XMLHttpRequest"
}

# The listing is requested with a form-encoded POST.
req = requests.post(url, headers=headers, data=payload)
print(req)

soup = BeautifulSoup(req.content, "lxml")

# Collect one record per row of the listing table.
data = []
for card in soup.select('.documentList tbody tr'):
    # select_one(...).get_text(strip=True) yields the date as plain text;
    # select(...) would store a list of tags instead of the date string.
    date = card.select_one('.item_date').get_text(strip=True)
    title = card.select_one('.title a').get_text(strip=True)
    author = card.select_one('.authorlnk.dashed').get_text(strip=True)
    data.append({
        'date': date,
        'title': title,
        'author': author
    })

print(data)
Now, I am finding it hard to extract the text for each of the 10 links on the page. I am doing the following:
# Attempt to request every linked page and collect the content of its <p> elements.
data = []
# NOTE(review): each `link` is an element matched by '.documentList tbody tr'
# (a table row), and the next line reads 'href' from that row element itself.
for link in soup.select('.documentList tbody tr'):
r = BeautifulSoup(requests.get(f"https://www.bis.org{link['href']}").content,features="lxml")
# str() is applied to each selected <p> element and the pieces are joined
# with an empty separator into a single 'Text' value.
data.append({
'Text': ''.join([str(e) for e in r.select('p')])})
However, I am not getting usable results from that code.
Can anyone help me with that?
Thanks!
CodePudding user response:
You are close to your goal; simply handle the requests for the texts inside your for loop:
# For every row of the listing, request the linked page and store the row's
# metadata together with the text of that page.
for card in soup.select('.documentList tbody tr'):
    # Build the absolute URL once; it is needed for the request and the record.
    speech_url = f"https://www.bis.org{card.a.get('href')}"
    # Name the parser explicitly so BeautifulSoup does not have to guess one.
    r = BeautifulSoup(requests.get(speech_url).content, "lxml")
    data.append({
        'date': card.select_one('.item_date').get_text(strip=True),
        'title': card.select_one('.title a').get_text(strip=True),
        'author': card.select_one('.authorlnk.dashed').get_text(strip=True),
        'url': speech_url,
        # Text fragments inside #cmsContent are joined with blank lines.
        'text': r.select_one('#cmsContent').get_text('\n\n', strip=True)
    })
Example
from bs4 import BeautifulSoup
import pandas as pd
import requests
# Form body for the speeches listing endpoint: first page, 10 entries,
# newest first.
payload = 'from=&till=&objid=cbspeeches&page=&paging_length=10&sort_list=date_desc&theme=cbspeeches&ml=false&mlurl=&emptylisttext='
url = 'https://www.bis.org/doclist/cbspeeches.htm'
headers = {
    "content-type": "application/x-www-form-urlencoded",
    "X-Requested-With": "XMLHttpRequest"
}

req = requests.post(url, headers=headers, data=payload)
soup = BeautifulSoup(req.content, "lxml")

# One record per listing row: metadata from the row plus the text of the
# page the row links to.
data = []
for card in soup.select('.documentList tbody tr'):
    # Build the absolute URL once; it is needed for the request and the record.
    speech_url = f"https://www.bis.org{card.a.get('href')}"
    # Same explicit parser as for the listing, so BeautifulSoup does not
    # have to guess one.
    r = BeautifulSoup(requests.get(speech_url).content, "lxml")
    data.append({
        'date': card.select_one('.item_date').get_text(strip=True),
        'title': card.select_one('.title a').get_text(strip=True),
        'author': card.select_one('.authorlnk.dashed').get_text(strip=True),
        'url': speech_url,
        # Text fragments inside #cmsContent are joined with blank lines.
        'text': r.select_one('#cmsContent').get_text('\n\n', strip=True)
    })

# Each dict becomes a row; the dict keys become the column names.
pd.DataFrame(data)
CodePudding user response:
You can use `.get_text()` with the `separator=` parameter. For example:
import requests
import pandas as pd
from bs4 import BeautifulSoup
api_url = "https://www.bis.org/doclist/cbspeeches.htm"

# Form fields posted to the listing endpoint; "page" is overwritten for each
# request in the loop below.
payload = {
    "from": "",
    "till": "",
    "objid": "cbspeeches",
    "page": "1",
    "paging_length": "25",
    "sort_list": "date_desc",
    "theme": "cbspeeches",
    "ml": "false",
    "mlurl": "",
    "emptylisttext": "",
}

all_data = []
for page_number in range(1, 3):
    payload["page"] = page_number
    response = requests.post(api_url, data=payload)
    soup = BeautifulSoup(response.content, "html.parser")

    # One (date, author, title, text) tuple per ".item" element on the page.
    all_data.extend(
        (
            row.select_one(".item_date").get_text(strip=True),
            row.select_one(".authorlnk").get_text(strip=True),
            row.a.get_text(strip=True),
            row.select_one(".info").get_text(strip=True, separator=" "),
        )
        for row in soup.select(".item")
    )

df = pd.DataFrame(all_data, columns=["Date", "Author", "Title", "Text"])
print(df.head(5).to_markdown(index=False))
Prints:
Date | Author | Title | Text |
---|---|---|---|
08 Jul 2022 | Lael Brainard | Lael Brainard: Crypto-assets and decentralized finance through a financial stability lens | Speech by Ms Lael Brainard, Member of the Board of Governors of the Federal Reserve System, at the Bank of England Conference, London , 8 July 2022. by Lael Brainard |
08 Jul 2022 | Sam Woods | Sam Woods: Solvency II - striking the balance | Speech (virtual) by Mr Sam Woods , Deputy Governor for Prudential Regulation of the Bank of England and Chief Executive of the Prudential Regulation Authority (PRA), at the Bank of England Webinar, 8 July 2022. by Sam Woods |
08 Jul 2022 | Pablo Hernández de Cos | Pablo Hernández de Cos: 1st Bank of Spain conference on the Spanish economy | Welcome address by Mr Pablo Hernández de Cos, Governor of the Bank of Spain, at the 1st Bank of Spain Conference on the Spanish Economy, Madrid, 7 July 2022. by Pablo Hernández de Cos |
08 Jul 2022 | Mário Centeno | Mário Centeno: Keynote intervention - 26th Economist Government Roundtable | Keynote intervention (virtual) by Mr Mário Centeno, Governor of the Banco de Portugal, at the 26th Economist Government Roundtable, 7 July 2022. by Mário Centeno |
08 Jul 2022 | Pan Gongsheng | Pan Gongsheng: Speech - Fifth Anniversary Forum of Bond Connect and the launch ceremony of Swap Connect | Speech (virtual) by Mr Pan Gongsheng, Deputy Governor of the People's Bank of China, at the Fifth Anniversary Forum of Bond Connect and the Launch Ceremony of Swap Connect, 4 July 2022. by Pan Gongsheng |