I am extracting data from the page below, but my code gives the same name
and surname
in every entry, whereas on the page the name and surname are different for each entry. This is the page link: https://www.aeafa.es/asociados.php
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Scrapes pages 1-4 of the associates listing and prints the collected fields
# as a DataFrame.
# NOTE(review): the nesting below is reconstructed from the logic of the
# script — confirm against the author's original layout.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}
temp = []
# NOTE(review): this single dict is created once, overwritten for every entry
# and the *same object* is appended to `temp` each time, so every element of
# `temp` refers to it. This is the behaviour the question asks about and it is
# deliberately left as posted.
wev = {}
for page in range(1, 5):
    r = requests.get(
        "https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}".format(
            page=page
        ),
        headers=headers,
    )
    soup = BeautifulSoup(r.content, "lxml")
    details = soup.find('table', class_="table")
    for detail in details.find_all('tbody'):
        # Every <td> under this <tbody> in document order, so link[0] and
        # link[1] are simply the first two cells found.
        link = [up.text for up in detail.find_all("td")]
        name = link[0]
        wev['Nombre'] = name
        surname = link[1]
        wev["Apellidos"] = surname
    tag = soup.find_all("div", class_="col-md-8 col-sm-8")
    for pro in tag:
        data = [tup.text for tup in pro.find_all("p")]
        # Each slice below drops a fixed number of leading characters from the
        # paragraph text — presumably the field label; verify against the page.
        Dirección = data[2]
        Dirección = Dirección[12:]
        wev[" Dirección"] = Dirección
        Población = data[3]
        Población = Población[14:]
        wev[" Población"] = Población
        Provincia = data[4]
        Provincia = Provincia[14:]
        wev["Provincia "] = Provincia
        Teléfono = data[5]
        # Prefix with "+" and strip the "." and "-" separators.
        Teléfono = "+" + Teléfono[11:].replace(".", "")
        Teléfono = Teléfono.replace("-", '')
        wev[" Teléfono"] = Teléfono
        Email = data[6]
        Email = Email[10:]
        wev["Email"] = Email
        temp.append(wev)
df = pd.DataFrame(temp)
print(df)
It prints the same name
and surname
in each entry. How can I correct it? This is the output:
Nombre Apellidos
0 JUAN ARIAS BARTOLOMÉ
1 JUAN ARIAS BARTOLOMÉ
CodePudding user response:
One approach would be to collect the name and surname from the table separately and then merge them into the data taken from each entry's "about" panel. A check can also be added to stop once the last page is reached:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from unicodedata import normalize
import re
# Walks the paginated associates listing, collecting the labelled fields of
# each "about" panel and the name/surname cells of each table row, then merges
# the two into one DataFrame and prints it.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

# Matches any run of whitespace; compiled once because it is applied to every cell.
_WHITESPACE = re.compile(r"\s+")
# Row keys are stored in NFC form (see below); normalising the lookup key the
# same way makes it match however the literal is encoded in this file.
_PHONE_KEY = normalize("NFC", "Teléfono")


def _clean(tag):
    """Return the tag's stripped text, NFKD-normalised, with whitespace runs collapsed to one space."""
    return _WHITESPACE.sub(" ", normalize("NFKD", tag.get_text(strip=True)))


page = 1
data1 = []  # one dict of labelled fields per "about" panel
data2 = []  # one list of cell texts per table row (last cell dropped)

while True:
    print(f"Page {page}")
    r = requests.get(
        f"https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}",
        headers=headers,
        timeout=30,  # fail instead of hanging forever on a stalled connection
    )
    page += 1  # advance, otherwise the same page is requested on every pass
    soup = BeautifulSoup(r.content, "lxml")

    for pro in soup.find_all("div", class_="col-md-8 col-sm-8"):
        values = [_clean(p) for p in pro.find_all("p")]
        row = {'Sobre': values[0][6:]}  # skip over the word Sobre
        for item in values[2:]:
            key, sep, value = item.partition(':')
            if not sep:
                # Paragraph without a "label: value" shape; nothing to store.
                continue
            # NFKD decomposes accented letters, so recompose the key to NFC;
            # otherwise a lookup with a precomposed literal raises KeyError.
            row[normalize("NFC", key.strip())] = value.strip()
        if _PHONE_KEY in row:
            row[_PHONE_KEY] = row[_PHONE_KEY].replace(".", "")
        data1.append(row)

    details = soup.find("table", class_="table").tbody
    for tr in details.find_all("tr"):
        data2.append([_clean(td) for td in tr.find_all("td")[:-1]])

    # Any more? Continue only while the last pagination item is the "»" link.
    ul = soup.find("ul", class_="pagination")
    pager_items = ul.find_all("li") if ul is not None else []
    if not pager_items or pager_items[-1].text != "»":
        break

# Merge the name and surname from the second table.
# zip() pairs by position, so this relies on panels and table rows appearing
# in the same order on each page.
data = [
    {'Nombre': d2[0], 'Apellidos': d2[1]} | d1  # dict union needs Python 3.9+
    for d1, d2 in zip(data1, data2)
]
df = pd.DataFrame(data)
print(df)
Giving you a dataframe starting:
Nombre Apellidos Sobre Dirección Población Provincia Teléfono E-mail Web
0 JUAN MARIANO MERCADO Juan Mariano Mercado Juan de Toledo, no 16, 1o B 30800 LORCA Murcia 968-471716 [email protected]
1 Ma. BELEN ABAD GARCIA Ma. Belen Abad Garcia Calle Constantino 33, 1o N 4700 EL EJIDO Almería 950487533 - 647936929 [email protected]
2 JESÚS ABAD MUÑIZ Jesús Abad Muñiz Santiago, 15, 1o.- ctro. 47001 Valladolid 98.320.20.11 [email protected]
3 Ma PALOMA ABAD TEJERINA Ma Paloma Abad Tejerina Poniente, 40 28036 Madrid 91.383.11.45 [email protected]
4 GEMA ÁBALOS MUÑOZ Gema ábalos Muñoz Solarillo de Gracia, 4, 1o.- D 18002 Granada 639.317.297 [email protected]
You could then use Pandas to make any further changes to the data structure. Note that the dictionary merge operator (`|`) requires Python 3.9 or later.