Home > OS >  Scrape data using beautifulsoup
Scrape data using beautifulsoup

Time:06-26

I am extracting data from the page below, but my output repeats the same name and surname in every entry, even though the name and surname are different for each entry on the page. This is the page link: https://www.aeafa.es/asociados.php

import requests
import pandas as pd
from bs4 import BeautifulSoup

# Scrape the first four result pages of the associates directory and collect
# one record per "about" block into a DataFrame.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

temp = []
# NOTE(review): this single dict is created once and reused for every record;
# each `temp.append(wev)` below stores a reference to this same object.
wev={}
for page in range(1, 5):
    r = requests.get(
        "https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}".format(
            page=page
        ),
        headers=headers,
    )
    soup = BeautifulSoup(r.content, "lxml")
    details=soup.find('table',class_="table")

    # NOTE(review): this iterates over <tbody> elements, not over rows, and
    # `link` holds every <td> of the tbody, so link[0] / link[1] are the first
    # two cells of the tbody rather than the cells of each individual row.
    for detail in details.find_all('tbody'):

        link = [up.text for up in detail.find_all("td")]
        name=link[0]
        wev['Nombre']=name
        surname=link[1]
        wev["Apellidos"]=surname


    tag = soup.find_all("div", class_="col-md-8 col-sm-8")
    for pro in tag:
        data = [tup.text for tup in pro.find_all("p")]

        # Each field is taken by position and its leading label is dropped by
        # slicing off a fixed number of characters.
        Dirección = data[2]
        Dirección = Dirección[12:]
        wev[" Dirección"]= Dirección

        Población = data[3]
        Población = Población[14:]
        wev[" Población"]= Población

        Provincia = data[4]
        Provincia = Provincia[14:]
        wev["Provincia "]=Provincia

        # Prefix a space and strip the "." and "-" separators from the number.
        Teléfono = data[5]
        Teléfono = " " + Teléfono[11:].replace(".", "")
        Teléfono=  Teléfono.replace("-", '')
        wev[" Teléfono"]= Teléfono



        Email = data[6]
        Email = Email[10:]
        wev["Email"]=  Email

        temp.append(wev)

df = pd.DataFrame(temp)
print(df)

It prints the same name and surname in every entry. How can I correct this? This is the output:

  Nombre          Apellidos                                                                           
0     JUAN  ARIAS   BARTOLOMÉ     
1     JUAN  ARIAS   BARTOLOM

CodePudding user response:

One approach would be to merge the separate name and surname details into the data from the about information. A test could also be added for when the last page is reached:

import requests
import pandas as pd
from bs4 import BeautifulSoup
from unicodedata import normalize
import re

# Scrape every result page of the associates directory, then merge the
# name/surname columns of the summary table with the details parsed from the
# per-associate "about" blocks.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

# Compiled once: collapses any run of whitespace into a single space.
whitespace_run = re.compile(r'\s+')

# Scraped keys are re-composed to NFC below; normalising the literal the same
# way makes the lookup independent of how this source file encodes the accent.
phone_key = normalize('NFC', 'Teléfono')


def clean_text(tag):
    """Return the tag's text, NFKD-normalised with whitespace runs collapsed."""
    return whitespace_run.sub(' ', normalize('NFKD', tag.get_text(strip=True)))


page = 1
data1 = []  # one dict per "about" block
data2 = []  # one list of cell texts per summary-table row

while True:
    print(f"Page {page}")
    r = requests.get(
        f"https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}",
        headers=headers,
        timeout=30,
    )
    r.raise_for_status()
    page += 1  # advance to the next page for the following request

    soup = BeautifulSoup(r.content, "lxml")

    for pro in soup.find_all("div", class_="col-md-8 col-sm-8"):
        values = [clean_text(p) for p in pro.find_all("p")]
        if not values:
            continue
        row = {'Sobre' : values[0][6:]}     # skip over the word Sobre

        for item in values[2:]:
            if ':' not in item:
                continue  # not a "label: value" line
            key, value = item.split(':', 1)
            row[normalize('NFC', key.strip())] = value.strip()

        if phone_key in row:
            row[phone_key] = row[phone_key].replace(".", "")
        data1.append(row)

    table = soup.find("table", class_="table")
    if table is None or table.tbody is None:
        break  # no results table on this page

    for tr in table.tbody.find_all("tr"):
        # The last cell of each row is dropped; the first two are name and surname.
        data2.append([clean_text(td) for td in tr.find_all("td")[:-1]])

    # Any more?  The pagination bar ends with a "»" link while pages remain.
    ul = soup.find("ul", class_="pagination")
    items = ul.find_all("li") if ul is not None else []

    if not items or items[-1].get_text(strip=True) != "»":
        break

# Merge the name and surname from the second table
data = []

for d1, d2 in zip(data1, data2):
    if len(d2) < 2:
        continue  # row without both a name and a surname cell
    data.append({'Nombre' : d2[0], 'Apellidos' : d2[1]} | d1)

df = pd.DataFrame(data)
print(df)

Giving you a dataframe starting:

              Nombre                        Apellidos                                      Sobre                                                 Dirección                         Población    Provincia                 Teléfono                                 E-mail                                          Web
0       JUAN MARIANO                          MERCADO                       Juan Mariano Mercado                                Juan de Toledo, no 16, 1o B                        30800 LORCA       Murcia                968-471716                 [email protected]                                             
1          Ma. BELEN                      ABAD GARCIA                      Ma. Belen Abad Garcia                                 Calle Constantino 33, 1o N                      4700 EL EJIDO     Almería     950487533 - 647936929       [email protected]                                             
2             JESÚS                      ABAD MUÑIZ                         Jesús Abad Muñiz                                   Santiago, 15, 1o.- ctro.                              47001   Valladolid              98.320.20.11         [email protected]                                             
3          Ma PALOMA                    ABAD TEJERINA                    Ma Paloma Abad Tejerina                                               Poniente, 40                              28036       Madrid              91.383.11.45            [email protected]                                             
4               GEMA                   ÁBALOS MUÑOZ                        Gema ábalos Muñoz                             Solarillo de Gracia, 4, 1o.- D                              18002      Granada               639.317.297                          [email protected]

You could then use Pandas to make any further changes to the data structure. Note that the dictionary merge operator (`|`) used above requires Python 3.9 or later.

  • Related