I have run into a problem and I can't figure out how to get any further.
I have scraped multiple pages for a company's name, location and province, along with a link to additional information on another page. The link which I have collected provides 3 more pieces of information that I require.
I need to access the link, and take out the address, phone number (if it has one) and a CNAE code, and append that to the previous data.
The working script for the first scrape I currently have is as follows:
import requests
from bs4 import BeautifulSoup
# Listing pages to scrape: the index page followed by the numbered pages 2..64.
baseurl = ["https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/index.html"]
urls = [f'https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/{i}.html' for i in range(2, 65)]
allurls = baseurl + urls
print(allurls)

for url in allurls:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    # Each <ul> inside div#simulacion_tabla is treated as one company row.
    lists = soup.select("div#simulacion_tabla ul")

    # scrape the pages
    for lis in lists:
        title = lis.find('li', class_="col1").text
        location = lis.find('li', class_="col2").text
        province = lis.find('li', class_="col3").text
        # The first anchor in the col1 cell carries the link to the detail page.
        link = lis.select("li.col1 a")[0]['href']
        info = [title, location, province, link]
        print(info)
On the second page the data is in a table with the id names below. This is the code I thought I would need to use, but it isn't working and I am going round in circles trying to figure out why:
# Attempt to read three extra fields for a company and attach them to `info`.
# NOTE(review): `soup` is presumably still the listing page parsed earlier;
# nothing in this snippet requests the page that `link` points to.
section = soup.select("section#datos_empresa")
# NOTE(review): `link` looks like a single href string, so looping over it
# would walk its characters rather than a collection of links - confirm.
lslinks = link
# NOTE(review): this `for` line has no trailing colon, which is a syntax error.
for ls in lslinks
# NOTE(review): the lookups below go through `lis` (a row of the listing page),
# not `section`, and pass `id_=` on 'tr' where the first answer uses `id=` on 'td'.
location = lis.find('tr', id_="tamano_empresa").text
cnae = lis.find('tr', id_="cnae_codigo_empresa").text
phone = lis.find('tr', id_="telefono_empresa").text
addinfo = [location, cnae, phone]
# append() adds `addinfo` as one nested list element instead of three flat items.
info.append(addinfo)
Here's an example of one of the links
Ideally the output would be:
['AGRICOLA CALLEJA SL', 'CARPIO', 'VALLADOLID', 'https://www.expansion.com/directorio-empresas/agricola-calleja-sl_1480101_A02_47.html', 'C/ LA TORRE, 2.', '150', '983863247']
which I would write to a text file so I can import it to excel.
Any help would be greatly appreciated!
Cheers!
CodePudding user response:
In your sub page, you were trying to select the section by its ID rather than by its class, so it was failing to match any entries. You could also use the `td` elements.
Your logic for the sub page needs to be combined with your main page. Try the following:
import requests
from bs4 import BeautifulSoup
import csv
# Scrape every listing page, follow each company's detail link, and write one
# CSV row per company to output.csv.
with open('output.csv', 'w', newline='', encoding='utf-8') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(["Title", "Location", "Province", "Link", "Location", "cnae", "Phone"])

    # Index page first, then the numbered pages 2..64.
    urls = ["https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/index.html"]
    urls.extend(f'https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/{i}.html' for i in range(2, 65))

    for url in urls:
        print(url)
        r_main = requests.get(url)
        soup_main = BeautifulSoup(r_main.content, "html.parser")

        for lis in soup_main.select("div#simulacion_tabla ul"):
            title = lis.find('li', class_="col1").text
            location = lis.find('li', class_="col2").text
            province = lis.find('li', class_="col3").text
            link = lis.select("li.col1 a")[0]['href']
            print(' ', link)

            # Fetch the company's own page and read the extra cells by id.
            r_sub = requests.get(link)
            soup_sub = BeautifulSoup(r_sub.content, "html.parser")
            section = soup_sub.select_one("section.datos_empresa")
            # NOTE(review): this rebinds `location`, so both "Location" columns
            # of the row carry the tamano_empresa text and the col2 value read
            # from the listing page is dropped.
            location = section.find('td', id="tamano_empresa").text
            cnae = section.find('td', id="cnae_codigo_empresa").text
            phone = section.find('td', id="telefono_empresa").text

            csv_output.writerow([title, location, province, link, location, cnae, phone])
This will create a CSV output file starting:
Title,Location,Province,Link,Location,cnae,Phone
A CORTIÑA DOS ACIVROS SL,DESCONOCIDO,LUGO,https://www.expansion.com/directorio-empresas/a-cortina-dos-acivros-sl_9163006_A02_27.html,DESCONOCIDO,150,
A CORTIÑA DOS ACIVROS SL,DESCONOCIDO,LUGO,https://www.expansion.com/directorio-empresas/a-cortina-dos-acivros-sl_9163006_A02_27.html,DESCONOCIDO,150,
A P V 19 32 SL,MICROEMPRESA,VALENCIA,https://www.expansion.com/directorio-empresas/a-p-v-19-32-sl_672893_A02_46.html,MICROEMPRESA,150,
ABADIA DE JABUGO SL,DESCONOCIDO,HUELVA,https://www.expansion.com/directorio-empresas/abadia-de-jabugo-sl_5442689_A02_21.html,DESCONOCIDO,150,
ABALOS REAL SLL,MICROEMPRESA,CUENCA,https://www.expansion.com/directorio-empresas/abalos-real-sll_1239004_A02_16.html,MICROEMPRESA,150,969142092
CodePudding user response:
Here is the minimal working solution so far.
Code:
import requests
from bs4 import BeautifulSoup
# Listing pages to scrape: the index page followed by the numbered pages 2..64.
baseurl = ["https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/index.html"]
urls = [f'https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/{i}.html' for i in range(2, 65)]
allurls = baseurl + urls

for url in allurls:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    lists = soup.select("div#simulacion_tabla ul")

    # scrape the pages
    for lis in lists:
        title = lis.find('li', class_="col1").text
        location = lis.find('li', class_="col2").text
        province = lis.find('li', class_="col3").text
        link = lis.select_one("li.col1 a")['href']

        # Follow the company link and pull the extra fields from its page.
        sub_page = requests.get(link)
        soup2 = BeautifulSoup(sub_page.content, "html.parser")
        direction = soup2.select_one('#direccion_empresa').text
        cnae = soup2.select_one('#cnae_codigo_empresa').text
        # select_one returns None when nothing matches, so guard before
        # reading .text for the optional phone element.
        phone = soup2.select_one('#telefono_empresa')
        telephone = phone.text if phone else None

        print([title, location, province, link, direction, cnae, telephone])
Output:
['A CORTIÑA DOS ACIVROS SL', 'LUGO', 'LUGO', 'https://www.expansion.com/directorio-empresas/a-cortina-dos-acivros-sl_9163006_A02_27.html', 'CRTA. A CORUÑA, 16.', '150', '']
['A CORTIÑA DOS ACIVROS SL', 'LUGO', 'LUGO', 'https://www.expansion.com/directorio-empresas/a-cortina-dos-acivros-sl_9163006_A02_27.html', 'CRTA. A CORUÑA, 16.', '150', '']
['A P V 19 32 SL', 'VALENCIA', 'VALENCIA', 'https://www.expansion.com/directorio-empresas/a-p-v-19-32-sl_672893_A02_46.html', 'CALLE SALVA, 8 1 2B.', '150', '']
['ABADIA DE JABUGO SL', 'CARTAYA', 'HUELVA', 'https://www.expansion.com/directorio-empresas/abadia-de-jabugo-sl_5442689_A02_21.html', 'URB. MARINA EL ROMPIDO, 31 VILLA M-31. CRTA. EL RO.', '150', '']
['ABALOS REAL SLL', 'CARBONERAS DE GUADAZAON', 'CUENCA', 'https://www.expansion.com/directorio-empresas/abalos-real-sll_1239004_A02_16.html', 'C/ DON CRUZ, 23.', '150', '969142092']
... so on