I have a script which scrapes a website for the name, region and province of companies in Spain. Each company row in the HTML also contains a link to another page that holds the phone number, but when I try to scrape even that link from the HTML, it prints "None". Is there a way for the script to automatically follow the link to that page, scrape the number and match it with the company row?
import requests
from googlesearch import search
from bs4 import BeautifulSoup
# Walk result pages 1..64 of the listing and print one record per company row.
for page_number in range(1, 65):
    listing_url = "https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/{page}.html".format(page=page_number)
    response = requests.get(listing_url)
    document = BeautifulSoup(response.content, "html.parser")

    # Every <ul> inside div#simulacion_tabla is treated as one table row.
    for row in document.select("div#simulacion_tabla ul"):
        # Text of the three columns, in order: title, location, province.
        columns = [
            row.find('li', class_=css_class).text
            for css_class in ("col1", "col2", "col3")
        ]
        # NOTE: find() matches on tag *name*, so this searches for an <href>
        # element with class "col1" rather than reading an href attribute;
        # it yields None whenever no such element exists in the row.
        link = row.find('href', class_="col1")
        print([*columns, link])
Alternatively, is there a way to do it with the googlesearch library?
Many thanks
CodePudding user response:
The URL of the first page is "https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/index.html",
not
"https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/1.html".
For this reason your script does not return any output.
You can try it like this:
import requests
# from googlesearch import search
from bs4 import BeautifulSoup
# The first results page is served as index.html; the numbered pages start
# at 2, so the two kinds of URL are built separately and then concatenated.
baseurl = ["https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/index.html"]
# range(2, 5) covers pages 2-4 only; widen the range to scrape more pages.
urls = [f'https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/{i}.html' for i in range(2, 5)]
allurls = baseurl + urls
print(allurls)

for url in allurls:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    # Each <ul> inside div#simulacion_tabla is treated as one company row.
    lists = soup.select("div#simulacion_tabla ul")
    # scrape the list
    for lis in lists:
        title = lis.find('li', class_="col1").text
        location = lis.find('li', class_="col2").text
        province = lis.find('li', class_="col3").text
        # The detail-page link is the href attribute of the first <a> inside
        # the "col1" cell, so select the anchor and index its attribute.
        link = lis.select("li.col1 a")[0]['href']
        info = [title, location, province, link]
        print(info)