I am trying to download the zip file that can also be downloaded manually by clicking "SCARICA I DATI CSV" on this webpage. I want to do this for 7000 Italian municipalities using Beautiful Soup.
Right now, I have the following code for one city/municipality:
city_name = "vandoies-vintl"
prov_name = "bz"
# Fix: the URL pieces must be joined with '+' (or an f-string); placing a
# variable directly next to a string literal is a SyntaxError in Python.
url = 'http://storico.openbilanci.it/bilanci/' + city_name + "-comune-" + prov_name
r = urllib.request.urlopen(url).read()
soup = BeautifulSoup(r, 'lxml')
# NOTE(review): if the page is rendered client-side, the anchor may be absent
# from the raw HTML — confirm by inspecting the server response, not the browser.
# Renamed from `csv` to avoid shadowing the stdlib `csv` module.
csv_anchors = soup.find_all('a', attrs={'class': 'pull-right csv'})
csvlink = csv_anchors[0]['href']
urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, city_name + ".zip")
I can't find any mention of "csv" when I inspect the parsed HTML with print(soup). Could someone please help? Thanks!
The following code works.
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup
import urllib.request
import re
import os
import urllib
import zipfile
import re
# Destination folder for the downloaded zips and extracted CSV files.
output_path = r"/Users/aartimalik/Dropbox/delphine-miscellaneous/italy/test"

# Sample municipalities as (name, province abbreviation) pairs.
munis = pd.DataFrame(
    [("monale", "at"), ("portacomaro", "at")],
    columns=['municipality_clean', 'prov_abb'],
)
def remove_paren(string):
    """Return *string* (coerced to str) with any '( ... )' span removed.

    The pattern is greedy, so everything from the first '(' to the last ')'
    is dropped, e.g. "Monale (AT)" -> "Monale ".
    """
    text = str(string)
    return re.sub(r'\(.*\)', '', text)
# Normalize municipality names: drop parenthetical suffixes, trim whitespace,
# hyphenate internal spaces, and lowercase everything (URL slug format).
munis['municipality_clean'] = munis['municipality_clean'].apply(remove_paren)
munis['municipality_clean'] = munis['municipality_clean'].str.strip()
munis = munis.replace(' ', '-', regex=True)
munis = munis.apply(lambda col: col.str.lower())
# Track which municipalities downloaded successfully and which failed.
# Bug fix: these frames were referenced but never initialized (NameError),
# and DataFrame.append was removed in pandas 2.0 — pd.concat is used instead.
scrapesuccess = pd.DataFrame(columns=munis.columns)
scrapefail = pd.DataFrame(columns=munis.columns)

for i in range(len(munis)):
    city_name = munis.iloc[i]['municipality_clean']
    prov_name = munis.iloc[i]['prov_abb']
    try:
        # Bug fix: string pieces must be joined with '+' / f-strings; the
        # original adjacency was a SyntaxError.
        url = f'http://storico.openbilanci.it/bilanci/{city_name}-comune-{prov_name}'
        r = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(r, 'lxml')
        csv_anchors = soup.find_all('a', attrs={'class': 'pull-right csv'})
        csvlink = csv_anchors[0]['href']

        # Bug fix: the zip was downloaded into the CWD but later opened from
        # output_path — download it to the same path we extract from.
        zip_path = os.path.join(output_path, city_name + ".zip")
        urllib.request.urlretrieve("http://storico.openbilanci.it" + csvlink, zip_path)

        scrapesuccess = pd.concat([scrapesuccess, munis.iloc[[i]]])
        newfolder = os.path.join(output_path, city_name.capitalize())
        os.makedirs(newfolder, exist_ok=True)
        # Context manager guarantees the zip handle is closed even on error.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(newfolder)
        print(str(i) + ". " + city_name + ": success")
    except Exception as exc:
        # Narrowed from a bare `except:`; the failure reason is now reported
        # instead of being silently swallowed.
        scrapefail = pd.concat([scrapefail, munis.iloc[[i]]])
        print(str(i) + ". " + city_name + ": fail (" + str(exc) + ")")
CodePudding user response:
Here's an example that downloads the zip, unpacks it in memory, and writes all the CSV files into a per-city directory.
import urllib.request as request
from io import StringIO
from pathlib import Path
from zipfile import ZipFile
import pandas as pd
from bs4 import BeautifulSoup
class Scraper:
    """Download a municipality's budget-CSV zip from storico.openbilanci.it
    and write each contained CSV into a per-city output directory."""

    def __init__(self, **kwargs):
        self.url_root = "http://storico.openbilanci.it"
        self.city_name = kwargs.get("city_name")
        self.prov_name = kwargs.get("prov_name")
        # Output directory root is now configurable; the original hard-coded
        # placeholder remains the default for backward compatibility.
        self.output_root = kwargs.get("output_root", "/path/to/files")

    def main(self) -> None:
        """Run the full pipeline: locate link, download, unzip, write CSVs."""
        file_link = self.get_link()
        zipped_file = self.download_file(file_link)
        unzipped_files_mapping = self.unzip_file(zipped_file)
        self.write_files(unzipped_files_mapping)

    def get_link(self) -> str:
        """Scrape the city's page and return the href of the CSV-zip anchor.

        Raises IndexError if no anchor with class "pull-right csv" is found.
        """
        url = f"{self.url_root}/bilanci/{self.city_name}-comune-{self.prov_name}"
        response = request.urlopen(url).read()
        soup = BeautifulSoup(response, "lxml")
        return soup.find_all("a", attrs={"class": "pull-right csv"})[0]["href"]

    def download_file(self, zip_link: str) -> str:
        """Download the zip to a temporary local file and return its path."""
        url = f"{self.url_root}{zip_link}"
        return request.urlretrieve(url)[0]

    @staticmethod
    def unzip_file(file_handle: str) -> dict:
        """Load every file inside the zip at *file_handle* into DataFrames.

        Returns a mapping of short name (the part after the last '-') to a
        DataFrame parsed with ';' as separator (the site's CSV format).
        """
        # Bug fix: the original leaked the ZipFile handle; the context
        # manager guarantees it is closed.
        with ZipFile(file_handle, "r") as zip_file_object:
            return {
                file.rsplit("-", 1)[1]: pd.read_csv(
                    StringIO(zip_file_object.open(file).read().decode("utf-8")),
                    sep=";",
                )
                for file in zip_file_object.namelist()
            }

    def write_files(self, file_mapping: dict) -> None:
        """Write each DataFrame to <output_root>/<city_name>/<file_name>."""
        path = Path(f"{self.output_root}/{self.city_name}")
        path.mkdir(parents=True, exist_ok=True)
        for file_name, df in file_mapping.items():
            df.to_csv(f"{path}/{file_name}")
# Example run for a single municipality.
city_name = "vandoies-vintl"
prov_name = "bz"
scraper = Scraper(city_name=city_name, prov_name=prov_name)
scraper.main()