I have written code that extracts all the XML file links on the webpage https://data.food.gov.uk/catalog/datasets/38dd8d6a-5ab1-4f50-b753-ab33288e3200 and puts them into a dataframe.
import urllib.request
import pandas as pd
from bs4 import BeautifulSoup
theurl = "https://data.food.gov.uk/catalog/datasets/38dd8d6a-5ab1-4f50-b753-ab33288e3200"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, 'html.parser')
project_href = [i['href'] for i in soup.find_all('a', href=True) if i['href'] != "#"]
print(project_href)
links = pd.DataFrame(project_href, columns=['Url'])
print(links.head(20))
# Drop the first n rows
n = 12
links.drop(index=links.index[:n], inplace=True)
# Drop the last b rows
b = 15
links.drop(links.tail(b).index, inplace=True)
# Drop one remaining unwanted row by index label
links.drop([409], axis=0, inplace=True)
Then I wrote code that extracts the data from the first link into a CSV file, but I need help getting it to loop through each XML link and append the data to the CSV file. There are over 300 links, so doing it manually would take a while.
def get_feed(url):
    """Scrapes an XML feed from the provided URL and returns XML source.
    Args:
        url (string): Fully qualified URL pointing to XML feed.
    Returns:
        source (string): XML source of scraped feed.
    """
    try:
        response = urllib.request.urlopen(urllib.request.Request(url, headers={'User-Agent': 'Mozilla'}))
        source = BeautifulSoup(response, 'lxml-xml', from_encoding=response.info().get_param('charset'))
        return source
    except Exception as e:
        print('Error: ' + str(e))
FEED_URL = links['Url'].iat[0]
xml = get_feed(FEED_URL)
def get_elements(xml, item='EstablishmentDetail'):
    """Returns the element names found inside the first matching item."""
    try:
        items = xml.find_all(item)
        elements = [element.name for element in items[0].find_all()]
        return elements
    except Exception as e:
        print('Error: ' + str(e))
def feed_to_df(url, item='EstablishmentDetail'):
    """Builds a dataframe with one row per item element in the feed at url."""
    xml = get_feed(url)
    elements = get_elements(xml)
    if isinstance(elements, list):
        df = pd.DataFrame(columns=elements)
        items = xml.find_all(item)
        for item in items:
            row = {}
            for element in elements:
                if xml.find(element):
                    if item.find(element):
                        row[element] = item.find(element).text
                    else:
                        row[element] = ''
                else:
                    row[element] = ''
            df = df.append(row, ignore_index=True)
        return df
df = feed_to_df(FEED_URL)
df.to_csv('C:/FDSA_3.csv', index=False)
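What I have in mind is something along these lines (just a rough, untested sketch reusing the functions above; FDSA_all.csv is a made-up output name), but I'm not sure this is the right way to append everything into one csv:
OUT_PATH = 'C:/FDSA_all.csv'
wrote_header = False
for url in links['Url']:
    df = feed_to_df(url)
    if df is None:
        continue  # feed_to_df prints the error and returns nothing on failure
    # write the header only for the first successful feed, then append rows without it
    df.to_csv(OUT_PATH, mode='a', header=not wrote_header, index=False)
    wrote_header = True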
Any help on this would be appreciated, thanks.
CodePudding user response:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from pprint import pp
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
allin = []
def get_soup(content):
    return BeautifulSoup(content, 'xml')

def worker(req, link):
    # download one XML file and return one dict per EstablishmentDetail element
    r = req.get(link)
    soup = get_soup(r.content)
    return [
        {x.name: x.get_text(strip=True) for x in est.find_all(recursive=False)}
        for est in soup.select('EstablishmentDetail')
    ]

def result(item):
    res = item.result()
    pp(res)
    allin.extend(res)
def main(url):
    with requests.Session() as req, ThreadPoolExecutor(max_workers=10) as executor:
        req.headers.update(headers)
        r = req.get(url)
        soup = get_soup(r.content)
        links = (i['href']
                 for i in soup.select('a.o-dataset-distribution--link:-soup-contains(xml)'))
        # submit one download per link, then collect the rows as the futures finish
        fs = [executor.submit(worker, req, link) for link in links]
        for future in as_completed(fs):
            result(future)
    df = pd.DataFrame(allin)
    print(df)

if __name__ == "__main__":
    main('https://data.food.gov.uk/catalog/datasets/38dd8d6a-5ab1-4f50-b753-ab33288e3200')
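If you also want the result in a csv file, as in your own code, you can replace the print(df) at the end of main with something like this (the file name is just an example):
df.to_csv('FDSA_all.csv', index=False)  # write the combined frame to a single csv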
CodePudding user response:
Since the XML files are fairly shallow, consider pandas.read_xml, and use lxml (the default underlying parser to BeautifulSoup) for simpler extraction of the XML links:
import urllib.request as urq
import lxml.etree as lx
import pandas as pd
# RETRIEVE HTML PAGE
links_url = "https://data.food.gov.uk/catalog/datasets/38dd8d6a-5ab1-4f50-b753-ab33288e3200"
with urq.urlopen(links_url) as response:
    links_page = response.read()
# BUILD LIST OF DICTS OF PARSED URLS
doc = lx.HTML(links_page)
dds = doc.xpath(".//dd[@class='c-dataset-element__distribution-files']/strong/a")
links = [{"city": d.attrib["title"], "url": d.attrib["href"]} for d in dds]
# USER-DEFINED METHOD TO BUILD DATA FRAME
def build_frame(link):
    return (
        pd.read_xml(link["url"], xpath=".//EstablishmentDetail")
        .join(pd.read_xml(link["url"], xpath=".//Geocode"))
        .drop("Geocode", axis="columns")
        .assign(city=link["city"], link=link["url"])
    )
# BUILD LIST OF DATA FRAMES
food_dfs = [build_frame(link) for link in links]
# CONCATENATE TO SINGLE, MASTER DATA FRAME
food_df = pd.concat(food_dfs, ignore_index=True)
Output (first 5 links)
print(food_df)
FHRSID LocalAuthorityBusinessID BusinessName ... Latitude city link
0 982849 EHDC12463 1 & 30 DONALD DEWAR COURT ... 57.161308 Aberdeen City http://ratings.food.gov.uk/OpenDataFiles/FHRS7...
1 1523164 EHDC15005 12 KEBAB & PIZZA HOUSE ... 57.155210 Aberdeen City http://ratings.food.gov.uk/OpenDataFiles/FHRS7...
2 592387 EHDC10723 1906 RESTAURANT AT HMT ... 57.148161 Aberdeen City http://ratings.food.gov.uk/OpenDataFiles/FHRS7...
3 1418061 EHDC13206 1DS ... 57.145692 Aberdeen City http://ratings.food.gov.uk/OpenDataFiles/FHRS7...
4 593681 EHDC9793 210 BISTRO ... 57.142278 Aberdeen City http://ratings.food.gov.uk/OpenDataFiles/FHRS7...
... ... ... ... ... ... ... ...
7515 841 0004055/FH Wudus Chippy ... 53.000869 Amber Valley http://ratings.food.gov.uk/OpenDataFiles/FHRS0...
7516 430138 12/00059/FOOD Yarningdale Healthcare ... 53.039253 Amber Valley http://ratings.food.gov.uk/OpenDataFiles/FHRS0...
7517 1537300 0007044/HS Yasmine's Bakery ... 53.014123 Amber Valley http://ratings.food.gov.uk/OpenDataFiles/FHRS0...
7518 928824 0010280/HS Yum Yum Sweet Company ... 53.049512 Amber Valley http://ratings.food.gov.uk/OpenDataFiles/FHRS0...
7519 853573 0000069/FH Zion Methodist Church ... 53.092867 Amber Valley http://ratings.food.gov.uk/OpenDataFiles/FHRS0...
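Since the end goal was one csv file, the master frame can then be written out directly (again, the file name is only an example):
food_df.to_csv('FHRS_combined.csv', index=False)  # every establishment plus its city and source link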