I have written code that extracts all the XML file links on the webpage https://data.food.gov.uk/catalog/datasets/38dd8d6a-5ab1-4f50-b753-ab33288e3200 and puts them into a dataframe.
import urllib.request
import pandas as pd
from bs4 import BeautifulSoup
theurl = "https://data.food.gov.uk/catalog/datasets/38dd8d6a-5ab1-4f50-b753-ab33288e3200"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, 'html.parser')
project_href = [i['href'] for i in soup.find_all('a', href=True) if i['href'] != "#"]
print(project_href)
links = pd.DataFrame(project_href, columns=['Url'])
print(links.head(20))
# Drop the first n rows
n = 12
links.drop(index=links.index[:n], inplace=True)
# Drop the last b rows
b = 15
links.drop(links.tail(b).index, inplace=True)
# Drop one remaining unwanted row by index label
links.drop([409], axis=0, inplace=True)
Then I wrote code that extracts the data from the first link into a CSV file, but I need help getting it to loop through each XML link and append the data to the CSV file. There are over 300 links, so doing it manually would take a while.
def get_feed(url):
    """Scrapes an XML feed from the provided URL and returns XML source.
    Args:
        url (string): Fully qualified URL pointing to XML feed.
    Returns:
        source (string): XML source of scraped feed.
    """
    try:
        response = urllib.request.urlopen(urllib.request.Request(url, headers={'User-Agent': 'Mozilla'}))
        source = BeautifulSoup(response, 'lxml-xml', from_encoding=response.info().get_param('charset'))
        return source
    except Exception as e:
        print('Error: ' + str(e))
FEED_URL = links['Url'].iat[0]
xml = get_feed(FEED_URL)
def get_elements(xml, item='EstablishmentDetail'):
    """Returns the element names found inside the first matching item."""
    try:
        items = xml.find_all(item)
        elements = [element.name for element in items[0].find_all()]
        return elements
    except Exception as e:
        print('Error: ' + str(e))
def feed_to_df(url, item='EstablishmentDetail'):
    """Builds a dataframe with one row per item element in the feed at url."""
    xml = get_feed(url)
    elements = get_elements(xml)
    if isinstance(elements, list):
        df = pd.DataFrame(columns=elements)
        items = xml.find_all(item)
        for item in items:
            row = {}
            for element in elements:
                if xml.find(element):
                    if item.find(element):
                        row[element] = item.find(element).text
                    else:
                        row[element] = ''
                else:
                    row[element] = ''
            df = df.append(row, ignore_index=True)
        return df
df = feed_to_df(FEED_URL)
df.to_csv('C:/FDSA_3.csv', index=False)
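What I have in mind is something along these lines (just a rough, untested sketch reusing the functions above; FDSA_all.csv is a made-up output name), but I'm not sure this is the right way to append everything into one csv:
OUT_PATH = 'C:/FDSA_all.csv'
wrote_header = False
for url in links['Url']:
    df = feed_to_df(url)
    if df is None:
        continue  # feed_to_df prints the error and returns nothing on failure
    # write the header only for the first successful feed, then append rows without it
    df.to_csv(OUT_PATH, mode='a', header=not wrote_header, index=False)
    wrote_header = True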
Any help on this would be appreciated, thanks.
CodePudding user response:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from pprint import pp
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
allin = []
def get_soup(content):
    return BeautifulSoup(content, 'xml')

def worker(req, link):
    # download one XML file and return one dict per EstablishmentDetail element
    r = req.get(link)
    soup = get_soup(r.content)
    return [
        {x.name: x.get_text(strip=True) for x in est.find_all(recursive=False)}
        for est in soup.select('EstablishmentDetail')
    ]

def result(item):
    res = item.result()
    pp(res)
    allin.extend(res)
def main(url):
    with requests.Session() as req, ThreadPoolExecutor(max_workers=10) as executor:
        req.headers.update(headers)
        r = req.get(url)
        soup = get_soup(r.content)
        links = (i['href']
                 for i in soup.select('a.o-dataset-distribution--link:-soup-contains(xml)'))
        # submit one download per link, then collect the rows as the futures finish
        fs = [executor.submit(worker, req, link) for link in links]
        for future in as_completed(fs):
            result(future)
    df = pd.DataFrame(allin)
    print(df)

if __name__ == "__main__":
    main('https://data.food.gov.uk/catalog/datasets/38dd8d6a-5ab1-4f50-b753-ab33288e3200')
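If you also want the result in a csv file, as in your own code, you can replace the print(df) at the end of main with something like this (the file name is just an example):
df.to_csv('FDSA_all.csv', index=False)  # write the combined frame to a single csv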
CodePudding user response:
Since the XML files are fairly shallow, consider pandas.read_xml, and use lxml (the default underlying parser to BeautifulSoup) for simpler extraction of the XML links:
import urllib.request as urq
import lxml.etree as lx
import pandas as pd
# RETRIEVE HTML PAGE
links_url = "https://data.food.gov.uk/catalog/datasets/38dd8d6a-5ab1-4f50-b753-ab33288e3200"
with urq.urlopen(links_url) as response:
    links_page = response.read()
# BUILD LIST OF DICTS OF PARSED URLS
doc = lx.HTML(links_page)
dds = doc.xpath(".//dd[@class='c-dataset-element__distribution-files']/strong/a")
links = [{"city": d.attrib["title"], "url": d.attrib["href"]} for d in dds]
# USER-DEFINED METHOD TO BUILD DATA FRAME
def build_frame(link):
    return (
        pd.read_xml(link["url"], xpath=".//EstablishmentDetail")
        .join(pd.read_xml(link["url"], xpath=".//Geocode"))
        .drop("Geocode", axis="columns")
        .assign(city=link["city"], link=link["url"])
    )
# BUILD LIST OF DATA FRAMES
food_dfs = [build_frame(link) for link in links]
# CONCATENATE TO SINGLE, MASTER DATA FRAME
food_df = pd.concat(food_dfs, ignore_index=True)
Output (first 5 links)
print(food_df)
FHRSID LocalAuthorityBusinessID BusinessName ... Latitude city link
0 982849 EHDC12463 1 & 30 DONALD DEWAR COURT ... 57.161308 Aberdeen City http://ratings.food.gov.uk/OpenDataFiles/FHRS7...
1 1523164 EHDC15005 12 KEBAB & PIZZA HOUSE ... 57.155210 Aberdeen City http://ratings.food.gov.uk/OpenDataFiles/FHRS7...
2 592387 EHDC10723 1906 RESTAURANT AT HMT ... 57.148161 Aberdeen City http://ratings.food.gov.uk/OpenDataFiles/FHRS7...
3 1418061 EHDC13206 1DS ... 57.145692 Aberdeen City http://ratings.food.gov.uk/OpenDataFiles/FHRS7...
4 593681 EHDC9793 210 BISTRO ... 57.142278 Aberdeen City http://ratings.food.gov.uk/OpenDataFiles/FHRS7...
... ... ... ... ... ... ... ...
7515 841 0004055/FH Wudus Chippy ... 53.000869 Amber Valley http://ratings.food.gov.uk/OpenDataFiles/FHRS0...
7516 430138 12/00059/FOOD Yarningdale Healthcare ... 53.039253 Amber Valley http://ratings.food.gov.uk/OpenDataFiles/FHRS0...
7517 1537300 0007044/HS Yasmine's Bakery ... 53.014123 Amber Valley http://ratings.food.gov.uk/OpenDataFiles/FHRS0...
7518 928824 0010280/HS Yum Yum Sweet Company ... 53.049512 Amber Valley http://ratings.food.gov.uk/OpenDataFiles/FHRS0...
7519 853573 0000069/FH Zion Methodist Church ... 53.092867 Amber Valley http://ratings.food.gov.uk/OpenDataFiles/FHRS0...
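Since the end goal was one csv file, the master frame can then be written out directly (again, the file name is only an example):
food_df.to_csv('FHRS_combined.csv', index=False)  # every establishment plus its city and source link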