Python iterparse large XML while filtering with elements and children-CodePudding

I am attempting to parse product data from icecat. The data comes in large xml files. (3-7gb).

In order to reduce the amount of product data I am bringing in, I need to filter this list before moving to my next step. In particular, I need to filter by the "Updated" and "On_Market" attributes of the "file" element, and, if the "Country_Markets" child exists, I need to check whether its (possible) list of children contains <Country_Market Value="US"/>.

I am able to get the On_Market filter to work, but can't figure out how to structure the check of the Country_Markets child, or implement the date filter.

Where I am currently:

PYTHON:

from xml.etree.ElementTree import iterparse

# First attempt: stream index.xml with the standard-library iterparse and collect
# one dict per <file> element that passes the market / date filters.
# NOTE(review): this snippet is the non-working attempt the question is about;
# the notes below mark where it goes wrong.
file_path = 'index.xml'
dict_list = []
# Cut-off timestamp; it is compared as a string against the "Updated" attribute below.
date = "20220803143328"

for _, elem in iterparse(file_path, events=("end",)):
    for child in elem:
        print(child)
        # Everything below is nested under this test, so a <file> without a
        # Country_Markets child can never reach the append.
        if child.tag == 'Country_Markets':
            # NOTE(review): `OR` is not a Python operator (the keyword is lowercase
            # `or`), so this line is a syntax error as written.
            # NOTE(review): in the sample XML, <Country_Markets> carries no attributes;
            # "US" is the Value attribute of its <Country_Market> children.
            # NOTE(review): child['Country_Markets'] indexes an Element with a string;
            # presumably a "child is missing" test was intended -- confirm.
            if child.attrib['Country_Market'] == "US" OR child['Country_Markets'] is None:
                if elem.tag == "file":
                    if elem.attrib['On_Market'] == "1":
                        if elem.attrib['Updated']>= date: 
                            dict_list.append({'IceId': elem.attrib['Product_ID'],
                                            'LastUpdate': elem.attrib['Updated'],
                                            'PartNum': elem.attrib['Prod_ID'],
                                            'OnMarket': elem.attrib['On_Market']})

        # NOTE(review): this sits inside `for child in elem`, so the element is
        # cleared once per child while its children are still being iterated.
        elem.clear()


# NOTE(review): `pd` is not imported in this snippet; presumably
# `import pandas as pd` exists elsewhere -- confirm.
df = pd.DataFrame(dict_list)

EDIT NEW PYTHON APPROACH

I am trying to implement a different package which seems to do the trick, except I haven't been able to figure out how to incorporate the EAN_UPCS into my output...

from lxml import etree

# Second attempt: walk each <file> element with lxml and print the fields of
# products that list "US" among their Country_Market children.
# `file_path` is not defined in this snippet; presumably it is reused from the
# earlier one -- confirm.
# NOTE(review): both "start" and "end" events are requested but `event` is never
# checked, so the body runs on both events of every <file> element.
context = etree.iterparse(file_path,  events=("start", "end"),)
for event, elem in context:
    if elem.tag == 'file':
        for child1 in elem:
            # Only <file> elements that have a Country_Markets child can print
            # anything; files without that child are silently skipped.
            if child1.tag == 'Country_Markets':
                for child2 in child1:
                    if child2.attrib['Value'] == "US":
                        if elem.attrib['On_Market'] == "1":
                            # Fixed-width timestamp strings are compared as text.
                            if elem.attrib['Updated']>= "20220803143328":
                                print(f"'IceId': {elem.attrib['Product_ID']}")
                                print(f"'LastUpdate': {elem.attrib['Updated']}")
                                print(f"'PartNum': {elem.attrib['Prod_ID']}")
                                print(f"'OnMarket': {elem.attrib['On_Market']}")

                                # NOTE(review): this test is nested inside the branch where
                                # child1.tag == 'Country_Markets', so it can never be true;
                                # the EAN_UPCS values are therefore never printed.
                                if child1.tag == 'EAN_UPCS':
                                    # Rebinds child2, the loop variable of the enclosing loop.
                                    for child2 in child1:
                                        if child2.attrib['IsApproved']  == "1":
                                            print(child2.attrib['Value'])

EDIT 2:

I can get the result I am after if I run segments of the loop, but once I run the entire loop, I either lose information, or append all information.

# Third attempt ("EDIT 2"): on every </file> end event, derive three "yes" flags
# (US distribution, recent update, on market) plus the approved UPC list, and
# keep the product only when all three flags are "yes".
dict_list = []

context = etree.iterparse(file_path,  events=("end",))
for event, elem in context:
    if elem.tag == 'file':
        # NOTE(review): `id` shadows the built-in of the same name.
        id = elem.attrib['Product_ID']
        print(id)
        # Assigned but never read again in this snippet.
        valid = "no"
        # Empty tuples act as "not decided yet" markers; they never equal "yes".
        dist = ()
        last_date = ()
        market = ()
        upc_list = []
        if elem.attrib['On_Market'] == "1":
            market = "yes"
        # Fixed-width timestamp strings are compared as text.
        if elem.attrib['Updated']>= "20170803143328":
            last_date = "yes"
        # NOTE(review): a <file> with no children at all (e.g. Product_ID 1414 in
        # the sample XML) never enters this loop, so `dist` stays () and the
        # product is dropped even though it has no Country_Markets.
        for child1 in elem:
            if child1.tag =='Country_Markets': 
                print('markets found')
                for child2 in child1:
                    if child2.attrib['Value'] == "US":
                        dist = "yes"

                    else:
                        if dist != "yes":
                            dist = "no"
                    # Keeps an earlier "yes" from being overwritten by later non-US markets.
            elif elem.find("Country_Markets") is None:
                print("No Markets")
                dist = "yes"

            if child1.tag == 'EAN_UPCS':
                    for child2 in child1:
                        if child2.attrib['IsApproved']  == "1":
                            upc_list.append(child2.attrib['Value'])


        print(id, dist, last_date, market, upc_list)     
        if dist == "yes" and  last_date == "yes" and market == "yes":   
            dict_list.append({elem.attrib['Product_ID']:{'PartNum':elem.attrib['Prod_ID'], 'Updated' : elem.attrib['Updated'], 'UPCs': upc_list}} )
        # NOTE(review): `continue` skips the clear() below for <file> elements, so
        # the processed <file> elements themselves are never cleared.
        continue
    # NOTE(review): this runs on the end event of every non-<file> element, and a
    # child's end event comes before its parent's. Element.clear() drops an
    # element's attributes and children, so by the time </file> is handled the
    # Country_Market / EAN_UPC data inspected above is already gone.
    elem.clear()
del context

dict_list

XML:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE ICECAT-interface SYSTEM "https://data.icecat.biz/dtd/files.index.dtd">
<!--source: Icecat.biz 2022-->
<ICECAT-interface xmlns:xsi="https://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://data.icecat.biz/xsd/files.index.xsd">
    <files.index Generated="20220930000002">
        <file path="export/level4/US/1402.xml" Limited="No" HighPic="https://images.icecat.biz/img/gallery/1402_9631004284.jpg" HighPicSize="744457" HighPicWidth="2670" HighPicHeight="1407" Product_ID="1402" Updated="20220212085227" Quality="ICECAT" Prod_ID="C4893A" Supplier_id="1" Catid="377" On_Market="1" Model_Name="80 Value Pack 350-ml Yellow DesignJet Ink Cartridge and Printhead" Product_View="92380" Date_Added="20051028000000">
            <Country_Markets>
                <Country_Market Value="BE"/>
                <Country_Market Value="FR"/>
                <Country_Market Value="US"/>
                <Country_Market Value="GB"/>
                <Country_Market Value="DE"/>
                <Country_Market Value="CH"/>
                <Country_Market Value="IT"/>
                <Country_Market Value="CA"/>
            </Country_Markets>
        </file>
        <file path="export/level4/US/1414.xml" Limited="No" HighPic="https://images.icecat.biz/img/norm/high/1414-HP.jpg" HighPicSize="43288" HighPicWidth="400" HighPicHeight="400" Product_ID="1414" Updated="20220711134129" Quality="ICECAT" Prod_ID="C6614NE" Supplier_id="1" Catid="377" On_Market="1" Model_Name="C6614NE" Product_View="98879" Date_Added="20051023000000">
        </file>
        <file path="export/level4/US/1415.xml" Limited="No" HighPic="https://images.icecat.biz/img/norm/high/1415-HP.jpg" HighPicSize="43235" HighPicWidth="400" HighPicHeight="400" Product_ID="1415" Updated="20190404035203" Quality="ICECAT" Prod_ID="51650CE" Supplier_id="1" Catid="377" On_Market="1" Model_Name="50 Cyan Inkjet Print Cartridge" Product_View="60706" Date_Added="20051023000000">
            <EAN_UPCS>
                <EAN_UPC Value="0088698200223" IsApproved="0" Format="GTIN-13"/>
                <EAN_UPC Value="088698200223" IsApproved="0" Format="GTIN-12"/>
            </EAN_UPCS>
            <Country_Markets>
                <Country_Market Value="BE"/>
                <Country_Market Value="DE"/>
                <Country_Market Value="IT"/>
                <Country_Market Value="UA"/>
                <Country_Market Value="DZ"/>
            </Country_Markets>
        </file>
        <file path="export/level4/US/7966778.xml" Limited="No" HighPic="https://inishop.com/img/norm/high/7966778-4280.jpg" HighPicSize="814349" HighPicWidth="2761" HighPicHeight="1600" Product_ID="7966778" Updated="20201106094740" Quality="ICECAT" Prod_ID="AX3U1600XC2G79-3X" Supplier_id="2634" Catid="911" On_Market="1" Model_Name="XPG Xtreme Series, DDR3, 1600 MHz, CL7, 6GB (2GB x 3)" Product_View="7328" Date_Added="20110223000000">
            <EAN_UPCS>
                <EAN_UPC Value="4713435791172" IsApproved="1" Format="GTIN-13"/>
            </EAN_UPCS>
            <Country_Markets>
                <Country_Market Value="US"/>
            </Country_Markets>
        </file>
        <file path="export/level4/US/7966779.xml" Limited="No" HighPic="https://inishop.com/img/norm/high/7966778-4280.jpg" HighPicSize="793195" HighPicWidth="2761" HighPicHeight="1600" Product_ID="7966779" Updated="20201106094740" Quality="ICECAT" Prod_ID="AX3U1600XC4G79-3X" Supplier_id="2634" Catid="911" On_Market="1" Model_Name="XPG Xtreme Series, DDR3, 1600 MHz, CL7, 6GB (2GB x 3)" Product_View="6515" Date_Added="20110223000000">
            <EAN_UPCS>
                <EAN_UPC Value="4713435791714" IsApproved="1" Format="GTIN-13"/>
            </EAN_UPCS>
            <Country_Markets>
                <Country_Market Value="LU"/>
                <Country_Market Value="CH"/>
            </Country_Markets>
        </file>
        <file path="export/level4/US/7966780.xml" Limited="No" HighPic="https://inishop.com/img/norm/high/7966780-2331.jpg" HighPicSize="724700" HighPicWidth="2761" HighPicHeight="1600" Product_ID="7966780" Updated="20201106094740" Quality="ICECAT" Prod_ID="AX3U1600XC4G79-2X" Supplier_id="2634" Catid="911" On_Market="1" Model_Name="XPG Xtreme Series, DDR3, 1600 MHz, CL7, 8GB (4GB x 2)" Product_View="6902" Date_Added="20110223000000">
            <EAN_UPCS>
                <EAN_UPC Value="4713435791707" IsApproved="1" Format="GTIN-13"/>
            </EAN_UPCS>
            <Country_Markets>
                <Country_Market Value="LU"/>
                <Country_Market Value="CH"/>
            </Country_Markets>
        </file>
    </files.index>
</ICECAT-interface>

The goal is to be able to filter out any items that are older than a certain date, not on market, and explicitly not for sale in the US. (Items that don't have Country_Markets elements are to be included in the output.)

CodePudding user response：

Consider this adjusted iterparse approach without the inner loops across child elements. Below uses flag variables to be turned on and off conditionally while walking down the tree. A dictionary of data is built and iteratively appended within <file>...</file> context.

from lxml import etree 
#from xml.etree import ElementTree as etree

# Stream the index file and keep one dict per <file> element that
#   * was updated on or after `min_updated`,
#   * is on the market (On_Market == "1"), and
#   * is either sold in the US or lists no Country_Market at all.
# The accepted records end up in `files_data`.
file_path = "Input.xml"
# Timestamps are fixed-width YYYYMMDDhhmmss strings, so comparing them as text
# orders them chronologically.
min_updated = "20220803143328"
files_data = []

# State of the <file> currently being read; reset on every <file> start event.
data = None          # record under construction, or None if the file is filtered out
has_markets = False  # at least one <Country_Market> child was seen
sold_in_us = False   # one of those children has Value="US"

context = etree.iterparse(file_path, events=("start", "end"))
for event, elem in context:
    if event == "start":
        if elem.tag == "file":
            has_markets = False
            sold_in_us = False
            attrs = elem.attrib
            if attrs["Updated"] >= min_updated and attrs["On_Market"] == "1":
                data = {
                    "IceId": attrs["Product_ID"],
                    "LastUpdate": attrs["Updated"],
                    "PartNum": attrs["Prod_ID"],
                    "OnMarket": attrs["On_Market"],
                    "EAN_UPCS": [],
                }
            else:
                # Rejected by the attribute filters: ignore its children entirely.
                data = None
        elif data is not None:
            if elem.tag == "EAN_UPC":
                if elem.attrib["IsApproved"] == "1":
                    data["EAN_UPCS"].append(elem.attrib["Value"])
            elif elem.tag == "Country_Market":
                has_markets = True
                if elem.attrib["Value"] == "US":
                    sold_in_us = True
                    data["Country_Market"] = "US"
    elif elem.tag == "file":
        # End of a <file>: every child has been seen, so the market decision is final.
        if data is not None and (sold_in_us or not has_markets):
            files_data.append(data)
        data = None
        # Release the finished subtree so memory does not keep growing with the
        # size of the input file.
        elem.clear()

Output

For demonstration, the results below were produced with the >= "20220803143328" date condition removed, since the posted XML does not contain any dates in that range.

from pprint import pprint

pprint(files_data)
[{'Country_Market': 'US',
  'EAN_UPCS': [],
  'IceId': '1402',
  'LastUpdate': '20220212085227',
  'OnMarket': '1',
  'PartNum': 'C4893A'},
 {'EAN_UPCS': [],
  'IceId': '1414',
  'LastUpdate': '20220711134129',
  'OnMarket': '1',
  'PartNum': 'C6614NE'},
 {'Country_Market': 'US',
  'EAN_UPCS': ['4713435791172'],
  'IceId': '7966778',
  'LastUpdate': '20201106094740',
  'OnMarket': '1',
  'PartNum': 'AX3U1600XC2G79-3X'}]

CodePudding user response：

Consider iterparsing the entire XML and then filter afterwards to avoid the logic checks while traversing the varying designs of the <file> elements. One way can be with your use of pandas.

Specifically, try the new large-XML support (which uses iterparse with either the lxml or the etree parser) introduced in pandas v1.5 for the relatively new pandas.read_xml IO method. See the docs, which show at the end the parsing of Wikipedia's 12 GB XML dump in minutes (tested on an 8 GB RAM laptop)!

Also in v1.5, read_xml now supports dtype handling like other IO modules (i.e., read_csv, read_json) such as converting the Updated attribute value to datetime. You can even rename columns accordingly:

import pandas as pd

# <file> attribute to read -> column name it gets in the resulting frame.
# Dict order is the column order.
column_for_attribute = {
    "Product_ID": "IceID",
    "Updated": "LastUpdate",
    "Prod_ID": "PartNum",
    "On_Market": "OnMarket",
}

products_df = pd.read_xml(
    "Input.xml",
    iterparse={"file": list(column_for_attribute)},
    names=list(column_for_attribute.values()),
    # Applied to the renamed column, hence "LastUpdate" rather than "Updated".
    parse_dates=["LastUpdate"],
)

print(products_df)
#      IceID          LastUpdate            PartNum  OnMarket
# 0     1402 2022-02-12 08:52:27             C4893A         1
# 1     1414 2022-07-11 13:41:29            C6614NE         1
# 2     1415 2019-04-04 03:52:03            51650CE         1
# 3  7966778 2020-11-06 09:47:40  AX3U1600XC2G79-3X         1
# 4  7966779 2020-11-06 09:47:40  AX3U1600XC4G79-3X         1
# 5  7966780 2020-11-06 09:47:40  AX3U1600XC4G79-2X         1

print(products_df.dtypes)
# IceID                  int64
# LastUpdate    datetime64[ns]
# PartNum               object
# OnMarket               int64
# dtype: object

Since the @Value attribute is used by both the EAN_UPC and the Country_Market elements, you need to repeat it in the list and give the repeats generalized names. The call below parses the first 15 @Value attributes under each <file> element. Notice that read_xml drops the columns that are entirely missing, since at most 8 @Value attributes are ever used in the sample!

# Attributes taken from <file> itself, and the column names they are given.
file_attributes = ["Product_ID", "Updated", "Prod_ID", "On_Market"]
file_columns = ["IceID", "LastUpdate", "PartNum", "OnMarket"]

# "Value" is requested this many times and the repeats are named
# Value_1 .. Value_15.
value_slots = 15

products_df = pd.read_xml(
    "Input.xml",
    iterparse={"file": file_attributes + ["Value"] * value_slots},
    names=file_columns + [f"Value_{slot}" for slot in range(1, value_slots + 1)],
    parse_dates=["LastUpdate"],
)

print(products_df)
#      IceID          LastUpdate            PartNum  OnMarket        Value_1       Value_2 Value_3 Value_4 Value_5 Value_6 Value_7 Value_8
# 0     1402 2022-02-12 08:52:27             C4893A         1             BE            FR      US      GB      DE      CH      IT      CA
# 1     1414 2022-07-11 13:41:29            C6614NE         1           None          None    None    None    None    None    None    None
# 2     1415 2019-04-04 03:52:03            51650CE         1  0088698200223  088698200223      BE      DE      IT      UA      DZ    None
# 3  7966778 2020-11-06 09:47:40  AX3U1600XC2G79-3X         1  4713435791172            US    None    None    None    None    None    None
# 4  7966779 2020-11-06 09:47:40  AX3U1600XC4G79-3X         1  4713435791714            LU      CH    None    None    None    None    None
# 5  7966780 2020-11-06 09:47:40  AX3U1600XC4G79-2X         1  4713435791707            LU      CH    None    None    None    None    None

print(products_df.dtypes)
# IceID                  int64
# LastUpdate    datetime64[ns]
# PartNum               object
# OnMarket               int64
# Value_1               object
# Value_2               object
# Value_3               object
# Value_4               object
# Value_5               object
# Value_6               object
# Value_7               object
# Value_8               object
# dtype: object

From there, consider various pandas methods (DataFrame.loc, DataFrame.query, etc.) to filter data by LastUpdate date, OnMarket values, and any 'US' or all empty two-character fields. Possibly reshape to long with DataFrame.melt to separate and capture EAN values.