I am attempting to parse product data from Icecat. The data comes in large XML files (3-7 GB).
In order to reduce the amount of product data I am bringing in, I need to filter this list before moving to my next step. In particular, I need to filter by the "Updated" and "On_Market" attributes of the "file" element, and, IF the "Country_Markets" child exists, I need to check whether its (possibly several) children include <Country_Market Value="US"/>.
I am able to get the On_Market filter to work, but can't figure out how to structure the check of the Country_Markets child, or implement the date filter.
Where I am currently:
PYTHON:
from xml.etree.ElementTree import iterparse

import pandas as pd

file_path = 'index.xml'
dict_list = []
# "Updated" values are fixed-width YYYYMMDDhhmmss strings, so comparing them
# as plain strings orders them chronologically.
date = "20220803143328"

# Stream the document instead of loading it whole. Only the "end" event of a
# <file> element is acted on: at that point its children have been parsed.
for _, elem in iterparse(file_path, events=("end",)):
    if elem.tag != "file":
        # Leave descendants alone so they are still attached when the
        # enclosing <file> is evaluated.
        continue

    markets = elem.find("Country_Markets")
    # A <file> with no Country_Markets child is kept; otherwise one of the
    # Country_Market children must carry Value="US".
    sold_in_us = markets is None or any(
        market.get("Value") == "US" for market in markets
    )

    if (
        sold_in_us
        and elem.get("On_Market") == "1"
        and elem.get("Updated", "") >= date
    ):
        dict_list.append({
            'IceId': elem.attrib['Product_ID'],
            'LastUpdate': elem.attrib['Updated'],
            'PartNum': elem.attrib['Prod_ID'],
            'OnMarket': elem.attrib['On_Market'],
        })

    # Drop the attributes and children of the <file> that was just handled.
    elem.clear()

df = pd.DataFrame(dict_list)
EDIT NEW PYTHON APPROACH
I am trying to implement a different package which seems to do the trick, except I haven't been able to figure out how to incorporate the EAN_UPCS into my output...
from lxml import etree

# Listen for "end" events only: on "start" the children of a <file> have not
# necessarily been parsed yet, so they cannot be inspected reliably.
context = etree.iterparse(file_path, events=("end",))
for _, elem in context:
    if elem.tag != 'file':
        continue

    markets = elem.find('Country_Markets')
    # No Country_Markets child means the product is not restricted and is
    # kept; otherwise "US" has to be among the listed markets.
    sold_in_us = markets is None or any(
        market.get('Value') == "US" for market in markets
    )

    # The timestamp is a fixed-width YYYYMMDDhhmmss string, so a string
    # comparison is a chronological comparison.
    if (
        sold_in_us
        and elem.attrib['On_Market'] == "1"
        and elem.attrib['Updated'] >= "20220803143328"
    ):
        print(f"'IceId': {elem.attrib['Product_ID']}")
        print(f"'LastUpdate': {elem.attrib['Updated']}")
        print(f"'PartNum': {elem.attrib['Prod_ID']}")
        print(f"'OnMarket': {elem.attrib['On_Market']}")
        # Approved barcodes are reported only for products that passed the
        # filter above.
        for upc in elem.iterfind('EAN_UPCS/EAN_UPC'):
            if upc.get('IsApproved') == "1":
                print(upc.attrib['Value'])

    # Release the processed <file> and the siblings before it so memory use
    # stays flat on multi-gigabyte input.
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]
EDIT 2:
I can get the result I am after if I run segments of the loop, but once I run the entire loop, I either lose information, or append all information.
dict_list = []
context = etree.iterparse(file_path, events=("end",))
for _, elem in context:
    if elem.tag != 'file':
        continue

    # Each criterion is evaluated once per <file> as a plain boolean.
    on_market = elem.attrib['On_Market'] == "1"
    # Fixed-width YYYYMMDDhhmmss strings compare chronologically.
    recent_enough = elem.attrib['Updated'] >= "20170803143328"

    # Look the child up directly instead of inferring its absence while
    # looping over children: a <file> with no children at all must also
    # count as "no market restriction".
    markets = elem.find('Country_Markets')
    sold_in_us = markets is None or any(
        market.get('Value') == "US" for market in markets
    )

    if on_market and recent_enough and sold_in_us:
        upc_list = [
            upc.attrib['Value']
            for upc in elem.iterfind('EAN_UPCS/EAN_UPC')
            if upc.get('IsApproved') == "1"
        ]
        dict_list.append({
            elem.attrib['Product_ID']: {
                'PartNum': elem.attrib['Prod_ID'],
                'Updated': elem.attrib['Updated'],
                'UPCs': upc_list,
            }
        })

    # Always free the element, whether or not it matched, and drop the
    # already-processed siblings that precede it.
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]
del context
dict_list
XML:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE ICECAT-interface SYSTEM "https://data.icecat.biz/dtd/files.index.dtd">
<!--source: Icecat.biz 2022-->
<ICECAT-interface xmlns:xsi="https://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://data.icecat.biz/xsd/files.index.xsd">
<files.index Generated="20220930000002">
<file path="export/level4/US/1402.xml" Limited="No" HighPic="https://images.icecat.biz/img/gallery/1402_9631004284.jpg" HighPicSize="744457" HighPicWidth="2670" HighPicHeight="1407" Product_ID="1402" Updated="20220212085227" Quality="ICECAT" Prod_ID="C4893A" Supplier_id="1" Catid="377" On_Market="1" Model_Name="80 Value Pack 350-ml Yellow DesignJet Ink Cartridge and Printhead" Product_View="92380" Date_Added="20051028000000">
<Country_Markets>
<Country_Market Value="BE"/>
<Country_Market Value="FR"/>
<Country_Market Value="US"/>
<Country_Market Value="GB"/>
<Country_Market Value="DE"/>
<Country_Market Value="CH"/>
<Country_Market Value="IT"/>
<Country_Market Value="CA"/>
</Country_Markets>
</file>
<file path="export/level4/US/1414.xml" Limited="No" HighPic="https://images.icecat.biz/img/norm/high/1414-HP.jpg" HighPicSize="43288" HighPicWidth="400" HighPicHeight="400" Product_ID="1414" Updated="20220711134129" Quality="ICECAT" Prod_ID="C6614NE" Supplier_id="1" Catid="377" On_Market="1" Model_Name="C6614NE" Product_View="98879" Date_Added="20051023000000">
</file>
<file path="export/level4/US/1415.xml" Limited="No" HighPic="https://images.icecat.biz/img/norm/high/1415-HP.jpg" HighPicSize="43235" HighPicWidth="400" HighPicHeight="400" Product_ID="1415" Updated="20190404035203" Quality="ICECAT" Prod_ID="51650CE" Supplier_id="1" Catid="377" On_Market="1" Model_Name="50 Cyan Inkjet Print Cartridge" Product_View="60706" Date_Added="20051023000000">
<EAN_UPCS>
<EAN_UPC Value="0088698200223" IsApproved="0" Format="GTIN-13"/>
<EAN_UPC Value="088698200223" IsApproved="0" Format="GTIN-12"/>
</EAN_UPCS>
<Country_Markets>
<Country_Market Value="BE"/>
<Country_Market Value="DE"/>
<Country_Market Value="IT"/>
<Country_Market Value="UA"/>
<Country_Market Value="DZ"/>
</Country_Markets>
</file>
<file path="export/level4/US/7966778.xml" Limited="No" HighPic="https://inishop.com/img/norm/high/7966778-4280.jpg" HighPicSize="814349" HighPicWidth="2761" HighPicHeight="1600" Product_ID="7966778" Updated="20201106094740" Quality="ICECAT" Prod_ID="AX3U1600XC2G79-3X" Supplier_id="2634" Catid="911" On_Market="1" Model_Name="XPG Xtreme Series, DDR3, 1600 MHz, CL7, 6GB (2GB x 3)" Product_View="7328" Date_Added="20110223000000">
<EAN_UPCS>
<EAN_UPC Value="4713435791172" IsApproved="1" Format="GTIN-13"/>
</EAN_UPCS>
<Country_Markets>
<Country_Market Value="US"/>
</Country_Markets>
</file>
<file path="export/level4/US/7966779.xml" Limited="No" HighPic="https://inishop.com/img/norm/high/7966778-4280.jpg" HighPicSize="793195" HighPicWidth="2761" HighPicHeight="1600" Product_ID="7966779" Updated="20201106094740" Quality="ICECAT" Prod_ID="AX3U1600XC4G79-3X" Supplier_id="2634" Catid="911" On_Market="1" Model_Name="XPG Xtreme Series, DDR3, 1600 MHz, CL7, 6GB (2GB x 3)" Product_View="6515" Date_Added="20110223000000">
<EAN_UPCS>
<EAN_UPC Value="4713435791714" IsApproved="1" Format="GTIN-13"/>
</EAN_UPCS>
<Country_Markets>
<Country_Market Value="LU"/>
<Country_Market Value="CH"/>
</Country_Markets>
</file>
<file path="export/level4/US/7966780.xml" Limited="No" HighPic="https://inishop.com/img/norm/high/7966780-2331.jpg" HighPicSize="724700" HighPicWidth="2761" HighPicHeight="1600" Product_ID="7966780" Updated="20201106094740" Quality="ICECAT" Prod_ID="AX3U1600XC4G79-2X" Supplier_id="2634" Catid="911" On_Market="1" Model_Name="XPG Xtreme Series, DDR3, 1600 MHz, CL7, 8GB (4GB x 2)" Product_View="6902" Date_Added="20110223000000">
<EAN_UPCS>
<EAN_UPC Value="4713435791707" IsApproved="1" Format="GTIN-13"/>
</EAN_UPCS>
<Country_Markets>
<Country_Market Value="LU"/>
<Country_Market Value="CH"/>
</Country_Markets>
</file>
</files.index>
</ICECAT-interface>
The goal is to filter out any items that are older than a certain date, are not on the market, or are explicitly not for sale in the US (i.e. they have a Country_Markets element that does not list US). Items that don't have a Country_Markets element are to be included in the output.
CodePudding user response:
Consider this adjusted iterparse approach without the inner loops across child elements. The code below uses flag variables that are turned on and off conditionally while walking down the tree. A dictionary of data is built for each <file>...</file> element and appended to the result list.
from lxml import etree
# The standard library parser can be swapped in instead:
#from xml.etree import ElementTree as etree

file_path = "Input.xml"
files_data = []

# State for the <file> element currently being walked. Initialised up front
# so a stray child element seen before any <file> cannot hit an unset name.
data = {}
keep = False          # <file> is on the market and recent enough
has_markets = False   # at least one <Country_Market> child was seen
us_flag = False       # one of those children has Value="US"

context = etree.iterparse(file_path, events=("start", "end"))

for event, elem in context:
    if event == "start":
        if elem.tag == "file":
            # Reset the state and evaluate the attribute-only filters.
            # Fixed-width YYYYMMDDhhmmss strings compare chronologically.
            data = {}
            has_markets = False
            us_flag = False
            keep = (
                elem.attrib['On_Market'] == "1"
                and elem.attrib['Updated'] >= "20220803143328"
            )
            if keep:
                data['IceId'] = elem.attrib['Product_ID']
                data['LastUpdate'] = elem.attrib['Updated']
                data['PartNum'] = elem.attrib['Prod_ID']
                data['OnMarket'] = elem.attrib['On_Market']
                data['EAN_UPCS'] = []
        elif keep and elem.tag == "EAN_UPC":
            if elem.attrib['IsApproved'] == "1":
                data['EAN_UPCS'].append(elem.attrib['Value'])
        elif keep and elem.tag == "Country_Market":
            has_markets = True
            if elem.attrib["Value"] == "US":
                us_flag = True
                data['Country_Market'] = "US"
    elif elem.tag == "file":
        # End of a <file>: record it only if it passed the attribute filters
        # and is either sold in the US or has no market list at all.
        if keep and (us_flag or not has_markets):
            files_data.append(data)
        # Drop the element's attributes and children once it is handled.
        elem.clear()
Output
For demonstration, the results below were produced with the >= "20220803143328" condition removed, since the posted XML does not contain dates in that range.
# Pretty-print the list of collected records, one dictionary per kept <file>.
from pprint import pprint
pprint(files_data)
[{'Country_Market': 'US',
'EAN_UPCS': [],
'IceId': '1402',
'LastUpdate': '20220212085227',
'OnMarket': '1',
'PartNum': 'C4893A'},
{'EAN_UPCS': [],
'IceId': '1414',
'LastUpdate': '20220711134129',
'OnMarket': '1',
'PartNum': 'C6614NE'},
{'Country_Market': 'US',
'EAN_UPCS': ['4713435791172'],
'IceId': '7966778',
'LastUpdate': '20201106094740',
'OnMarket': '1',
'PartNum': 'AX3U1600XC2G79-3X'}]
CodePudding user response:
Consider iterparsing the entire XML and then filtering afterwards, to avoid the logic checks while traversing the varying designs of the <file> elements. One way to do this is with pandas, which you are already using.
Specifically, try the new large XML support (which uses iterparse with either the lxml or etree parser) introduced in the current v1.5 for the relatively new pandas.read_xml IO method. See the docs, which show at the end the parsing of Wikipedia's 12 GB XML dump in minutes (tested on an 8 GB RAM laptop)!
Also in v1.5, read_xml now supports dtype handling like other IO methods (i.e., read_csv, read_json), such as converting the Updated attribute value to datetime. You can even rename columns accordingly:
import pandas as pd

# Attribute read from every <file> element -> column name in the result.
column_names = {
    "Product_ID": "IceID",
    "Updated": "LastUpdate",
    "Prod_ID": "PartNum",
    "On_Market": "OnMarket",
}

# Passing `iterparse` makes read_xml stream the document rather than load it
# whole; `names` renames the columns positionally.
products_df = pd.read_xml(
    "Input.xml",
    iterparse={"file": list(column_names)},
    names=list(column_names.values()),
    parse_dates=["LastUpdate"],
)

print(products_df)
# IceID LastUpdate PartNum OnMarket
# 0 1402 2022-02-12 08:52:27 C4893A 1
# 1 1414 2022-07-11 13:41:29 C6614NE 1
# 2 1415 2019-04-04 03:52:03 51650CE 1
# 3 7966778 2020-11-06 09:47:40 AX3U1600XC2G79-3X 1
# 4 7966779 2020-11-06 09:47:40 AX3U1600XC4G79-3X 1
# 5 7966780 2020-11-06 09:47:40 AX3U1600XC4G79-2X 1

print(products_df.dtypes)
# IceID int64
# LastUpdate datetime64[ns]
# PartNum object
# OnMarket int64
# dtype: object
Since the @Value attribute is used multiple times, by both the EAN_UPC and Country_Market elements, you need to repeat it in the list and rename the resulting columns generically. The code below parses the first 15 @Value attributes found under each <file> element. Notice that read_xml drops the columns that are entirely missing, since at most 8 @Value attributes are ever used under a single <file> in this sample!
# How many repeated "Value" attributes to capture per <file> element.
n_values = 15

# Attributes taken from <file> itself and the column names they receive.
file_attrs = ["Product_ID", "Updated", "Prod_ID", "On_Market"]
file_names = ["IceID", "LastUpdate", "PartNum", "OnMarket"]

# "Value" is listed once per occurrence to capture; the matching columns are
# numbered Value_1 .. Value_15 because `names` is applied positionally.
products_df = pd.read_xml(
    "Input.xml",
    iterparse={"file": file_attrs + ["Value"] * n_values},
    names=file_names + [f"Value_{i}" for i in range(1, n_values + 1)],
    parse_dates=["LastUpdate"],
)

print(products_df)
# IceID LastUpdate PartNum OnMarket Value_1 Value_2 Value_3 Value_4 Value_5 Value_6 Value_7 Value_8
# 0 1402 2022-02-12 08:52:27 C4893A 1 BE FR US GB DE CH IT CA
# 1 1414 2022-07-11 13:41:29 C6614NE 1 None None None None None None None None
# 2 1415 2019-04-04 03:52:03 51650CE 1 0088698200223 088698200223 BE DE IT UA DZ None
# 3 7966778 2020-11-06 09:47:40 AX3U1600XC2G79-3X 1 4713435791172 US None None None None None None
# 4 7966779 2020-11-06 09:47:40 AX3U1600XC4G79-3X 1 4713435791714 LU CH None None None None None
# 5 7966780 2020-11-06 09:47:40 AX3U1600XC4G79-2X 1 4713435791707 LU CH None None None None None

print(products_df.dtypes)
# IceID int64
# LastUpdate datetime64[ns]
# PartNum object
# OnMarket int64
# Value_1 object
# Value_2 object
# Value_3 object
# Value_4 object
# Value_5 object
# Value_6 object
# Value_7 object
# Value_8 object
# dtype: object
From there, consider various pandas methods (DataFrame.loc, DataFrame.query, etc.) to filter the data by LastUpdate date, by OnMarket value, and by rows where any of the two-character fields is 'US' or all of them are empty. Possibly reshape to long format with DataFrame.melt to separate and capture the EAN values.