I'm currently working with my script that can extract or compile all of data the from xml to csv, right now my script can run without error but I still need to find a way to get all the data in xml.
My script can able to compile the data and converted to csv but only first data in the xml can extract when the data converted to csv.
May I know what I need to change on my script?
columns = ["DESCRIPTION", "PRODUCT", "ID", "STATUS", "TYPE", "DELIVERYTIME", "DELIVERED", "REMARKS", "NUM"]
rows = []
xml_comp = BeautifulSoup(open('SAMPLE-22.xml'), 'xml')
DESCRIPTION = xml_comp.DESCRIPTION
PRODUCT = xml_comp.PRODUCT
ID = xml_comp.ID
STATUS = xml_comp.STATUS
TYPE = xml_comp.TYPE
DELIVERYTIME = xml_comp.DELIVERYTIME
DELIVERED = xml_comp.DELIVERED
REMARKS = xml_comp.DELIVERED
NUM = xml_comp.NUM
rows.append({
"DESCRIPTION": DESCRIPTION.text,
"PRODUCT": PRODUCT.text,
"ID" : ID.text,
"STATUS": STATUS.text,
"TYPE": TYPE.text,
"DELIVERYTIME": DELIVERYTIME.text,
"DELIVERED": DELIVERED.text,
"REMARKS": REMARKS.text,
"NUM": NUM.text})
df = pd.DataFrame(rows, columns=cols)
df.to_csv(f"myFileReport.csv")
XML DATA
<?xml version="1.0" encoding="UTF-8"?>
<QuerySAMPLEResponse
xmlns="http://www.sample.com"
xmlns:xsi="http://www.SAMPLE.com/2000/XMLSchema-instance" creationDateTime="2000-98-01T12:14:27 0" transLanguage="EN" baseLanguage="EN" rsStart="0">
<SAMPLEONLYSet>
<DESCRIPTION rowstamp="3894417710">
<DELIVERYTIME>2000-96-210 0</DELIVERYTIME>
<DESCRIPTION>PRODUCT10</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<ID>1000</ID>
<STATUS>PENDING</STATUS>
<TYPE>COMP10158533</TYPE>
<DESCRIPTIONSPEC rowstamp="3853626366">
<TESTVALUE>TEST-01</TESTVALUE>
<PRODID>TEST001</PRODID>
</DESCRIPTIONSPEC>
<DESCRIPTIONSPEC rowstamp="3853626367">
<TESTVALUE>PRODUCT 26</TESTVALUE>
<PRODID>TEST153</PRODID>
</DESCRIPTIONSPEC>
<DESCRIPTIONSPEC rowstamp="3853616252">
<PRODID>TEST418</PRODID>
</DESCRIPTIONSPEC>
<DESCRIPTION rowstamp="3853617719">
<DESCRIPTION>PRODUCT 25</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<ID>1000</ID>
<STATUS>COMPLETE</STATUS>
<TYPE>COMP10158534</TYPE>
</DESCRIPTION>
<DESCRIPTION rowstamp="3853617720">
<DESCRIPTION>PRODUCT 24</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<ID>1000</ID>
<STATUS>COMPLETE</STATUS>
<TYPE>COMP10158536</TYPE>
</DESCRIPTION>
<DESCRIPTION rowstamp="3853617721">
<DESCRIPTION>PRODUCT 23</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<ID>1000</ID>
<STATUS>COMPLETE</STATUS>
<TYPE>COMP10158543</TYPE>
</DESCRIPTION>
<DESCRIPTION rowstamp="3853617722">
<DESCRIPTION>PRODUCT 22</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<ID>1000</ID>
<STATUS>COMPLETE</STATUS>
<TYPE>COMP10158549</TYPE>
</DESCRIPTION>
<DESCRIPTION rowstamp="3894417704">
<DELIVERED>2000-97-230 0</DELIVERED>
<DELIVERYTIME>2000-96-210 0</DELIVERYTIME>
<DESCRIPTION> WINDOW TYPE</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<NUM>2000-97-240 0</NUM>
<REMARKS>2000-96-200 0</REMARKS>
<ID>1000</ID>
<STATUS>CLOSED</STATUS>
<TYPE>COMP10158557</TYPE>
</DESCRIPTION>
<DESCRIPTION rowstamp="3894417711">
<DESCRIPTION>PRODUCT123</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<NUM>2000-99-080 0</NUM>
<REMARKS>2000-96-020 0</REMARKS>
</DESCRIPTION>
</SAMPLEONLYSet>
</QuerySAMPLEONLYResponse>
CodePudding user response:
Since your XML is pretty flat, consider the new pandas.read_xml
(introduced in v1.3) with default lxml
as parser. This method can even handle default namespaces as included in your markup.
Assuming below fixed markup for proper XML:
XML
<?xml version="1.0" encoding="UTF-8"?>
<QuerySAMPLEResponse xmlns="http://www.sample.com" xmlns:xsi="http://www.SAMPLE.com/2000/XMLSchema-instance" creationDateTime="2000-98-01T12:14:27 0" transLanguage="EN" baseLanguage="EN" rsStart="0">
<SAMPLEONLYSet>
<DESCRIPTION rowstamp="3894417710">
<DELIVERYTIME>2000-96-210 0</DELIVERYTIME>
<DESCRIPTION>PRODUCT10</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<ID>1000</ID>
<STATUS>PENDING</STATUS>
<TYPE>COMP10158533</TYPE>
<DESCRIPTIONSPEC rowstamp="3853626366">
<TESTVALUE>TEST-01</TESTVALUE>
<PRODID>TEST001</PRODID>
</DESCRIPTIONSPEC>
<DESCRIPTIONSPEC rowstamp="3853626367">
<TESTVALUE>PRODUCT 26</TESTVALUE>
<PRODID>TEST153</PRODID>
</DESCRIPTIONSPEC>
<DESCRIPTIONSPEC rowstamp="3853616252">
<PRODID>TEST418</PRODID>
</DESCRIPTIONSPEC>
</DESCRIPTION>
<DESCRIPTION rowstamp="3853617719">
<DESCRIPTION>PRODUCT 25</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<ID>1000</ID>
<STATUS>COMPLETE</STATUS>
<TYPE>COMP10158534</TYPE>
</DESCRIPTION>
<DESCRIPTION rowstamp="3853617720">
<DESCRIPTION>PRODUCT 24</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<ID>1000</ID>
<STATUS>COMPLETE</STATUS>
<TYPE>COMP10158536</TYPE>
</DESCRIPTION>
<DESCRIPTION rowstamp="3853617721">
<DESCRIPTION>PRODUCT 23</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<ID>1000</ID>
<STATUS>COMPLETE</STATUS>
<TYPE>COMP10158543</TYPE>
</DESCRIPTION>
<DESCRIPTION rowstamp="3853617722">
<DESCRIPTION>PRODUCT 22</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<ID>1000</ID>
<STATUS>COMPLETE</STATUS>
<TYPE>COMP10158549</TYPE>
</DESCRIPTION>
<DESCRIPTION rowstamp="3894417704">
<DELIVERED>2000-97-230 0</DELIVERED>
<DELIVERYTIME>2000-96-210 0</DELIVERYTIME>
<DESCRIPTION>WINDOW TYPE</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<NUM>2000-97-240 0</NUM>
<REMARKS>2000-96-200 0</REMARKS>
<ID>1000</ID>
<STATUS>CLOSED</STATUS>
<TYPE>COMP10158557</TYPE>
</DESCRIPTION>
<DESCRIPTION rowstamp="3894417711">
<DESCRIPTION>PRODUCT123</DESCRIPTION>
<PRODUCT>DESKTOP733</PRODUCT>
<NUM>2000-99-080 0</NUM>
<REMARKS>2000-96-020 0</REMARKS>
</DESCRIPTION>
</SAMPLEONLYSet>
</QuerySAMPLEResponse>
Python
description_df = pd.read_xml(
xml, xpath=".//doc:SAMPLEONLYSet/doc:DESCRIPTION", namespaces={"doc": "http://www.sample.com"}
))
description_df
# rowstamp DELIVERYTIME DESCRIPTION PRODUCT ID STATUS TYPE DESCRIPTIONSPEC DELIVERED NUM REMARKS
# 0 3894417710 2000-96-210 0 PRODUCT10 DESKTOP733 1000.0 PENDING COMP10158533 NaN None None None
# 1 3853617719 None PRODUCT 25 DESKTOP733 1000.0 COMPLETE COMP10158534 NaN None None None
# 2 3853617720 None PRODUCT 24 DESKTOP733 1000.0 COMPLETE COMP10158536 NaN None None None
# 3 3853617721 None PRODUCT 23 DESKTOP733 1000.0 COMPLETE COMP10158543 NaN None None None
# 4 3853617722 None PRODUCT 22 DESKTOP733 1000.0 COMPLETE COMP10158549 NaN None None None
# 5 3894417704 2000-96-210 0 WINDOW TYPE DESKTOP733 1000.0 CLOSED COMP10158557 NaN 2000-97-230 0 2000-97-240 0 2000-96-200 0
# 6 3894417711 None PRODUCT123 DESKTOP733 NaN None None NaN None 2000-99-080 0 2000-96-020 0
CodePudding user response:
Specifically, with your implementation, you only receive the first set of data because you are not iterating through all <DESCRIPTION>
elements. Usually parsed data will return as an iterable like list.
Additionally, in your learning curve of Python and XML, consider a more Pythonic and XML parsing approach which includes:
- Use a conformant XML library such as the built-in
etree
or third-partylxml
which helps handle default namespaces as your current XML contains. In fact,BeautifulSoup
defaults tolxml
under the hood. - Use a defined namespace prefix to parse nodes under default namespace. See
namespaces
argument of lxml'sxpath
or etree'sfindall
. - Use list/dict comprehension rather than the bookkeeping of initializing list and appending items.
- Avoid repetitive code with defined methods such as parsing text on optional nodes that may not exist across all
<DESCRIPTION>
elements. Per your error, you receive.text
can only be available if element exists, so check if node exists prior to parsing text. - Use a conditional expression (or ternary operator) to concisely assign a value like checking existence of element.
import pandas as pd
import lxml.etree as lx
doc = lx.parse('SAMPLE-22.xml')
nmsp = {"doc": "http://www.sample.com"}
# DEFINED METHOD (WITH CONDITIONAL EXPRESSION) TO EXTRACT ELEMENT TEXT
def get_xml_text(row, node):
return (
row.find(node, namespaces=nmsp).text
if row.find(node, namespaces=nmsp) is not None
else None
)
# BUILD LIST OF DICTIONARIES
rows = [
{
"DESCRIPTION": get_xml_text(row, "doc:DESCRIPTION"),
"PRODUCT": get_xml_text(row, "doc:PRODUCT"),
"ID": get_xml_text(row, "doc:ID"),
"STATUS": get_xml_text(row, "doc:STATUS"),
"TYPE": get_xml_text(row, "doc:TYPE"),
"DELIVERYTIME": get_xml_text(row, "doc:DELIVERYTIME"),
"DELIVERED": get_xml_text(row, "doc:DELIVERED"),
"REMARKS": get_xml_text(row, "doc:REMARKS"),
"NUM": get_xml_text(row, "doc:NUM")
}
for row in doc.xpath(
".//doc:SAMPLEONLYSet/doc:DESCRIPTION", namespaces=nmsp
)
]
# BIND LIST INTO DATA FRAME
description_df = pd.DataFrame(rows)
Output
print(description_df)
# DESCRIPTION PRODUCT ID ... DELIVERED REMARKS NUM
# 0 PRODUCT10 DESKTOP733 1000 ... None None None
# 1 PRODUCT 25 DESKTOP733 1000 ... None None None
# 2 PRODUCT 24 DESKTOP733 1000 ... None None None
# 3 PRODUCT 23 DESKTOP733 1000 ... None None None
# 4 PRODUCT 22 DESKTOP733 1000 ... None None None
# 5 WINDOW TYPE DESKTOP733 1000 ... 2000-97-230 0 2000-96-200 0 2000-97-240 0
# 6 PRODUCT123 DESKTOP733 None ... None 2000-96-020 0 2000-99-080 0
Just recently learned about findtext
which avoids the conditional expression!
rows = [
{
"DESCRIPTION": row.findtext("doc:DESCRIPTION", namespaces=nmsp),
"PRODUCT": row.findtext("doc:PRODUCT", namespaces=nmsp),
"ID": row.findtext("doc:ID", namespaces=nmsp),
"STATUS": row.findtext("doc:STATUS", namespaces=nmsp),
"TYPE": row.findtext("doc:TYPE", namespaces=nmsp),
"DELIVERYTIME": row.findtext("doc:DELIVERYTIME", namespaces=nmsp),
"DELIVERED": row.findtext("doc:DELIVERED", namespaces=nmsp),
"REMARKS": row.findtext("doc:REMARKS", namespaces=nmsp),
"NUM": row.findtext("doc:NUM", namespaces=nmsp),
}
for row in doc.xpath(
".//doc:SAMPLEONLYSet/doc:DESCRIPTION", namespaces=nmsp
)
]
df = pd.DataFrame(rows)