def intr_docs(xml_doc):
attr = xml_doc.attrib
for chld in root:
for value in chld:
for xml in xml_doc.iter('timestep'):
doc_dict = attr.copy()
doc_dict.update(xml.attrib)
doc_dict['time'] = chld.attrib['time']
doc_dict['id'] = value.attrib['id']
doc_dict['speed'] = value.attrib['speed']
doc_dict['lane'] = value.attrib['lane']
yield doc_dict
doc_df = pd.DataFrame(list(intr_docs(root)))
<timestep time="28800.00"/>
<timestep time="28890.00">
<vehicle id="800002" x="5534.41" y="3530.10" angle="243.59" type="HV" speed="14.58" pos="5.10" lane="-gneE13_2" slope="0.00"/>
<vehicle id="800003" x="4190.43" y="3359.53" angle="88.95" type="HV" speed="12.64" pos="5.10" lane="-5088609#2_0" slope="0.00"/>
<vehicle id="800017" x="1972.35" y="437.35" angle="306.80" type="HV" speed="15.17" pos="5.10" lane="-gneE5_2" slope="0.00"/>
<vehicle id="800021" x="9.34" y="-1.68" angle="42.62" type="HV" speed="13.32" pos="5.10" lane="-gneE6_1" slope="0.00"/>
<vehicle id="800034" x="2616.22" y="1599.61" angle="318.00" type="HV" speed="14.54" pos="5.10" lane="-gneE2_0" slope="0.00"/>
<vehicle id="800053" x="2915.19" y="2618.33" angle="290.01" type="HV" speed="16.52" pos="5.10" lane="-6200994#2_0" slope="0.00"/>
</timestep>
I am new to python. I have XML file (attached image)1 and want to convert it into the dataframe (attached image)2. I got the code and converted the file (13 MB) as I wanted but in the case of a large file (more than 500 MB) it is taking a lot of time. I am wondering it is due to code or my PC capacity. So how can I do it.
CodePudding user response:
Probably the default read_xml() method will perform better:
import pandas as pd
df = pd.read_xml(xml, xpath="//vehicle", attrs_only=True)
Using xpath
parameter tells pandas to use each vehicle
element as a dataframe row, attrs_only
parameter tells pandas to use attributes as row fields. This approach will take all attributes of vehicle
element, unnecessary fields can be dropped after dataframe creation.
Solution using SAX parser:
import xml.sax
from collections import defaultdict
import pandas as pd
# class handling SAX events
class TimestepHandler(xml.sax.ContentHandler):
# dictionary containing data for dataframe
data = defaultdict(list)
def __init__(self):
self.data = defaultdict(list)
self.time = None
# start element event handler
def startElement(self, tag, attributes):
# on 'timestep' element save time value
if tag == 'timestep':
self.time = attributes['time']
# on 'vehicle' element add attribute values to the dictionary
# reusing 'time' value from last 'timestep' element
if tag == 'vehicle':
self.data['time'].append(self.time)
self.data['id'].append(attributes['id'])
self.data['x'].append(attributes['x'])
self.data['y'].append(attributes['y'])
self.data['angle'].append(attributes['angle'])
self.data['type'].append(attributes['type'])
self.data['speed'].append(attributes['speed'])
self.data['pos'].append(attributes['pos'])
self.data['lane'].append(attributes['lane'])
self.data['slope'].append(attributes['slope'])
# create SAX parser with custom handler
parser = xml.sax.make_parser()
handler = TimestepHandler()
parser.setContentHandler(handler)
# parse XML file
parser.parse("file.xml")
# create pandas dataframe from dictionary
df = pd.DataFrame(handler.data)
df
Solution using lxml:
import pandas as pd
from lxml import etree
from collections import defaultdict
d = defaultdict(list)
doc = etree.parse('file.xml')
# retrieve 'vehicle' elements using xpath
vehicles = doc.xpath('.//vehicle')
# iterate over 'vehicle' elements
for v in vehicles:
d['id'].append(v.get('id'))
d['time'].append(v.xpath('../@time')[0])
d['x'].append(v.get('x'))
d['y'].append(v.get('y'))
d['angle'].append(v.get('angle'))
d['type'].append(v.get('type'))
d['speed'].append(v.get('speed'))
d['pos'].append(v.get('pos'))
d['lane'].append(v.get('lane'))
d['slope'].append(v.get('slope'))
# create pandas dataframe from dictionary
df = pd.DataFrame(d)
df