I want to split a large XML file into small branches and then parse only these parts. I am looking for the modification timestamp in the "mod_time" tag, which is available inside each "contact" tag, but my method call doesn't find the value. In some contacts, some tags are also missing completely.
I tried iterfind('tag_name'), iter(), and findall('tag_name'), but my program shows no result, and after hours I still can't figure out where my mistake is.
Here is my XML reduced to two elements:
<?xml version="1.0" encoding = "utf-8"?>
<phonebooks>
<phonebook name="Telefonbuch">
<contact>
<category>0</category>
<person>
<realName>Dummy, Name, Street</realName>
</person>
<telephony nid="1">
<number type="work" prio="1" id="0">012345678</number>
</telephony>
<services />
<setup />
<features doorphone="0" />
<mod_time>1587477163</mod_time>
<uniqueid>358</uniqueid>
</contact>
<contact>
<category>0</category>
<person>
<realName>Foto Name</realName>
</person>
<telephony nid="1">
<number type="home" prio="1" id="0">067856743</number>
</telephony>
<services />
<setup />
<features doorphone="0" />
<mod_time>1547749691</mod_time>
<uniqueid>68</uniqueid>
</contact>
</phonebook>
</phonebooks>
And here is what I have done so far:
import timeit
import xml.etree.ElementTree as ET
class Phonebook:
    """Stream an XML phone book and process each finished record element."""

    def __init__(self, xml_file, tag_node):
        """Walk ``xml_file`` and pass every matching element to ``Contact``.

        Args:
            xml_file: Path (or file object) of the XML document to read.
            tag_node: Sequence whose first item is the tag name of the
                record elements to process, e.g. ``['contact']``.
        """
        self.xml_file = xml_file
        self.tag_node = tag_node
        record_tag = self.tag_node[0]
        count = 0
        # Only 'end' events are needed: an element is complete, children
        # included, exactly when its 'end' event is reported.
        for _event, elem in ET.iterparse(self.xml_file, events=('end',)):
            if elem.tag != record_tag:
                continue
            par = Contact(elem, count)
            par.parse_node(elem, count)
            # Running index of the record (the original assigned ``i = 1``,
            # which never advanced past 1).
            count += 1
            # Release the record's children only after it has been parsed.
            elem.clear()
        print("Amount of contacts:", count)
class Contact:
    """Wrap one parsed record element of the phone book."""

    def __init__(self, branch, i):
        """Keep a reference to the record element.

        Args:
            branch: The element representing one record.
            i: Index of the record; accepted for the caller's convenience
                and not used by this class.
        """
        self.tree = branch

    def parse_node(self, branch, i, tag='mod_time'):
        """Print and return the text of every ``tag`` descendant of ``branch``.

        Args:
            branch: Element to search in.
            i: Index of the record; not used.
            tag: Tag name to look for at any depth below ``branch``.

        Returns:
            The text of each matching element in document order; an empty
            list when the tag does not occur.
        """
        texts = [node.text for node in branch.iterfind(f'.//{tag}')]
        for text in texts:
            print(text)
        return texts
def main():
    """Build a ``Phonebook`` for the sample file, selecting ``contact`` records."""
    # The instance is not needed afterwards, so it is not bound to a name.
    Phonebook('new _dummy1.xml', ['contact'])
if __name__ == '__main__':
    # Measure the wall-clock time of the complete run.
    started = timeit.default_timer()
    main()
    print('Finished')
    elapsed = timeit.default_timer() - started
    print("Runtime:", elapsed)
Output:
Amount of contacts: 2 Finished Runtime: 0.0006361000050674193
Expected output:
1587477163 1547749691
CodePudding user response:
Code
import timeit
import xml.etree.ElementTree as ET
class Phonebook:
    """Load an XML file and report the elements matched by a path selector."""

    def __init__(self, xml_file, selector):
        """Parse ``xml_file`` and print the match count and each match's text.

        Args:
            xml_file: Path (or file object) of the XML document.
            selector: ElementTree path expression passed to ``findall``.
        """
        self.xml_file, self.selector = xml_file, selector
        matches = ET.parse(xml_file).findall(selector)
        print("Amount of contacts:", len(matches))
        for match in matches:
            print(match.text)
def main():
    """Run the ``mod_time`` lookup against the sample phone book file."""
    source = './_dummy1.xml'
    selector = './/contact/mod_time'
    Phonebook(source, selector)
if __name__ == '__main__':
    # Time the script from just before main() until after the final message.
    begin = timeit.default_timer()
    main()
    print('Finished')
    duration = timeit.default_timer() - begin
    print("Runtime:", duration)
Output
$ python test.py
Amount of contacts: 2
1587477163
1547749691
Finished
Runtime: 0.0006627999973716214
CodePudding user response:
I have now solved my issue with the hand-over of the object data. I am posting my solution because it may be interesting for others who run into similar issues. Thanks to all who tried to help!
My changed code:
import psutil
import timeit
import xml.etree.ElementTree as ET
class Phonebook:
    """Read the ``<contact>`` records of a phone book XML file."""

    def __init__(self, file_path):
        """Remember which XML file to read.

        Args:
            file_path: Path of the phone book XML file.
        """
        self.file_path = file_path

    def contacts_list(self, file_path=None):
        """Return every ``<contact>`` element of the file with its children.

        Args:
            file_path: Optional path of the XML file. When omitted, the path
                given to the constructor is used.

        Returns:
            A list of ``<contact>`` elements in document order.
        """
        source = self.file_path if file_path is None else file_path
        # An element is complete (children and text included) at its 'end'
        # event, so no other event type is needed.
        # The collected elements are deliberately NOT cleared:
        # Element.clear() would remove the children and text of the very
        # objects handed back to the caller.
        return [
            elem
            for _event, elem in ET.iterparse(source, events=('end',))
            if elem.tag == 'contact'
        ]
class Contact(Phonebook):
    """Look up child tags inside ``<contact>`` elements of a phone book."""

    # Child tags a complete <contact> record is expected to carry.
    CONTACT_TEMPLATE = (
        'category',
        'person',
        'telephony',
        'services',
        'setup',
        'features',
        'mod_time',
        'uniqueid',
    )

    def __init__(self, file_path):
        """Initialise the underlying ``Phonebook`` with ``file_path``."""
        super().__init__(file_path)

    def search_node(self, contact, searched_tag):
        """Collect the text of the direct children named ``searched_tag``.

        Args:
            contact: Iterable of child elements, normally a ``<contact>``
                element.
            searched_tag: Tag name to look for among the direct children.

        Returns:
            A list with the text of every matching child. If there is no
            match and ``searched_tag`` is one of ``CONTACT_TEMPLATE``, a
            one-item list holding ``'<searched_tag> - not assigned'``.
            Otherwise an empty list.
        """
        values = [node.text for node in contact if node.tag == searched_tag]
        if values:
            return values
        if searched_tag in self.CONTACT_TEMPLATE:
            # Report the tag that was actually asked for, not whichever
            # template tag happens to be missing first.
            return [self.missed_tag([searched_tag])]
        return []

    def missed_tag(self, list_difference):
        """Build the "not assigned" message for the first tag in the list.

        Args:
            list_difference: Tag names that are missing from a contact.

        Returns:
            ``'<tag> - not assigned'`` for the first entry, or ``None`` when
            the list is empty.
        """
        # NOTE(review): the original indentation was lost; returning on the
        # first entry is assumed here -- confirm against the author's intent.
        for tag in list_difference:
            return f'{tag} - not assigned'
        return None
def main():
    """Print how many contacts were read and the ``mod_time`` lookup of each."""
    source = 'dummy.xml'
    con = Contact(source)
    mod_time_list = [
        con.search_node(entry, 'mod_time')
        for entry in con.contacts_list(source)
    ]
    print(len(mod_time_list))
    print(mod_time_list)
if __name__ == '__main__':
    # Time the whole run, including the memory report below.
    starttime = timeit.default_timer()
    main()
    print('Finished')
    # Take a single snapshot so both figures describe the same moment, and
    # read the named fields instead of positional indices 2 and 3.
    memory = psutil.virtual_memory()
    print('RAM memory % used:', memory.percent)
    # Bytes converted to decimal gigabytes.
    print('RAM Used (GB):', memory.used / 1000000000)
    print("Runtime:", timeit.default_timer() - starttime)