I'm trying to parse an XML file (6 GB) using Python and save the elements in a list/table. I want to save into a table (ideally a DataFrame) the concatenation of the first name, middle name and surname, so that I have a full name whenever it is available, together with the active status.
<Person id="44855" action="chg" date="26-Aug-2022">
<Gender>Male</Gender>
<ActiveStatus>Active</ActiveStatus>
<Deceased>No</Deceased>
<NameDetails>
<Name NameType="Primary Name">
<NameValue>
<FirstName>*****</FirstName>
<Surname>******</Surname>
</NameValue>
</Name>
<Name NameType="Low Quality AKA">
<NameValue>
<FirstName>****</FirstName>
</NameValue>
</Name>
<Name NameType="Spelling Variation">
<NameValue>
<FirstName>*****</FirstName>
<Surname>****</Surname>
</NameValue>
</Name>
</NameDetails>
</Person>
I've tried to parse it using xml.etree.ElementTree.parse, but I got a memory error; I've also tried pandas read_xml, but got a memory error as well. I don't have permission to install lxml, so I can't use lxml.etree. This is my first time posting a question and I'm not sure how to do it correctly, so feel free to ask me anything; I really need help here.
The code I have so far is this (it was reused from a similar question on this platform):
import xml.sax
from xml.sax.handler import ContentHandler
class XmlHandler(xml.sax.ContentHandler):
    """SAX handler that collects one table row per ``<NameValue>`` of a ``<Person>``.

    The document is processed as a stream, so memory use depends on the
    number of collected rows and not on the size of the XML file.

    After parsing, ``self.data`` is a dict of equally long column lists with
    the keys ``'id'``, ``'NameType'``, ``'FirstName'``, ``'MiddleName'``,
    ``'Surname'``, ``'Full name'`` and ``'ActiveStatus'``.  Missing name
    parts are stored as ``''``; ``'Full name'`` joins the non-empty parts
    with a single space.

    Args:
        name_types: Optional ``NameType`` value (or iterable of values) to
            keep, e.g. ``"Primary Name"``.  ``None`` (default) keeps all.
    """

    # Child elements of <NameValue> that make up the full name, in order.
    _NAME_PARTS = ("FirstName", "MiddleName", "Surname")
    # Elements whose character data is captured.
    _TEXT_TAGS = _NAME_PARTS + ("ActiveStatus",)
    _COLUMNS = ("id", "NameType") + _NAME_PARTS + ("Full name", "ActiveStatus")

    def __init__(self, name_types=None):
        super().__init__()
        if isinstance(name_types, str):
            # A bare string would otherwise be split into single characters.
            name_types = (name_types,)
        self.name_types = None if name_types is None else frozenset(name_types)
        self.data = {column: [] for column in self._COLUMNS}
        self.CurrentData = ""
        self._text = []
        self._in_person = False
        self._reset_person()

    def _reset_person(self):
        """Clear the state that belongs to the <Person> being read."""
        self._person_id = None
        self._active_status = ""
        self._name_type = ""
        self._parts = None  # name parts of the currently open <NameValue>
        self._names = []    # (name_type, parts) pairs seen for this person

    def _flush_person(self):
        """Append one row per collected name of the finished <Person>."""
        columns = self.data
        for name_type, parts in self._names:
            columns["id"].append(self._person_id)
            columns["NameType"].append(name_type)
            for key in self._NAME_PARTS:
                columns[key].append(parts[key])
            columns["Full name"].append(
                " ".join(parts[key] for key in self._NAME_PARTS if parts[key])
            )
            columns["ActiveStatus"].append(self._active_status)

    # Call when an element starts
    def startElement(self, tag, attributes):
        self.CurrentData = tag
        self._text = []
        if tag == "Person":
            self._reset_person()
            self._in_person = True
            self._person_id = attributes.get("id")
        elif tag == "Name":
            self._name_type = attributes.get("NameType", "")
        elif tag == "NameValue":
            self._parts = dict.fromkeys(self._NAME_PARTS, "")

    # Call when an element ends
    def endElement(self, tag):
        text = "".join(self._text).strip()
        self._text = []
        # Reset so that whitespace between elements is not captured.
        self.CurrentData = ""
        if not self._in_person:
            return
        if tag == "ActiveStatus":
            self._active_status = text
        elif tag in self._NAME_PARTS:
            if self._parts is not None:
                self._parts[tag] = text
        elif tag == "NameValue":
            wanted = self.name_types is None or self._name_type in self.name_types
            if self._parts is not None and wanted:
                self._names.append((self._name_type, self._parts))
            self._parts = None
        elif tag == "Name":
            self._name_type = ""
        elif tag == "Person":
            # Rows are written only here so that <ActiveStatus> is picked up
            # regardless of whether it precedes or follows the names.
            self._flush_person()
            self._reset_person()
            self._in_person = False

    # Call when a character is read
    def characters(self, content):
        # The parser may deliver the text of one element in several chunks,
        # so the pieces are accumulated and joined in endElement().
        if self.CurrentData in self._TEXT_TAGS:
            self._text.append(content)
if __name__ == '__main__':
    import sys

    # The XML file to read is given as the single command-line argument.
    if len(sys.argv) != 2:
        sys.exit("usage: python %s <path-to-xml-file>" % sys.argv[0])
    source = sys.argv[1]

    parser = xml.sax.make_parser()
    handler = XmlHandler()
    parser.setContentHandler(handler)
    # parse() accepts a file name, URL, file object or InputSource and reads
    # it incrementally.  It has no ``encoding`` keyword: the encoding is
    # taken from the XML declaration of the document itself.
    parser.parse(source)
    # The collected values are now available in handler.data.
CodePudding user response:
I think that at this size the easiest approach is to write a custom XML processor;
xml.etree seems really inefficient for larger files.
Here is an article that shows how this could work.
CodePudding user response:
Try the following PowerShell script:
# Streams an XML file with XmlReader and prints the type, first name and
# surname of every <Name> found under each <Person> element. Only one
# <Person> subtree is materialised at a time, so large files can be handled.
using assembly System.Xml
using assembly System.Xml.Linq
$Filename = "c:\temp\test.xml"
$reader = [System.Xml.XmlReader]::Create($Filename)
try
{
    while(-not $reader.EOF)
    {
        # XElement.ReadFrom (below) leaves the reader on the node that follows
        # </Person>; only skip ahead when that node is not the next <Person>.
        if($reader.Name -ne "Person")
        {
            # Discard the boolean result so it is not written to the output.
            [void]$reader.ReadToFollowing("Person")
        }
        if(-not $reader.EOF)
        {
            # Load just this <Person> element into memory.
            $xPerson = [System.Xml.Linq.XElement]::ReadFrom($reader)
            $names = $xPerson.Descendants("Name")
            foreach($name in $names)
            {
                $nameType = $name.Attribute("NameType").Value
                $firstName = $name.Descendants("FirstName").Value
                $surName = $name.Descendants("Surname").Value
                Write-Host "type = " $nameType, " first = " $firstName, " sur = "$surName
            }
        }
    }
}
finally
{
    # Release the underlying file handle even if reading fails part-way.
    $reader.Dispose()
}