I've written a small python app to print some XML tags and select child attributes. The XML are for electronic invoicing here in Mexico, here is an example of the XML:
<?xml version="1.0" encoding="UTF-8"?><cfdi:Comprobante xmlns:cfdi="http://www.sat.gob.mx/cfd/4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Version="4.0" xsi:schemaLocation="http://www.sat.gob.mx/cfd/4 http://www.sat.gob.mx/sitio_internet/cfd/4/cfdv40.xsd" Serie="V" Folio="10030062" Fecha="2022-11-09T18:55:51" Sello="kWjohv/nlmGBVrIUBxeULiiF2HiUGxAsDC4FTirGnF8GMD7tTVDwpzDOVcyJJupQYJKj/xRPIz46i1RjZYX2jIskXxJwb5QkWfSUC6rO3TdHr4nqJQnLCD2cdp66u2/v 8uYJv as7uXvuGv1JwQ67Mg037b0IPTjHPKaZvRwIBQCrLukLB4bOX8yuBGWWqrAqJPR/eS/wRt3QedyBhUIbUsebRgtirOQ0ywarSPUJ9Dll0KmaWq3rrHN jkoAUZSgy mJoR2WldeIbuiHXml/QXezl4o34ICK32gYyzvzrpLTslxPYTKcoKzDvGo2jK5/T7NctbNrrH29i515lugg==" FormaPago="04" NoCertificado="00001000000503805521" Certificado="MIIGITCCBAmgAwIBAgIUMDAwMDEwMDAwMDA1MDM4MDU1MjEwDQYJKoZIhvcNAQELBQAwggGEMSAwHgYDVQQDDBdBVVRPUklEQUQgQ0VSVElGSUNBRE9SQTEuMCwGA1UECgwlU0VSVklDSU8gREUgQURNSU5JU1RSQUNJT04gVFJJQlVUQVJJQTEaMBgGA1UECwwRU0FULUlFUyBBdXRob3JpdHkxKjAoBgkqhkiG9w0BCQEWG2NvbnRhY3RvLnRlY25pY29Ac2F0LmdvYi5teDEmMCQGA1UECQwdQVYuIEhJREFMR08gNzcsIENPTC4gR1VFUlJFUk8xDjAMBgNVBBEMBTA2MzAwMQswCQYDVQQGEwJNWDEZMBcGA1UECAwQQ0lVREFEIERFIE1FWElDTzETMBEGA1UEBwwKQ1VBVUhURU1PQzEVMBMGA1UELRMMU0FUOTcwNzAxTk4zMVwwWgYJKoZIhvcNAQkCE01yZXNwb25zYWJsZTogQURNSU5JU1RSQUNJT04gQ0VOVFJBTCBERSBTRVJWSUNJT1MgVFJJQlVUQVJJT1MgQUwgQ09OVFJJQlVZRU5URTAeFw0yMDA0MTYyMDE1MTdaFw0yNDA0MTYyMDE1MTdaMIHvMTAwLgYDVQQDEydQUkVNSVVNIFJFU1RBVVJBTlQgQlJBTkRTIFMgREUgUkwgREUgQ1YxMDAuBgNVBCkTJ1BSRU1JVU0gUkVTVEFVUkFOVCBCUkFORFMgUyBERSBSTCBERSBDVjEwMC4GA1UEChMnUFJFTUlVTSBSRVNUQVVSQU5UIEJSQU5EUyBTIERFIFJMIERFIENWMSUwIwYDVQQtExxQUkIxMDA4MDJIMjAgLyBSQVpFNjUwNTAzVUY4MR4wHAYDVQQFExUgLyBSQVpFNjUwNTAzSE5FTUNMMDcxEDAOBgNVBAsUB1BSQl9GQUMwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCacpMWcQqSuS0mc8CfDLBvhLqPL5LyxYcEi/TYqHpje3DeVkkB6uYB19 3MO3oTnGnZgt7Jhs6/eM1 3ch/4EnAxUvVbBAHaXUUmRHTXGwBgqRMHgYYQ/DwsKHjL2fQoCodxSsJCKSg93GO4JXXHIFITALb9aOmPLd/hRc4krOqZT2egVL/HrIY 4Y2L9y9HEH B8HUC5tbmsal5V9XNQs86nSg8Zc8IPUNMhWRQtKwdIwDwCTccYTTiBK7O2ykiba6/Ef3ORb1bDHv8YSzfjnNpD/yhXn3PyCKR9KjXp1dxGyFsEbqZH5SwUp5/aDDXetI1dal7GYSxqYA54BRKQFAgMBAAGjHTAbMAwGA1UdEwEB/wQCMAAwCwYDVR0PBAQDAgbAMA0GCSqGSIb3DQEBCwUAA4ICAQAU8WJ25ANnPSd09lBj1XsKcDlREx1zr3Tlw9UrFIZZJdsd2f0BeJFtolsWO3afHiVcpk5IfUshjI9fe/uzm8AbbMPpaoBhywoHTBJiG4bGkwQpVEddjufDKKxkuao NALpwhfFc8kNJTmG0FuOYEVU7pKh/gz2kZOhcKViXGt3OQYLlUZ6 PP99Z2AePkz2x6gtC A20oxfDLkPXqtEez2mby//bUSgtGsFWTIkrtC7Zro47zNOCYDngKWoke4T91o8xTtcABoeRlZTDovLCFsVm0zg5Cd22PWFkfIvlVZyIRSlJcrq2P3fo0fzeQ rG CpntIfOrYZr5eQHOOLUMPavazsTvFDJQpnCbZnNIxnaMKAPmXbgJHyMx24tARd0rGEuM/KLn/ZW2TCUAD5mofsT6 Z/EMsAZ68Tv4ZbwcPlWDbuJEUTsK/z2angnO55xA NdPz MltizMUcKXjzzvUanOAXQNIHD2wEbyXHpD3Ytb6BU6OOAx7HNiBnokxkyr7riD/slEL/di09S3Po3Q5X0z4ygUh2lHyxJDDJtNYiYLsscbliVVk0BtPAuTidOlLutw9N19zSE4AZgzIhwIF7oiJlM4EytSIZsM6GUWniN4 tWRDoV sEgpKnblH4ms3OHB3ZE5LsgHAjcfFyToVaA3GpzLJSkQawmhc ylw==" SubTotal="145.69" Moneda="MXN" Total="169.00" TipoDeComprobante="I" Exportacion="01" MetodoPago="PUE" LugarExpedicion="11520"><cfdi:Emisor Rfc="PRB100802H20" Nombre="PREMIUM RESTAURANT BRANDS" RegimenFiscal="601"/><cfdi:Receptor Rfc="IST190806QJ7" Nombre="INDRA SISTEMAS TRANSPORTE Y DEFENSA" DomicilioFiscalReceptor="11520" RegimenFiscalReceptor="601" UsoCFDI="G03"/><cfdi:Conceptos><cfdi:Concepto ClaveProdServ="90101503" NoIdentificacion="0385101372231252" ObjetoImp="02" Cantidad="1" ClaveUnidad="XPK" Unidad="Paquete" Descripcion="PQT. DE ALIMENTOS (CONSUMO: 2022-11-08) FOLIO(0385101372231252)" ValorUnitario="145.69" Importe="145.69"><cfdi:Impuestos><cfdi:Traslados><cfdi:Traslado Base="145.69" Impuesto="002" TipoFactor="Tasa" TasaOCuota="0.160000" Importe="23.31"/></cfdi:Traslados></cfdi:Impuestos></cfdi:Concepto></cfdi:Conceptos><cfdi:Impuestos TotalImpuestosTrasladados="23.31"><cfdi:Traslados><cfdi:Traslado Base="145.69" Impuesto="002" TipoFactor="Tasa" TasaOCuota="0.160000" Importe="23.31"/></cfdi:Traslados></cfdi:Impuestos><cfdi:Complemento><tfd:TimbreFiscalDigital xmlns:tfd="http://www.sat.gob.mx/TimbreFiscalDigital" FechaTimbrado="2022-11-09T19:05:56" UUID="67B2DDD8-ABCF-4CD1-B435-C228742542B6" NoCertificadoSAT="00001000000503270882" SelloCFD="kWjohv/nlmGBVrIUBxeULiiF2HiUGxAsDC4FTirGnF8GMD7tTVDwpzDOVcyJJupQYJKj/xRPIz46i1RjZYX2jIskXxJwb5QkWfSUC6rO3TdHr4nqJQnLCD2cdp66u2/v 8uYJv as7uXvuGv1JwQ67Mg037b0IPTjHPKaZvRwIBQCrLukLB4bOX8yuBGWWqrAqJPR/eS/wRt3QedyBhUIbUsebRgtirOQ0ywarSPUJ9Dll0KmaWq3rrHN jkoAUZSgy mJoR2WldeIbuiHXml/QXezl4o34ICK32gYyzvzrpLTslxPYTKcoKzDvGo2jK5/T7NctbNrrH29i515lugg==" SelloSAT="LBOVbhfMGU8T2Tsrz6fFTLkCz90Z0sZIkJqLquayWD5GIhdw6UDvp2Lo5r40jjGC1WwvHMsimi6Ho5xMH70nHH9gkeUIRK3BdsPcUjwFSnYzL1TwG70ZGf7hFBh8uflI1jKzLPRFvWhfHyw1Wznof9NtlXCvSRYhmlcxM6/kj/gOOG0hrq DJaEsTNJgD7XQzUlMJ9/Casc2kgvOYAdwpXdmkNEtEe9oqQiti4VbPXxEUKpE66hik/Rg4txFMCTPlAMpiz3XfDig/gp6lrFnb/TYkSFr3E9/oPJxoig4xTwPuCZ9uOxfExpxtI3ASXpCoh4isWqqlgxc7abxIoA5Tw==" Version="1.1" RfcProvCertif="TLE011122SC2" xsi:schemaLocation="http://www.sat.gob.mx/TimbreFiscalDigital http://www.sat.gob.mx/sitio_internet/cfd/TimbreFiscalDigital/TimbreFiscalDigitalv11.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"/></cfdi:Complemento><cfdi:Addenda><Referencia xmlns="https://facturacion.prb.com.mx/XSD/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="https://facturacion.prb.com.mx/XSD/ https://facturacion.prb.com.mx/XSD/prb_addenda.xsd"
ticket="0385101372231252"/></cfdi:Addenda></cfdi:Comprobante>
Here is the code I've written:
while True:
import xml.dom.minidom
import csv
from tkinter import *
from tkinter import filedialog
root = Tk()
root.filename = filedialog.askopenfilename(title = "Select file", filetypes =[('XML Files', '*.xml')])
print (root.filename)
root.mainloop()
print("------------------------------------------------------------------------------")
def main():
# use the parse() function to load and parse an XML file
doc = xml.dom.minidom.parse(root.filename);
# print out the document node and the name of the first child tag
print (doc.nodeName)
print (doc.firstChild.tagName)
# get a list of XML tags from the document and print each one
cfd = doc.getElementsByTagName("cfdi:Comprobante")
print ("%d Monto:" % cfd.length)
for skill in cfd:
print (skill.getAttribute("Total"))
cfd = doc.getElementsByTagName("cfdi:Comprobante")
print ("%d Fecha:" % cfd.length)
for skill in cfd:
print (skill.getAttribute("Fecha"))
cfd = doc.getElementsByTagName("cfdi:Concepto")
print ("%d Descripción:" % cfd.length)
for skill in cfd:
print (skill.getAttribute("Descripcion"))
cfd = doc.getElementsByTagName("cfdi:Emisor")
print ("%d RFC_Emisor:" % cfd.length)
for skill in cfd:
print (skill.getAttribute("Rfc"))
cfd = doc.getElementsByTagName("cfdi:Emisor")
print ("%d Emisor:" % cfd.length)
for skill in cfd:
print (skill.getAttribute("Nombre"))
cfd = doc.getElementsByTagName("tfd:TimbreFiscalDigital")
print ("%d UUID:" % cfd.length)
for skill in cfd:
print (skill.getAttribute("UUID"))
cfd = doc.getElementsByTagName("cfdi:Receptor")
print ("%d RFC_Receptor:" % cfd.length)
for skill in cfd:
print (skill.getAttribute("Rfc"))
cfd = doc.getElementsByTagName("cfdi:Receptor")
print ("%d Receptor:" % cfd.length)
for skill in cfd:
print (skill.getAttribute("Nombre"))
print("------------------------------------------------------------------------------")
if __name__ == "__main__":
main();
try_again = int(input("Press 1 to try again, 0 to exit."))
if try_again == 0:
break # break out of the outer while loop
Now I'm trying to update it to write all to a CSV file, I've tried with the following code
import xml.etree.ElementTree as Xet
import pandas as pd
import xml.dom.minidom
import csv
from tkinter import *
from tkinter import filedialog
root1 = Tk()
root1.filename = filedialog.askopenfilename(title = "Select file", filetypes =[('XML Files', '*.xml')])
print (root1.filename)
cols = ["Monto","Fecha","Descripcion","RFC_Emisor","Emisor","UUID","RFC_Receptor","Receptor"]
rows = []
# use the parse() function to load and parse an XML file
doc = xml.dom.minidom.parse(root1.filename);
Total = doc.getElementsByTagName("cfdi:Comprobante")
for skill in Total:
print (skill.getAttribute("Total"))
Date = doc.getElementsByTagName("cfdi:Comprobante")
for skill in Date:
print (skill.getAttribute("Fecha"))
Desc = doc.getElementsByTagName("cfdi:Concepto")
for skill in Desc:
print (skill.getAttribute("Descripcion"))
RFC1 = doc.getElementsByTagName("cfdi:Emisor")
for skill in RFC1:
print (skill.getAttribute("Rfc"))
Name = doc.getElementsByTagName("cfdi:Emisor")
for skill in Name:
print (skill.getAttribute("Nombre"))
UUI = doc.getElementsByTagName("tfd:TimbreFiscalDigital")
for skill in UUI:
print (skill.getAttribute("UUID"))
RFC2 = doc.getElementsByTagName("cfdi:Receptor")
for skill in RFC2:
print (skill.getAttribute("Rfc"))
Name2 = doc.getElementsByTagName("cfdi:Receptor")
for skill in Name2:
print (skill.getAttribute("Nombre"))
#Parsing the XML file
xmlparse = Xet.parse(root1.filename)
root = xmlparse.getroot()
for i in root:
Monto = Total
Fecha = Date
Descripcion = Desc
RFC_Emisor = RFC1
Emisor = Name
UUID = UUI
RFC_Receptor = RFC2
Receptor = Name2
rows.append({"Monto": Monto,
"Fecha": Fecha,
"Descripcion": Descripcion,
"RFC_Emisor": RFC_Emisor,
"Emisor": Emisor,
"UUID": UUID,
"RFC_Receptor": RFC_Receptor,
"Receptor": Receptor})
df= pd.DataFrame(rows, columns=cols)
df.to_csv('output.csv')
But the CSV file only writes this: ,Monto,Fecha,Descripcion,RFC_Emisor,Emisor,UUID,RFC_Receptor,Receptor 0,[<DOM Element: cfdi:Comprobante at 0x2c76ad9a030>],[<DOM Element: cfdi:Comprobante at 0x2c76ad9a030>],[<DOM Element: cfdi:Concepto at 0x2c76adbdef0>],[<DOM Element: cfdi:Emisor at 0x2c76adbdbd0>],[<DOM Element: cfdi:Emisor at 0x2c76adbdbd0>],[<DOM Element: tfd:TimbreFiscalDigital at 0x2c76adbe5d0>],[<DOM Element: cfdi:Receptor at 0x2c76adbdd10>],[<DOM Element: cfdi:Receptor at 0x2c76adbdd10>]
I know I'm parsing the XML twice but cannot figure out how to do it
The expected CSV should look like this:
,Monto,Fecha,Descripcion,RFC_Emisor,Emisor,UUID,RFC_Receptor,Receptor 0,169.00,2022-11-09T18:55:51,PQT. DE ALIMENTOS (CONSUMO: 2022-11-08) FOLIO(0385101372231252),PRB100802H20,PREMIUM RESTAURANT BRANDS,67B2DDD8-ABCF-4CD1-B435-C228742542B6,IST190806QJ7,INDRA SISTEMAS TRANSPORTE Y DEFENSA
EDIT 2022-11-14
I've tried with this code but cannot get the Total or the Fecha value
import pandas as pd
import xml.dom.minidom
import xml.etree.ElementTree as Xet
from tkinter import *
from tkinter import filedialog
from lxml import etree
root1 = Tk()
root1.filename = filedialog.askopenfilename(title="Select file", filetypes=[('XML Files','*.xml')])
print (root1.filename)
cols = ["Monto","Fecha","Descripcion","RFC_Emisor","Emisor","UUID"]
rows = []
row =[]
doc = xml.dom.minidom.parse(root1.filename);
xmlparse = Xet.parse(root1.filename)
root = xmlparse.getroot()
for m in root.findall('.//*[@Total]'):
row.extend(m.attrib.get("Total")) #,m.attrib.get("Fecha")))
for d in root.findall('.//*[@Descripcion]'):
row.append(d.attrib.get('Descripcion'))
for rf in root.findall('.//*[@Rfc]'):
row.extend((rf.attrib.get("Rfc"),rf.attrib.get("Nombre")))
for u in root.findall('.//*[@UUID]'):
row.append(u.attrib.get("UUID"))
rows.append(row)
df= pd.DataFrame(rows,columns=cols)
df.to_csv('output.csv',mode='a', index=False, header=False)
CodePudding user response:
It looks like your code is way more complicated than necessary.
Try it this way:
from lxml import etree
cols = ["Monto","Fecha","Descripcion","RFC_Emisor","Emisor","UUID","RFC_Receptor","Receptor"]
rows = []
row =[]
for t in root.xpath('//*[@Total]'):
row.extend((t.attrib.get("Total"),t.attrib.get("Fecha")))
for d in doc.xpath('//*[@Descripcion]'):
row.append(d.attrib.get('Descripcion'))
for rf in doc.xpath('//*[@Rfc]'):
row.extend((rf.attrib.get("Rfc"),rf.attrib.get("Nombre")))
for u in doc.xpath('//*[@UUID]'):
row.append(u.attrib.get("UUID"))
rows.append(row)
pd.DataFrame(rows,columns=cols)
Output (based on your sample xml):
Monto Fecha Descripcion RFC_Emisor Emisor UUID RFC_Receptor Receptor
0 169.00 2022-11-09T18:55:51 PQT. DE ALIMENTOS (CONSUMO: 2022-11-08) FOLIO(... PRB100802H20 PREMIUM RESTAURANT BRANDS IST190806QJ7 INDRA SISTEMAS TRANSPORTE Y DEFENSA 67B2DDD8-ABCF-4CD1-B435-C228742542B6
You'll likely have to modify this to fit your actual xml.