read xml file, convert it to table (dataframe)-CodePudding

this the first time I am dealing with xml file, so I am very lost. I would appreciate any help. All I want is to read the file and convert it to regular table (dataframe).

I have file with this structure:

<?xml version="1.0"  encoding='UTF-8'?>
<LucroCliente xmlns:my='http://www.ms.com/pace' xmlns='http://www.ms.com/pace' Cab_Usuario='UsuI' Cab_DadosEmpresa='' Cab_RazaoEmpresa='CoLtda.' Cab_Aplicativo='Comercial' Cab_Data='25/07/2022 14:40:38' Cab_Titulo='Relatório Cab_Titulo' Selecao='Selecao' Periodo='Período: 01/01/2020 - 31/12/2020'>
<Filial Filial=''>
<Linha TotalLinha='TOA 21:  2.313.292,43'>
<Produto Coluna1='21-851611' Coluna2='CAMIO VO' Coluna3='' Coluna4='' Coluna5=''>
<AnaliseDiaria Coluna6='' Coluna7='' Coluna8='' Coluna9='' Coluna10='' Coluna11='' Coluna12='' Coluna13='' Coluna14='' Coluna15='' Coluna16=''/>
</Produto>
<Produto Coluna1='21-3667984' Coluna2='SCA4X2' Coluna3='-1' Coluna4='' Coluna5=''>
<AnaliseDiaria Coluna6='' Coluna7='' Coluna8='' Coluna9='' Coluna10='' Coluna11='' Coluna12='' Coluna13='' Coluna14='' Coluna15='' Coluna16=''/>
</Produto>
<Produto Coluna1='21-3667994' Coluna2='SCA963' Coluna3='-1' Coluna4='' Coluna5=''>
<AnaliseDiaria Coluna6='' Coluna7='' Coluna8='' Coluna9='' Coluna10='' Coluna11='' Coluna12='' Coluna13='' Coluna14='' Coluna15='' Coluna16=''/>
</Produto>
<Produto Coluna1='21-3676543' Coluna2='SCA713' Coluna3='-1' Coluna4='' Coluna5=''>
<AnaliseDiaria Coluna6='' Coluna7='' Coluna8='' Coluna9='' Coluna10='' Coluna11='' Coluna12='' Coluna13='' Coluna14='' Coluna15='' Coluna16=''/>
</Produto>
<Produto Coluna1='21-3676601' Coluna2='SCA97' Coluna3='-1' Coluna4='' Coluna5=''>
<AnaliseDiaria Coluna6='' Coluna7='' Coluna8='' Coluna9='' Coluna10='' Coluna11='' Coluna12='' Coluna13='' Coluna14='' Coluna15='' Coluna16=''/>
</Produto>
<Produto Coluna1='21-3814014' Coluna2='CAMIX2' Coluna3='' Coluna4='' Coluna5=''>
<AnaliseDiaria Coluna6='' Coluna7='' Coluna8='' Coluna9='' Coluna10='' Coluna11='' Coluna12='' Coluna13='' Coluna14='' Coluna15='' Coluna16=''/>
</Produto>
<Produto Coluna1='21-3814087' Coluna2='SCA56' Coluna3='' Coluna4='' Coluna5=''>
<AnaliseDiaria Coluna6='' Coluna7='' Coluna8='' Coluna9='' Coluna10='' Coluna11='' Coluna12='' Coluna13='' Coluna14='' Coluna15='' Coluna16=''/>
<AnaliseDiaria Coluna6='19/06/20' Coluna7='01' Coluna8='EP 202022777' Coluna9='1 UN' Coluna10='195.000,00' Coluna11='195.000,00' Coluna12='1' Coluna13='195.000,00' Coluna14='195.000,00' Coluna15='NF9' Coluna16='10203910A'/>
<AnaliseDiaria Coluna6='13/07/20' Coluna7='01' Coluna8='RCP G 41765' Coluna9='0 UN' Coluna10='' Coluna11='90,00' Coluna12='1' Coluna13='195.090,00' Coluna14='195.090,00' Coluna15='' Coluna16=''/>
<AnaliseDiaria Coluna6='27/07/20' Coluna7='01' Coluna8='RCP G 41767' Coluna9='0 UN' Coluna10='' Coluna11='180,00' Coluna12='1' Coluna13='195.270,00' Coluna14='195.270,00' Coluna15='' Coluna16=''/>
<AnaliseDiaria Coluna6='27/07/20' Coluna7='01' Coluna8='RCP G 41768' Coluna9='0 UN' Coluna10='' Coluna11='212,60' Coluna12='1' Coluna13='195.482,60' Coluna14='195.482,60' Coluna15='' Coluna16=''/>
<AnaliseDiaria Coluna6='27/07/20' Coluna7='01' Coluna8='RCP G 41770' Coluna9='0 UN' Coluna10='' Coluna11='145,20' Coluna12='1' Coluna13='195.627,80' Coluna14='195.627,80' Coluna15='' Coluna16=''/>
<AnaliseDiaria Coluna6='27/07/20' Coluna7='01' Coluna8='RCP G 41771' Coluna9='0 UN' Coluna10='' Coluna11='8.902,02' Coluna12='1' Coluna13='204.529,82' Coluna14='204.529,82' Coluna15='' Coluna16=''/>
<AnaliseDiaria Coluna6='27/07/20' Coluna7='01' Coluna8='VP 323755' Coluna9='-1 UN' Coluna10='204.529,82' Coluna11='-204.529,82' Coluna12='0' Coluna13='' Coluna14='' Coluna15='' Coluna16='158PES'/>
</Produto>
</Linha>
</Filial>
</LucroCliente>

I tried multiple solutions I found here but nothing worked out, for example: first solution:

xml_data = open('file.xml', 'r').read()
root = et.XML(xml_data)  # Parse XML

data = []
cols = []
for i, child in enumerate(root):
    data.append([subchild.text for subchild in child])
    cols.append(child.tag)
df = pd.DataFrame(data).T
df.columns = cols

second solution:

xml_data = objectify.parse('file.xml') 
root = xml_data.getroot()  

data = []
cols = []
for i in range(len(root.getchildren())):
    child = root.getchildren()[i]
    data.append([subchild.text for subchild in child.getchildren()])
    cols.append(child.tag)
df = pd.DataFrame(data).T
df.columns = cols

My end table will look like below:

| Coluna1   | Coluna2  | Coluna3 | Coluna4 | coluna2 | couna6 | coluna7 | coluna8 | coluna9 | colun10 | coluna11 | coluna12 | coluna13 | coluna14 | coluna15 | coluna16 |
| --------- | -------- | ------- | ------- | ------- | ------ | ------- | ------- | ------- | ------- | -------- | -------- | -------- | -------- | -------- | -------- |
| 21-851611 | CAMIO VO |         |         |         |        |         |         |         |         |          |          |          |          |          |          |
|           |          |         |         |         |        |         |         |         |         |          |          |          |          |          |          |
|           |          |         |         |         |        |         |         |         |         |          |          |          |          |          |          |
|           |          |         |         |         |        |         |         |         |         |          |          |          |          |          |          |

CodePudding user response：

import xml.etree.ElementTree as ET
import csv
import numpy as np
from tqdm import tqdm
path = 'Filename.xml'

from lxml import etree

all_columns = {
# column names for the dataframe
}

context = etree.iterparse(path,events=('end',), tag='row')

def find_missing_keys(input_keys, target_keys):
    return list(set(target_keys) - set(input_keys))

with open('Filename.csv', 'w', encoding="utf-8") as csvFile:
    writer = csv.DictWriter(csvFile, fieldnames=list(all_columns))
    writer.writeheader()

    for i, ret in tqdm(enumerate(context)):
        event, element = ret
        row = dict(element.attrib)
        missing_keys = find_missing_keys(list(row.keys()), list(all_columns))
        for each_missing_key in missing_keys:
            row[each_missing_key] = np.nan
        writer.writerow(row)
        element.clear()
        while element.getprevious() is not None:
            del element.getparent()[0]

Hope this helps!

CodePudding user response：

Fortunately, in the case of your xml in the question, you can use the pandas read_xml() method, although you'll have to skirt around the namespaces issue:

import pandas as pd
pd.read_xml(file.xml,xpath='//*[local-name()="Linha"]//*[local-name()="Produto"]')

Output:

    Coluna1        Coluna2    Coluna3     Coluna4   Coluna5     {http://www.ms.com/pace}AnaliseDiaria
0   21-851611   CAMIO VO    NaN     NaN     NaN     NaN
1   21-3667984  SCA4X2  -1.0    NaN     NaN     NaN
2   21-3667994  SCA963  -1.0    NaN     NaN     NaN

etc. If you are not interested in one column or anothter, you can simply drop() it.

CodePudding user response：

Given the two levels of nodes that cover the Coluna attributes, consider XSLT, the special-purpose language designed to transform or style original XML files. Python's lxml can run XSLT 1.0 scripts and being the default parse to pandas.read_xml can transform your raw XML into a flatter version to parse to DataFrame.

XSLT (save as .xsl file, a special .xml file)

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                              xmlns:pace='http://www.ms.com/pace'>
    <xsl:output method="xml" omit-xml-declaration="no" indent="yes"/>
    <xsl:strip-space elements="*"/>

    <!-- REDESIGN XML TO ONLY RETURN AnaliseDiaria NODES -->
    <xsl:template match="/*">
     <xsl:copy>
       <xsl:apply-templates select="descendant::pace:AnaliseDiaria"/>
     </xsl:copy>
    </xsl:template>
    
    <!-- REDESIGN AnaliseDiaria NODES -->
    <xsl:template match="pace:AnaliseDiaria">
     <xsl:copy>
       <!-- BRING DOWN Produto ATTRIBUTES WITH CURRENT ATTRIBUTES -->
       <xsl:copy-of select="ancestor::pace:Produto/@*|@*"/>
     </xsl:copy>
    </xsl:template>
    
</xsl:stylesheet>

Online Demo

Python

analise_diaria_df = pd.read("input.xml", stylesheet="style.xsl")

analise_diaria_df 
#        Coluna1   Coluna2  Coluna3  ...    Coluna14  Coluna15   Coluna16
# 0    21-851611  CAMIO VO      NaN  ...         NaN       NaN        NaN
# 1   21-3667984    SCA4X2     -1.0  ...         NaN       NaN        NaN
# 2   21-3667994    SCA963     -1.0  ...         NaN       NaN        NaN
# 3   21-3676543    SCA713     -1.0  ...         NaN       NaN        NaN
# 4   21-3676601     SCA97     -1.0  ...         NaN       NaN        NaN
# 5   21-3814014    CAMIX2      NaN  ...         NaN       NaN        NaN
# 6   21-3814087     SCA56      NaN  ...         NaN       NaN        NaN
# 7   21-3814087     SCA56      NaN  ...  195.000,00       NF9  10203910A
# 8   21-3814087     SCA56      NaN  ...  195.090,00       NaN        NaN
# 9   21-3814087     SCA56      NaN  ...  195.270,00       NaN        NaN
# 10  21-3814087     SCA56      NaN  ...  195.482,60       NaN        NaN
# 11  21-3814087     SCA56      NaN  ...  195.627,80       NaN        NaN
# 12  21-3814087     SCA56      NaN  ...  204.529,82       NaN        NaN
# 13  21-3814087     SCA56      NaN  ...         NaN       NaN     158PES