Python xml to dataframe include text after br tag-CodePudding

I am trying to transform an XML file into a dataframe. The original XML file is: https://www.assemblee-nationale.fr/dyn/opendata/CRSANR5L15S2017E1N001.xml . Here is an example:

<?xml version='1.0' encoding='UTF-8'?>
<compteRendu xmlns="http://schemas.assemblee-nationale.fr/referentiel">
  <contenu>
    <point nivpoint="1" valeur_ptsodj="2" ordinal_prise="1" id_preparation="819547" ordre_absolu_seance="8" code_grammaire="TITRE_TEXTE_DISCUSSION" code_style="Titre" code_parole="" sommaire="1" id_syceron="981344" valeur="">
      <orateurs/>
      <texte>Déclaration de...</texte>
      <paragraphe valeur_ptsodj="2" ordinal_prise="1" id_preparation="819550" ordre_absolu_seance="11" id_acteur="PA345619" id_mandat="-1" id_nomination_oe="PM725692" id_nomination_op="-1" code_grammaire="DEBAT_1_10" code_style="NORMAL" code_parole="PAROLE_1_2" sommaire="1" id_syceron="981347" valeur="">
        <orateurs>
          <orateur>
            <nom>M. President</nom>
          </orateur>
        </orateurs>
        <texte>Today we are...
               <exposant>er</exposant>
               Prime-minister will 
               <br/>
               speak.
        </texte>
      </paragraphe>
    </point>
  </contenu>
</compteRendu>

My code:

import pandas as pd
import xml.etree.ElementTree as et

# Parse the XML document and grab its root element.
# The module is imported as `et`, so it must be referenced by that name.
tree = et.parse('file.xml')
root = tree.getroot()
# Map each section under the root to the descendant tags whose text is wanted.
d = {'contenu': ['nom', 'texte']}
cols, data = [], []

for k, v in d.items():
    # `{*}` is the namespace wildcard, so the section is looked up by local name.
    child = root.find(f'{{*}}{k}')
    # Walk the section itself and every descendant (`elem`).
    for elem in child.iter():
        # Drop the `{namespace}` prefix to get the bare tag name.
        tag_end = elem.tag.split('}')[-1]
        if tag_end in v:
            # NOTE: `elem.text` is only the text that precedes the first child
            # element; text following a child lives in that child's `.tail`.
            cols.append(tag_end)
            data.append(elem.text)
# The list becomes one column; transposing gives one row with one column per item.
df = pd.DataFrame(data).T

# Obtain columns names
def f(lst):
    """Return the names in *lst* with repeated names made unique.

    The first occurrence of a name is kept unchanged; each later occurrence
    gets a numeric suffix starting at 2, e.g. ``['a', 'a', 'a']`` becomes
    ``['a', 'a2', 'a3']``. Order is preserved.
    """
    d = {}  # name -> suffix to use for its next repetition
    out = []
    for i in lst:
        if i not in d:
            out.append(i)
            d[i] = 2
        else:
            out.append(i + str(d[i]))
            d[i] += 1
    return out
# Label the columns with the de-duplicated tag names (texte, nom, texte2, ...).
df.columns = f(cols)
df = df.rename(columns={"nom": "nom1"})
# Shift the numbered texte columns down by one (texte2 -> texte1, ...) so the
# numbering is contiguous once the bare "texte" column becomes "texte0".
df.rename(columns={"texte" + str(i): "texte" + str(i - 1) for i in range(2, 10000)}, inplace=True)
df = df.rename(columns={"texte": "texte0"})
# Drop every "nom*" column that contains no value at all.
df.drop([col for col in df.columns if col.startswith("nom") and df[col].isnull().all()], axis=1, inplace=True)

What I am obtaining:

texte0            nom1          texte1
Déclaration de... M. President Today we are...\n

The last column (texte1) is missing the text "Prime-minister will speak." Because of the <br/> and <exposant> tags, only the text before the first child tag shows up. How should I change my code?

(In the end, I will transform my dataframe from wide to long, so that I have one column with 'nom' and another with 'texte', the person and his respective text.)

CodePudding user response：

You can use a recursive function to get text of all elements with the required tags and tail of their children:

import pandas as pd
import xml.etree.ElementTree as ET

# Parse the XML file and take its root element as the starting point.
tree = ET.parse('file.xml')
root = tree.getroot()
# Names of the elements whose text should be extracted.
tags = ['nom','texte']

def get_content_recursively(element, tags, get_tail=False):
    """Collect the text fragments of tagged elements, in document order.

    Args:
        element: The ``xml.etree.ElementTree.Element`` to start from.
        tags: Local tag names (without the ``{namespace}`` prefix) to extract.
        get_tail: Whether ``element.tail`` should be collected as well. The
            recursion sets this for the direct children of a tagged element,
            because a child's tail is text that belongs to its parent.

    Returns:
        A list of stripped, non-empty strings: the leading text of every
        tagged element and the tails of its direct children.
    """
    data = []
    # Drop the `{namespace}` prefix, if any, to get the bare tag name.
    _, _, tag = element.tag.rpartition('}')
    if tag in tags and element.text and element.text.strip():
        data.append(element.text.strip())

    for el in element:
        # Extend (not overwrite) so fragments from every child are kept.
        data += get_content_recursively(el, tags, get_tail=(tag in tags))

    if get_tail and element.tail and element.tail.strip():
        data.append(element.tail.strip())
    return data

# The list of strings becomes one column; .T turns it into a single row.
df = pd.DataFrame(get_content_recursively(root, tags)).T

Output:

                   0             1                2                    3       4
0  Déclaration de...  M. President  Today we are...  Prime-minister will  speak.

Note: the strip() in data.append(element.text.strip()) removes leading and trailing whitespace (including newlines) from each string. Remove strip() to keep it.

Edit: if you want to join all strings of an element, you can handle its text element and tail of its children in a loop:

import pandas as pd
import xml.etree.ElementTree as ET

# Load the XML document; the extraction starts from its root element.
tree = ET.parse('file.xml')
root = tree.getroot()
# Element names whose text content is wanted.
tags = ['nom','texte']

def get_content_recursively(element, tags):
    """Collect one joined string per tagged element, in document order.

    Args:
        element: The ``xml.etree.ElementTree.Element`` to start from.
        tags: Local tag names (without the ``{namespace}`` prefix) to extract.

    Returns:
        A list with one entry per tagged element: its leading text and the
        tails of its direct children, each stripped and joined with single
        spaces. A tagged element without any such text contributes ``""``.
    """
    data = []
    # Drop the `{namespace}` prefix, if any, to get the bare tag name.
    _, _, tag = element.tag.rpartition('}')
    if tag in tags:
        tag_str_lst = []
        if element.text and element.text.strip():
            tag_str_lst.append(element.text.strip())
        # Text that follows a child element is stored in that child's tail.
        for el in element:
            if el.tail and el.tail.strip():
                tag_str_lst.append(el.tail.strip())
        data.append(" ".join(tag_str_lst))

    for el in element:
        # Extend (not overwrite) so results from every child are kept.
        data += get_content_recursively(el, tags)

    return data

# Turn the list into a one-column frame, then transpose it into a single row.
df = pd.DataFrame(get_content_recursively(root, tags)).T

Output:

                   0             1                                           2
0  Déclaration de...  M. President  Today we are... Prime-minister will speak.