Home > Net >  parse encode xml to xml by xslt 3.0
Parse encoded XML to XML with XSLT 3.0

Time:08-31

Hello, here is my encoded XML, extracted from an email:

<?xml version="1.0" encoding="utf-8"?>
<message_root>
    <message>
        <to>
            <displayName>abc</displayName>
            <email>abc</email>
            <name>abc</name>
        </to>
        <from>
            <displayName>abc</displayName>
            <email>abc</email>
            <name>abc</name>
        </from>
        <return-path>abc</return-path>
        <date>abc</date>
        <subject>abc</subject>
        <mime-version>1.0</mime-version>
        <message-id>&lt;abc&gt;</message-id>
        <body_html>&lt;html dir="ltr"&gt;
&lt;head&gt;
&lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"&gt;
&lt;style type="text/css" id="owaParaStyle"&gt;&lt;/style&gt;
&lt;/head&gt;
&lt;body fpstyle="1" ocsi="0"&gt;
&lt;div style="direction: ltr;font-family: Tahoma;color: #000000;font-size: 10pt;"&gt;Hello alfjskfslfkjsjsf
&lt;div&gt;Attr A: Hello my name is&lt;/div&gt;
&lt;div&gt;Attr B: ABCXYZ&lt;/div&gt;
&lt;div&gt;Attr C: 5&lt;/div&gt;
&lt;div&gt;Attr D: Mr.ABC&lt;/div&gt;
&lt;div&gt;Thank you so much&lt;/div&gt;
&lt;/div&gt;
&lt;/body&gt;
&lt;/html&gt;
</body_html>
        <body_text />
    </message>
</message_root>

This is the XML I want:

<?xml version="1.0" encoding="utf-8"?>
<message_root>
    <message>
        <to>
            <displayName>abc</displayName>
            <email>abc</email>
            <name>abc</name>
        </to>
        <from>
            <displayName>abc</displayName>
            <email>abc</email>
            <name>abc</name>
        </from>
        <return-path>abc</return-path>
        <date>abc</date>
        <subject>abc</subject>
        <mime-version>1.0</mime-version>
        <message-id>abc</message-id>
        <body_html>
            <AttrA> Hello my name is </AttrA>
            <AttrB> ABCXYZ </AttrB>
            <AttrC> 5 </AttrC>
            <AttrD> Mr.ABC </AttrD>
        </body_html>
        <body_text />
    </message>
</message_root>

I used this XSLT 3.0 stylesheet, calling parse-xml() to decode the body_html part:

<?xml version="1.0" encoding="utf-8"?>
<!--
    Rebuilds each message with a fixed datasource/source pair, copies of selected
    header elements, the parsed body_html content, a messageid and an AttrA element.
-->
<xsl:stylesheet version="3.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" indent="yes" />
    <!-- Re-creates the root element and processes only its message children. -->
    <xsl:template match="message_root">
        <message_root>
            <xsl:apply-templates select="message" />
        </message_root>
    </xsl:template>
    <!-- Builds the output message from the current input message element. -->
    <xsl:template match="message">
        <message>
            <!-- No template in this stylesheet matches body_text, so the built-in rules handle it. -->
            <xsl:apply-templates select="body_text" />
            <datasource>Inbox</datasource>
            <source>Test</source>
            <xsl:copy-of select="subject" />
            <xsl:copy-of select="date" />
            <xsl:copy-of select="from" />
            <xsl:copy-of select="to" />
            <!-- parse-xml() receives the string value of body_html, which must be a well-formed XML document. -->
            <xsl:copy-of select="parse-xml(body_html)" />
            <messageid>
                <!-- Strips '<' and '>' from message-id, then keeps the part before the first '@' (empty when there is no '@'). -->
                <xsl:value-of select="substring-before(translate(translate(message-id,'&lt;',''),'&gt;',''),'@')" />
            </messageid>
            <!-- This path is evaluated against the current message element, not against the document returned by parse-xml() above. -->
            <xsl:variable name="div" select="html/body/div/div" />
            <AttrA>
                <!-- Text after the first ':' of the div whose string value starts with 'Attr A:'. -->
                <xsl:value-of select="substring-after($div[starts-with(., 'Attr A:')], ':')" />
            </AttrA>            
        </message>
    </xsl:template>
    
</xsl:stylesheet>

But AttrA comes back empty. How can I get the XML I want? Thank you so much.

I have to add this part to get enough character aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa

CodePudding user response:

You ask how to parse encoded XML, but your input contains encoded HTML that isn't well-formed XML. It cannot be parsed using the XSLT 3.0 parse-xml() function.

In the absence of an HTML parser (such as https://www.saxonica.com/documentation11/index.html#!functions/saxon/parse-html), you need to resort to a more primitive method:

<xsl:stylesheet version="3.0" 
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>

<!-- Nodes without an explicit template (here: message_root) are shallow-copied.
     Without this declaration the default text-only-copy rules would drop the
     message_root tags from the result. -->
<xsl:mode on-no-match="shallow-copy"/>

<!--
    Rebuilds a message element: copies every child except body_html and message-id,
    then emits a messageid and a new body_html whose children are extracted from the
    escaped HTML by plain string functions (no parsing is attempted).
-->
<xsl:template match="message">
    <xsl:copy>
        <xsl:copy-of select="* except (body_html | message-id)"/>
        <messageid>
            <!-- Text between the first '<' and the following '>' of message-id. -->
            <xsl:value-of select="substring-before(substring-after(message-id, '&lt;'), '&gt;')" />
        </messageid>
        <body_html>
            <AttrA>
                <!-- Text between the literal '<div>Attr A:' and the next '</div>' in the body_html string. -->
                <xsl:value-of select="substring-before(substring-after(body_html, '&lt;div&gt;Attr A:'), '&lt;/div&gt;')" />
            </AttrA>
            <!-- ... repeat the same pattern for Attr B, Attr C and Attr D ... -->
        </body_html>
    </xsl:copy>
</xsl:template>

</xsl:stylesheet>

If you are able to provide an input in which body_html contains a string that is well-formed XML, such as:

<message_root>
    <message>
        <to>
            <displayName>abc</displayName>
            <email>abc</email>
            <name>abc</name>
        </to>
        <from>
            <displayName>abc</displayName>
            <email>abc</email>
            <name>abc</name>
        </from>
        <return-path>abc</return-path>
        <date>abc</date>
        <subject>abc</subject>
        <mime-version>1.0</mime-version>
        <message-id>&lt;abc&gt;</message-id>
        <body_html>&lt;html dir="ltr"&gt;
&lt;head&gt;
&lt;meta http-equiv="Content-Type" content="text/html; charset=utf-8"/&gt;
&lt;style type="text/css" id="owaParaStyle"&gt;&lt;/style&gt;
&lt;/head&gt;
&lt;body fpstyle="1" ocsi="0"&gt;
&lt;div style="direction: ltr;font-family: Tahoma;color: #000000;font-size: 10pt;"&gt;Hello alfjskfslfkjsjsf
&lt;div&gt;Attr A: Hello my name is&lt;/div&gt;
&lt;div&gt;Attr B: ABCXYZ&lt;/div&gt;
&lt;div&gt;Attr C: 5&lt;/div&gt;
&lt;div&gt;Attr D: Mr.ABC&lt;/div&gt;
&lt;div&gt;Thank you so much&lt;/div&gt;
&lt;/div&gt;
&lt;/body&gt;
&lt;/html&gt;
</body_html>
        <body_text />
    </message>
</message_root>

then you can do:

<xsl:stylesheet version="3.0" 
xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>

<!-- Nodes without an explicit template (here: message_root) are shallow-copied.
     Without this declaration the default text-only-copy rules would drop the
     message_root tags from the result. -->
<xsl:mode on-no-match="shallow-copy"/>

<!--
    Rebuilds a message element: copies every child except body_html and message-id,
    then emits a messageid and a new body_html whose children are read from the
    document obtained by parsing the body_html string with parse-xml().
    parse-xml() raises an error if that string is not well-formed XML.
-->
<xsl:template match="message">
    <xsl:copy>
        <xsl:copy-of select="* except (body_html | message-id)"/>
        <messageid>
            <!-- Text between the first '<' and the following '>' of message-id. -->
            <xsl:value-of select="substring-before(substring-after(message-id, '&lt;'), '&gt;')" />
        </messageid>
        <body_html>
            <!-- The inner div elements of the parsed HTML body. -->
            <xsl:variable name="div" select="parse-xml(body_html)/html/body/div/div" />
            <AttrA>
                <!-- The result is a plain string, so xsl:value-of is used to write it as text. -->
                <xsl:value-of select="substring-after($div[starts-with(., 'Attr A:')], 'Attr A:')" />
            </AttrA>
            <!-- ... repeat the same pattern for Attr B, Attr C and Attr D ... -->
        </body_html>
    </xsl:copy>
</xsl:template>

</xsl:stylesheet>

CodePudding user response:

<xsl:copy-of select="parse-xml(body_html)" />

This results in an error, because the string value of the body_html element is not a well-formed XML document.

One reason it is not well-formed is that the <meta> element has only a start tag and no end tag:

<meta http-equiv="Content-Type" content="text/html; charset=utf-8">

Hope this explains the problem.

One possible solution:

You may want to use EXPath, which provides a module for parsing HTML.


If the problem with the meta element is fixed as below (for readability the text is shown in a CDATA section):

<message_root>
  <message>
    <to>
      <displayName>abc</displayName>
      <email>abc</email>
      <name>abc</name>
    </to>
    <from>
      <displayName>abc</displayName>
      <email>abc</email>
      <name>abc</name>
    </from>
    <return-path>abc</return-path>
    <date>abc</date>
    <subject>abc</subject>
    <mime-version>1.0</mime-version>
    <message-id>&lt;abc&gt;</message-id>
    <body_html>
      <![CDATA[
<html dir="ltr">
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
        <style type="text/css" id="owaParaStyle"></style>
    </head>
    <body fpstyle="1" ocsi="0">
        <div style="direction: ltr;font-family: Tahoma;color: #000000;font-size: 10pt;">Hello alfjskfslfkjsjsf
            <div>Attr A: Hello my name is</div>
            <div>Attr B: ABCXYZ</div>
            <div>Attr C: 5</div>
            <div>Attr D: Mr.ABC</div>
            <div>Thank you so much</div></div>
    </body>
</html>]]>

    </body_html>
    <body_text />
  </message>
</message_root>

and the transformation is updated accordingly:

<xsl:stylesheet version="3.0"
  xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="xml" indent="yes" />

  <!-- Root wrapper: emits message_root and processes only its message children. -->
  <xsl:template match="message_root">
    <message_root>
      <xsl:apply-templates select="message" />
    </message_root>
  </xsl:template>

  <!--
      Builds the output message. The body_html string is parsed once with
      parse-xml(); the parsed document is reused both for the copied HTML tree
      and for locating the "Attr A:" div.
  -->
  <xsl:template match="message">
    <xsl:variable name="parsedBody" select="parse-xml(body_html)"/>
    <xsl:variable name="attrDivs" select="$parsedBody/html/body/div/div" />
    <xsl:variable name="attrALine" select="$attrDivs[starts-with(., 'Attr A:')]" />
    <message>
      <xsl:apply-templates select="body_text" />
      <datasource>Inbox</datasource>
      <source>Test</source>
      <!-- Header elements are copied in this fixed order. -->
      <xsl:copy-of select="subject, date, from, to" />
      <xsl:copy-of select="$parsedBody/*" />
      <messageid>
        <!-- Removes '<' and '>' from message-id, then keeps the part before the first '@'. -->
        <xsl:value-of select="substring-before(translate(message-id, '&lt;&gt;', ''), '@')" />
      </messageid>
      <AttrA>
        <!-- Everything after the first ':' of the matching div. -->
        <xsl:value-of select="substring-after($attrALine, ':')" />
      </AttrA>
    </message>
  </xsl:template>
</xsl:stylesheet>

then the following result is produced, which seems to be what is wanted:

<?xml version="1.0" encoding="UTF-8"?>
<message_root>
   <message>
      <datasource>Inbox</datasource>
      <source>Test</source>
      <subject>abc</subject>
      <date>abc</date>
      <from>
         <displayName>abc</displayName>
         <email>abc</email>
         <name>abc</name>
      </from>
      <to>
         <displayName>abc</displayName>
         <email>abc</email>
         <name>abc</name>
      </to>
      <html dir="ltr">
         <head>
            <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
            <style type="text/css" id="owaParaStyle"/>
         </head>
         <body fpstyle="1" ocsi="0">
            <div style="direction: ltr;font-family: Tahoma;color: #000000;font-size: 10pt;">Hello alfjskfslfkjsjsf
            <div>Attr A: Hello my name is</div>
               <div>Attr B: ABCXYZ</div>
               <div>Attr C: 5</div>
               <div>Attr D: Mr.ABC</div>
               <div>Thank you so much</div>
            </div>
         </body>
      </html>
      <messageid/>
      <AttrA> Hello my name is</AttrA>
   </message>
</message_root>
  • Related