Home > Software design >  How do you read, process and write content of a non-standard formatted xml
How do you read, process and write content of a non-standard formatted xml

Time:03-21

I'm trying to process the content of a Language-XML-File in C# for machine translations.

The content of <seg-source> Segments should be translated and written back to the <target> segments. The formatting of tags inside the source or target segments should stay the same.

My first problem is that the XML file is not read correctly because its start and end tags are not <xml> and </xml>. Replacing the first two lines of text with an <xml> tag does not work because the original XML file is written entirely on one line (the following example is formatted for easier reading).

Is there an easy way to copy all source information that should be translated to an array and write it back after I've processed it?

This is what the XML-Files (.sdlxliff) look like:

<?xml version="1.0" encoding="utf-8"?>
<xliff xmlns:sdl="http://sdl.com/FileTypes/SdlXliff/1.0" xmlns="urn:oasis:names:tc:xliff:document:1.2" version="1.2" sdl:version="1.0">
    <file original="" datatype="x-sdlfilterframework2" source-language="de-DE" target-language="en-US">
        <header>
            <file-info xmlns="http://sdl.com/FileTypes/SdlXliff/1.0">
                <value key="SDL:FileId">77260240-fccf-4e75-81e3-7a1ab00fe948</value>
                <value key="SDL:CreationDate">03/18/2022 16:00:07</value>
                <value key="SDL:OriginalFilePath"></value>
                <value key="SDL:FileTypeDllVersion">1.8.2.0</value>
                <value key="SDL:OriginalEncoding">utf-8</value>
                <value key="SDL:AutoClonedFlagSupported">True</value>
                <value key="HasUtf8Bom">False</value>
                <value key="LineBreakType">
</value>
                <value key="ParagraphTextDirections"/>
                <sniff-info>
                    <detected-encoding detection-level="Likely" encoding="utf-8"/>
                    <detected-source-lang detection-level="Guess" lang="de-DE"/>
                    <props>
                        <value key="HasUtf8Bom">False</value>
                        <value key="LineBreakType">
</value>
                    </props>
                </sniff-info>
            </file-info>
            <sdl:filetype-info>
                <sdl:filetype-id>Plain Text v 1.0.0.0</sdl:filetype-id>
            </sdl:filetype-info>
            <tag-defs xmlns="http://sdl.com/FileTypes/SdlXliff/1.0">
                <tag id="0">
                    <st name="^">^</st>
                </tag>
                <tag id="1">
                    <st name="$">$</st>
                </tag>
                <tag id="2">
                    <st name="^">^</st>
                </tag>
                <tag id="3">
                    <st name="$">$</st>
                </tag>
                <tag id="4">
                    <st name="^">^</st>
                </tag>
                <tag id="5">
                    <st name="$">$</st>
                </tag>
            </tag-defs>
        </header>
        <body>
            <trans-unit translate="no" id="08c58142-03fe-4aad-8bc6-64e45600e91c">
                <source>
                    <x id="0"/>
                </source>
            </trans-unit>
            <trans-unit id="038509df-7f97-4faa-867f-ec00a1290f62">
                <source>Ein Satz zu übersetzen</source>
                <seg-source>
                    <mrk mtype="seg" mid="1">Ein Satz zu übersetzen</mrk>
                </seg-source>
                <target>
                    <mrk mtype="seg" mid="1"/>
                </target>
                <sdl:seg-defs>
                    <sdl:seg id="1"/>
                </sdl:seg-defs>
            </trans-unit>
            <trans-unit translate="no" id="b3f5e43b-6bba-41e4-a9fd-b7e4077694cc">
                <source>
                    <x id="1"/>
                    <x id="2"/>
                </source>
            </trans-unit>
            <trans-unit id="4c7dcbe2-1ebe-4e56-bb9a-2fe647b12f1f">
                <source>Ein zweiter Satz zu übersetzen</source>
                <seg-source>
                    <mrk mtype="seg" mid="2">Ein zweiter Satz zu übersetzen</mrk>
                </seg-source>
                <target>
                    <mrk mtype="seg" mid="2"/>
                </target>
                <sdl:seg-defs>
                    <sdl:seg id="2"/>
                </sdl:seg-defs>
            </trans-unit>
            <trans-unit translate="no" id="0ca0c301-f5a2-44e8-8754-7618c98e14c6">
                <source>
                    <x id="3"/>
                    <x id="4"/>
                </source>
            </trans-unit>
            <trans-unit id="5b3973af-b0cf-4dcf-b66c-aea309389c2d">
                <source>Ein letzter weiterer Satz zu übersetzen</source>
                <seg-source>
                    <mrk mtype="seg" mid="3">Ein letzter weiterer Satz zu übersetzen</mrk>
                </seg-source>
                <target>
                    <mrk mtype="seg" mid="3"/>
                </target>
                <sdl:seg-defs>
                    <sdl:seg id="3"/>
                </sdl:seg-defs>
            </trans-unit>
            <trans-unit translate="no" id="1cced868-b401-45c5-be2b-ea1fede236c0">
                <source>
                    <x id="5"/>
                </source>
            </trans-unit>
        </body>
    </file>
</xliff>

This is my code for reading the file, but I have no clue how to deal with tags inside the source segments, and I guess there must be a better way to replace the start tag:

    // Read the entire selected file into memory as one string.
    string fileContents = File.ReadAllText(ofd_ToTranslate.FileName);

    // Try to swap the XML declaration plus the <xliff ...> root start tag for a plain <xml> tag.
    // String.Replace only acts on an exact match of the search text.
    // NOTE(review): the search text spells the encoding as "utf - 8" (with spaces) while the sample
    // file declares "utf-8"; if the real file matches the sample, this call replaces nothing - confirm.
    fileContents = fileContents.Replace("<?xml version=\"1.0\" encoding=\"utf - 8\"?><xliff xmlns:sdl=\"http://sdl.com/FileTypes/SdlXliff/1.0\" xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\" sdl:version=\"1.0\">", "<xml>");
    // Rename the closing root tag to match the replaced start tag.
    fileContents = fileContents.Replace("</xliff>", "</xml>");

    // Build a parser context that maps the "sdl" prefix to the empty namespace URI.
    // Presumably this is meant to let the reader accept sdl:-prefixed names once the root's
    // xmlns:sdl declaration has been replaced away - verify.
    XmlReaderSettings settings = new XmlReaderSettings { NameTable = new NameTable() };
    XmlNamespaceManager xmlns = new XmlNamespaceManager(settings.NameTable);
    xmlns.AddNamespace("sdl", "");
    XmlParserContext context = new XmlParserContext(null, xmlns, "", XmlSpace.Default);
    // Parse from the in-memory string rather than from the file on disk.
    XmlReader reader = XmlReader.Create(new StringReader(fileContents), settings, context);
    XmlDocument xmlDoc = new XmlDocument();

    xmlDoc.Load(reader);

    // Collect all elements named "source" and "target" from the loaded document.
    XmlNodeList sourceElements = xmlDoc.GetElementsByTagName("source");
    XmlNodeList targetElements = xmlDoc.GetElementsByTagName("target");

CodePudding user response:

Your XML is perfectly fine, but it has a default namespace:

xmlns="urn:oasis:names:tc:xliff:document:1.2"

To access the nodes you need to use the namespace.

Here's an example:

// Elements of the document live in the default XLIFF namespace, so element names
// must be qualified with that namespace when querying.
var xd = XDocument.Load(@"file.xml");
var xn = XNamespace.Get("urn:oasis:names:tc:xliff:document:1.2");
// XNamespace + string yields the namespace-qualified XName for <trans-unit>.
var tus = xd.Root?.Descendants(xn + "trans-unit");
// tus is null when the document has no root (because of the ?. above), so guard the count.
Console.WriteLine(tus?.Count() ?? 0);

That outputs 7 for me.

CodePudding user response:

You can use XML serialization:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Serialization;

namespace ConsoleApp1
{
    /// <summary>
    /// Console entry point that deserializes an XLIFF file into the <see cref="Xliff"/> object model.
    /// </summary>
    class Program
    {
        /// <summary>Path of the file that <see cref="Main"/> reads.</summary>
        const string FILENAME = @"c:\temp\test.xml";

        /// <summary>
        /// Opens <see cref="FILENAME"/> and deserializes its content into an <see cref="Xliff"/> instance.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            XmlSerializer serializer = new XmlSerializer(typeof(Xliff));

            // XmlReader is IDisposable; the using block releases the underlying file
            // even when Deserialize throws.
            using (XmlReader reader = XmlReader.Create(FILENAME))
            {
                Xliff xliff = (Xliff)serializer.Deserialize(reader);
            }
        }
    }
    /// <summary>
    /// Root <c>xliff</c> element, bound to the XLIFF 1.2 document namespace.
    /// </summary>
    [XmlRoot("xliff", Namespace = "urn:oasis:names:tc:xliff:document:1.2")]
    public class Xliff
    {
        /// <summary>The <c>file</c> child element.</summary>
        [XmlElement(ElementName = "file")]
        public File file { get; set; }
    }
    /// <summary>
    /// The <c>file</c> element: its attributes plus the <c>header</c> and <c>body</c> children.
    /// </summary>
    public class File
    {
        /// <summary>Value of the <c>original</c> attribute.</summary>
        // Mapped so the attribute is kept when the object is serialized back to XML.
        [XmlAttribute("original")]
        public string original { get; set; }

        /// <summary>Value of the <c>datatype</c> attribute.</summary>
        [XmlAttribute()]
        public string datatype { get; set; }

        /// <summary>Value of the <c>source-language</c> attribute.</summary>
        [XmlAttribute("source-language")]
        public string sourceLanguage { get; set; }

        /// <summary>Value of the <c>target-language</c> attribute.</summary>
        // Mapped so the attribute is kept when the object is serialized back to XML.
        [XmlAttribute("target-language")]
        public string targetLanguage { get; set; }

        /// <summary>The <c>header</c> child element.</summary>
        [XmlElement("header")]
        public Header header { get; set; }

        /// <summary>The <c>body</c> child element.</summary>
        [XmlElement("body")]
        public Body body { get; set; }
    }
    /// <summary>
    /// The <c>header</c> element; all of its mapped children are in the SDL namespace.
    /// </summary>
    public class Header
    {
        // Namespace URI shared by every mapped child of the header.
        private const string SdlNamespace = "http://sdl.com/FileTypes/SdlXliff/1.0";

        /// <summary>The <c>file-info</c> child element.</summary>
        [XmlElement("file-info", Namespace = SdlNamespace)]
        public FileInfo fileInfo { get; set; }

        /// <summary>The <c>filetype-info</c> child element.</summary>
        [XmlElement("filetype-info", Namespace = SdlNamespace)]
        public FileType fileType { get; set; }

        /// <summary>The <c>tag</c> entries wrapped by the <c>tag-defs</c> element.</summary>
        [XmlArray("tag-defs", Namespace = SdlNamespace)]
        [XmlArrayItem("tag")]
        public List<Tag> tags { get; set; }
    }
    /// <summary>
    /// The <c>file-info</c> element: a flat list of key/value entries plus sniffing details.
    /// </summary>
    public class FileInfo
    {
        /// <summary>All direct <c>value</c> children (SDL namespace).</summary>
        [XmlElement("value", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public List<Value> values { get; set; }

        /// <summary>The <c>sniff-info</c> child element.</summary>
        [XmlElement("sniff-info")]
        public SniffInfo sniffInfo { get; set; }
    }
    /// <summary>
    /// A <c>value</c> entry: a <c>key</c> attribute and the element's text content.
    /// </summary>
    public class Value
    {
        /// <summary>Value of the <c>key</c> attribute.</summary>
        [XmlAttribute("key")]
        public string key { get; set; }

        /// <summary>Text content of the element.</summary>
        [XmlText()]
        public string value { get; set; }
    }
    /// <summary>
    /// The <c>sniff-info</c> element: detected encoding, detected source language and extra properties.
    /// </summary>
    public class SniffInfo
    {
        /// <summary>The <c>detected-encoding</c> child element.</summary>
        [XmlElement("detected-encoding")]
        public Encoding encoding { get; set; }

        /// <summary>The <c>detected-source-lang</c> child element.</summary>
        [XmlElement("detected-source-lang")]
        public Source source { get; set; }

        /// <summary>The <c>value</c> entries wrapped by the <c>props</c> element.</summary>
        [XmlArray(ElementName = "props")]
        [XmlArrayItem(ElementName = "value")]
        public List<Value> values { get; set; }
    }
    /// <summary>
    /// The <c>detected-encoding</c> element, described entirely by two attributes.
    /// </summary>
    public class Encoding
    {
        /// <summary>Value of the <c>detection-level</c> attribute.</summary>
        [XmlAttribute(AttributeName = "detection-level")]
        public string detectionLevel { get; set; }

        /// <summary>Value of the <c>encoding</c> attribute.</summary>
        [XmlAttribute("encoding")]
        public string encoding { get; set; }
    }
    /// <summary>
    /// The <c>detected-source-lang</c> element, described entirely by two attributes.
    /// </summary>
    public class Source
    {
        /// <summary>Value of the <c>detection-level</c> attribute.</summary>
        [XmlAttribute(AttributeName = "detection-level")]
        public string detectionLevel { get; set; }

        /// <summary>Value of the <c>lang</c> attribute.</summary>
        [XmlAttribute("lang")]
        public string lang { get; set; }
    }
    /// <summary>
    /// The <c>filetype-info</c> element.
    /// </summary>
    public class FileType
    {
        /// <summary>Text of the <c>filetype-id</c> child element.</summary>
        [XmlElement("filetype-id")]
        public string id { get; set; }
    }
    /// <summary>
    /// A <c>tag</c> definition: a numeric <c>id</c> attribute and one <c>st</c> child.
    /// </summary>
    public class Tag
    {
        /// <summary>Value of the <c>id</c> attribute, parsed as an integer.</summary>
        [XmlAttribute(AttributeName = "id")]
        public int id { get; set; }

        /// <summary>The <c>st</c> child element.</summary>
        [XmlElement("st")]
        public St st { get; set; }
    }
    /// <summary>
    /// The <c>st</c> element of a tag definition: a <c>name</c> attribute and the element's text content.
    /// </summary>
    public class St
    {
        /// <summary>Value of the <c>name</c> attribute.</summary>
        [XmlAttribute()]
        public string name { get; set; }

        /// <summary>Text content of the element.</summary>
        // Without [XmlText] the serializer would look for a child element called "value"
        // and the element text (e.g. "^" in <st name="^">^</st>) would never be read.
        [XmlText]
        public string value { get; set; }
    }
    /// <summary>
    /// The <c>body</c> element: the list of translation units.
    /// </summary>
    public class Body
    {
        /// <summary>All direct <c>trans-unit</c> children.</summary>
        [XmlElement("trans-unit")]
        public List<TransUnit> transUnits { get; set; }
    }
    /// <summary>
    /// A <c>trans-unit</c> element with its attributes and mapped children.
    /// </summary>
    public class TransUnit
    {
        /// <summary>Value of the <c>translate</c> attribute.</summary>
        [XmlAttribute("translate")]
        public string translate { get; set; }

        /// <summary>Value of the <c>id</c> attribute.</summary>
        [XmlAttribute("id")]
        public string id { get; set; }

        /// <summary>
        /// The <c>x</c> children of the <c>source</c> element; only these children are mapped.
        /// </summary>
        [XmlArray(ElementName = "source")]
        [XmlArrayItem(ElementName = "x")]
        public List<X> xs { get; set; }

        /// <summary>The <c>seg-source</c> child element.</summary>
        [XmlElement("seg-source")]
        public SegSource segSource { get; set; }

        /// <summary>The <c>target</c> child element (element name taken from the property name).</summary>
        public Target target { get; set; }

        /// <summary>The <c>seg-defs</c> child element (SDL namespace).</summary>
        [XmlElement("seg-defs", Namespace = "http://sdl.com/FileTypes/SdlXliff/1.0")]
        public SegDefs segDeg { get; set; }
    }
    /// <summary>
    /// An <c>x</c> placeholder element identified by a numeric <c>id</c> attribute.
    /// </summary>
    public class X
    {
        /// <summary>Value of the <c>id</c> attribute, parsed as an integer.</summary>
        [XmlAttribute(AttributeName = "id")]
        public int id { get; set; }
    }
    /// <summary>
    /// The <c>seg-source</c> element, holding a single <c>mrk</c> child.
    /// </summary>
    public class SegSource
    {
        /// <summary>The <c>mrk</c> child element.</summary>
        [XmlElement("mrk")]
        public Mrk mrk { get; set; }
    }
    /// <summary>
    /// The <c>target</c> element, holding a single <c>mrk</c> child.
    /// </summary>
    public class Target
    {
        /// <summary>The <c>mrk</c> child element.</summary>
        [XmlElement("mrk")]
        public Mrk mrk { get; set; }
    }
    /// <summary>
    /// An <c>mrk</c> element: <c>mtype</c> and <c>mid</c> attributes plus the element's text content.
    /// </summary>
    public class Mrk
    {
        /// <summary>Value of the <c>mtype</c> attribute.</summary>
        [XmlAttribute("mtype")]
        public string mtype { get; set; }

        /// <summary>Value of the <c>mid</c> attribute.</summary>
        [XmlAttribute("mid")]
        public string mid { get; set; }

        /// <summary>Text content of the element.</summary>
        [XmlText()]
        public string value { get; set; }
    }
    /// <summary>
    /// The <c>seg-defs</c> element, holding a single <c>seg</c> child.
    /// </summary>
    public class SegDefs
    {
        /// <summary>The <c>seg</c> child element.</summary>
        [XmlElement("seg")]
        public Seg seg { get; set; }
    }
    /// <summary>
    /// A <c>seg</c> element identified by a numeric <c>id</c> attribute.
    /// </summary>
    public class Seg
    {
        /// <summary>Value of the <c>id</c> attribute, parsed as an integer.</summary>
        [XmlAttribute("id")]
        public int id { get; set; }
    }
}
  • Related