Home > Software design >  Sax Parser preserve space in content
Sax Parser preserve space in content

Time:10-17

I have an XML file like this:

<?xml version="1.0" encoding="UTF-8"?>
<main>
    <tagA>190</tagA>
    <tagB>  :  </tagB>
    <tagc>2019-07-02</tagc>
</main>

Note that there are four spaces ("    ") before tagA and tagB, and a tab ("\t") before tagc.

I want to preserve the spaces in the content of tagB, so the expected values are:

tagA "190"
tagB "  :  "
tagC "2019-07-02"

My code:

/**
 * SAX handler that accumulates every chunk of character data it receives and
 * prints the accumulated text whenever an element ends.
 *
 * <p>The buffer is only reset in {@link #endElement}, never when an element
 * starts. Whitespace that the parser reports between elements (line breaks and
 * indentation) is therefore still in the buffer when the next element's own
 * text arrives, and is printed as a prefix of that element's value.
 */
public class FurmaxXmlHandler extends DefaultHandler {

    /** True once at least one characters() call has been seen since the last flush. */
    private boolean isTextNode = false;

    /** Text accumulated since the last flush in endElement(). */
    private StringBuilder textNode = new StringBuilder();

    /**
     * Appends the reported chunk to the buffer and echoes the chunk on its own.
     *
     * @param ch     array holding the characters
     * @param start  index of the first character of the chunk
     * @param length number of characters in the chunk
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {

        isTextNode = true;
        textNode.append(ch, start, length);
        // to test only: show what this single call delivered
        String temp = new String(ch, start, length);
        System.out.println("single call characters -> " + temp + "<-");
    }

    /**
     * Prints everything accumulated since the previous flush as the value of
     * the element that just ended, then starts a fresh buffer. Nothing is
     * printed when no characters() call happened since the previous flush.
     *
     * @param uri       namespace URI (unused)
     * @param localName local name (unused)
     * @param name      qualified name of the element that ended
     */
    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        if (isTextNode) {
            String data = textNode.toString();
            System.out.println("value of " + name + " ->" + data + "<-");
            textNode = new StringBuilder();
            isTextNode = false;
        }

    }

    /**
     * Parses the hard-coded test file with this handler.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        File file = new File(
                "C:\\Workspace\\Progetti\\ProveJavaGradle\\src\\main\\java\\it\\furmax\\xml\\test.xml");
        // Hand the parser the raw bytes: a character stream would make it ignore the
        // encoding declared in the XML prolog and use the platform default charset.
        // try-with-resources closes the file even when parsing fails.
        try (FileInputStream in = new FileInputStream(file)) {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            SAXParser saxParser = factory.newSAXParser();
            XMLReader reader = saxParser.getXMLReader();
            FurmaxXmlHandler handler = new FurmaxXmlHandler();
            reader.setContentHandler(handler);
            InputSource inputSource = new InputSource(in);
            reader.parse(inputSource);
        } catch (Exception e) {
            e.printStackTrace();
        }

    }
}

Output:

single call characters ->
    <-
single call characters ->190<-
value of tagA ->
    190<-
single call characters ->
    <-
single call characters ->  :  <-
value of tagB ->
      :  <-
single call characters ->
    <-
single call characters ->2019-07-02<-
value of tagc ->
    2019-07-02<-
single call characters ->
<-
value of main ->
<-

As you can see, for a single tag there is more than one call to the characters method, each containing some text ("\n" or other characters), and the final content of the tag is the concatenation of all of them.

  1. If I trim the final data, tagB does not match the expected value.
  2. If I don't trim the final data, tagA and tagB do not match the expected values.

Furthermore, I tried replacing textNode with a List to which every characters call is added, taking the last entry in endElement. However, in a larger file there are cases where the content is split across several characters calls (see image), so I cannot just take the last element.

enter image description here

My last idea is to linearize the whole file before parsing, but I have to work with very large files and I have no idea how to do this safely.

CodePudding user response:

There is really no problem with your requirement to preserve the whitespace in your data:

import java.io.*;
import javax.xml.parsers.*;
import org.xml.sax.helpers.*;
import org.xml.sax.*;

/**
 * SAX handler that captures the text content of {@code tagB} exactly as the
 * parser reports it, including leading and trailing whitespace.
 *
 * <p>Character data is only buffered between the start and the end of a
 * {@code tagB} element, so the whitespace the parser reports between elements
 * never reaches the buffer.
 */
public class FurmaxXmlHandler extends DefaultHandler {

    /** True while the parser is between the start and the end of a tagB element. */
    private boolean collect = false;

    /** Text gathered for the tagB element currently being read. */
    private StringBuilder textNode = new StringBuilder();

    /**
     * Buffers the chunk when collecting; a single text node may arrive in
     * several calls, so chunks are appended rather than replaced.
     *
     * @param ch     array holding the characters
     * @param start  index of the first character of the chunk
     * @param length number of characters in the chunk
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (collect) {
            textNode.append(ch, start, length);
        }
    }

    /**
     * Switches collection on when a {@code tagB} element starts.
     *
     * @param qName qualified name of the element that started
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes a) throws SAXException {
        if ("tagB".equals(qName)) {
            collect = true;
        }
    }

    /**
     * When a {@code tagB} element ends, prints its untrimmed text and clears
     * the buffer for the next occurrence.
     *
     * @param name qualified name of the element that ended
     */
    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        if ("tagB".equals(name)) {
            collect = false;
            System.out.printf("tagB = '%s'%n", textNode.toString());
            textNode.setLength(0);
        }

    }

    /**
     * Parses {@code test.xml} from the working directory with this handler.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File file = new File("test.xml");
        // Hand the parser the raw bytes: a character stream would make it ignore the
        // encoding declared in the XML prolog and use the platform default charset.
        // try-with-resources closes the file even when parsing fails.
        try (FileInputStream in = new FileInputStream(file)) {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            SAXParser saxParser = factory.newSAXParser();
            XMLReader reader = saxParser.getXMLReader();
            FurmaxXmlHandler handler = new FurmaxXmlHandler();
            reader.setContentHandler(handler);
            InputSource inputSource = new InputSource(in);
            reader.parse(inputSource);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

CodePudding user response:

i tested with

<?xml version="1.0" encoding="UTF-8"?>
<main>
    <tagA>190</tagA>
    <tagB>  :  </tagB>
    <tagc>2019-07-02</tagc>
    <tagc>2018-07-02</tagc>
    <fatherA>
    <tagd attribute1=" ciao ">2018-07-02</tagd>
    </fatherA>
    <tagf></tagf>
</main>

with

/**
 * SAX handler that prints the untrimmed text of every element when it ends.
 *
 * <p>Text is attributed to the most recently started element. Once any element
 * ends, both tracking names are reset to the empty string, so text reported
 * after a child element closes is buffered under the empty name. As a result
 * an element with children reports only the text that follows its last child
 * (typically the indentation before its end tag), and an element with no
 * character data at all reports an empty string.
 */
public class FurmaxXmlHandler2 extends DefaultHandler {

    /** Text gathered for {@link #workElement}. */
    private StringBuilder textNode = new StringBuilder();

    /** Qualified name of the most recently started element, or "" after an element ended. */
    private String currentElement = "";

    /** Name of the element the buffered text belongs to, or "" after an element ended. */
    private String workElement = "";

    /**
     * Buffers the chunk. The first chunk seen for a newly started element
     * discards whatever was buffered before it; later chunks for the same
     * element are appended, since one text node may arrive in several calls.
     *
     * @param ch     array holding the characters
     * @param start  index of the first character of the chunk
     * @param length number of characters in the chunk
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (!workElement.equals(currentElement)) {
            // First text of a different element: drop inter-element whitespace.
            workElement = currentElement;
            textNode = new StringBuilder();
        }
        textNode.append(ch, start, length);
    }

    /**
     * Records the element that just started as the owner of upcoming text.
     *
     * @param qName qualified name of the element that started
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes a) throws SAXException {
        currentElement = qName;
    }

    /**
     * Prints the buffered text as the value of the element that ended, or an
     * empty string when the buffer belongs to a different element, then resets
     * all tracking state.
     *
     * @param name qualified name of the element that ended
     */
    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        String data = "";
        if (currentElement.equals(workElement)) {
            data = textNode.toString();
        }
        currentElement = "";
        workElement = "";
        System.out.printf("%s = '%s'%n", name, data);
        textNode.setLength(0);
    }

    /**
     * Parses {@code test.xml} from the working directory with this handler.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File file = new File("test.xml");
        // Hand the parser the raw bytes: a character stream would make it ignore the
        // encoding declared in the XML prolog and use the platform default charset.
        // try-with-resources closes the file even when parsing fails.
        try (FileInputStream in = new FileInputStream(file)) {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            SAXParser saxParser = factory.newSAXParser();
            XMLReader reader = saxParser.getXMLReader();
            FurmaxXmlHandler2 handler = new FurmaxXmlHandler2();
            reader.setContentHandler(handler);
            InputSource inputSource = new InputSource(in);
            reader.parse(inputSource);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

result:

tagA = '190'
tagB = '  :  '
tagc = '2019-07-02'
tagc = '2018-07-02'
tagd = '2018-07-02'
fatherA = '
    '
tagf = ''
main = '
'

What do you think?

  • Related