JSoup Document.toString() does not generate correct XML-Output (original) (raw)

We tried to configure JSoup as an XML parser and unparser (serializer). However, JSoup does not seem to generate valid output for an XML document containing the escaped entity `&#x1b;` (the escape character).

Find below a demonstration of the problem.

package at.ac.uibk.jsoup.tests;

import java.io.IOException; import java.io.StringReader; import java.io.StringWriter;

import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult;

import org.jsoup.Jsoup; import org.jsoup.parser.Parser; import org.w3c.dom.Document; import org.xml.sax.InputSource; import org.xml.sax.SAXException;

/**
 * Demonstration for a bug report: round-trips an XML 1.1 document whose text contains the
 * character reference {@code &#x1b;} through jsoup's XML parser and through the JDK's DOM
 * parser plus identity transformer, printing every intermediate result to standard output
 * so the two serializations can be compared.
 */
public class JSoupTest {

    /** XML 1.1 sample whose element text contains the character reference {@code &#x1b;}. */
    final static String originalXML = "<?xml version=\"1.1\" encoding=\"UTF-8\"?>\r\n"
            + "<SomeText>This is an escaped escape-character: &#x1b;</SomeText>";

    /**
     * Runs the jsoup demonstration followed by the JDK demonstration.
     *
     * @param args ignored
     * @throws SAXException                 propagated from {@link #parseXMLInternal()}
     * @throws IOException                  propagated from {@link #parseXMLInternal()}
     * @throws ParserConfigurationException propagated from {@link #parseXMLInternal()}
     * @throws TransformerException         propagated from {@link #parseXMLInternal()}
     */
    public static void main(String[] args)
            throws SAXException, IOException, ParserConfigurationException, TransformerException {

        parseXMLWithJSoup();

        parseXMLInternal();

    }

    /**
     * Parses {@link #originalXML} with jsoup's XML parser, prints the serialized result, then
     * feeds that result back through the same parse/serialize step and prints it again.
     */
    private static void parseXMLWithJSoup() {
        System.out.println();
        System.out.println("------------------- incorrect unparsing with JSOUP ------------------- ");
        System.out.println();
        System.out.println("original XML with escaped escape-character:\n  " + originalXML);

        String returnedXMLFromJSoupParser = roundTripWithJSoup(originalXML);
        System.out.println();
        System.out.println("returned XMLFromJSoupParser No escaped escape character: \n  " + returnedXMLFromJSoupParser);

        // Second pass: serialize the document that was parsed from the first pass's output.
        // (Previously the first document was serialized again here, so the reparse was never shown.)
        String returned2ndXMLFromJSoupParser = roundTripWithJSoup(returnedXMLFromJSoupParser);
        System.out.println();
        System.out.println("returned reparsed result XMLFromJSoupParser: " + returned2ndXMLFromJSoupParser);
    }

    /**
     * Parses {@code xml} with jsoup's XML parser (empty base URI) and serializes it again using
     * XML output syntax, an indent amount of 2 and pretty-printing enabled.
     *
     * @param xml the XML text to parse
     * @return the result of {@code toString()} on the parsed jsoup document
     */
    private static String roundTripWithJSoup(String xml) {
        org.jsoup.nodes.Document document = Jsoup.parse(xml, "", Parser.xmlParser());
        document.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml).indentAmount(2)
                .prettyPrint(true);
        return document.toString();
    }

    /**
     * Parses {@link #originalXML} with the JDK's {@link DocumentBuilder}, writes the DOM back out
     * through a default {@link Transformer}, and prints the result.
     *
     * @throws SAXException                 if the document cannot be parsed
     * @throws IOException                  if reading the input fails
     * @throws ParserConfigurationException if a {@link DocumentBuilder} cannot be created
     * @throws TransformerException         if the transformer cannot be created or the transformation fails
     */
    public static void parseXMLInternal()
            throws SAXException, IOException, ParserConfigurationException, TransformerException {
        System.out.println();
        System.out.println("----------------------- correct unparsing ---------------------- ");
        System.out.println();
        System.out.println("original XML with escaped escape-character:\n  " + originalXML);
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();

        InputSource input = new InputSource(new StringReader(originalXML));
        Document doc = builder.parse(input);

        // A transformer created without a stylesheet copies the source to the result unchanged.
        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer trans = tf.newTransformer();
        StringWriter sw = new StringWriter();
        trans.transform(new DOMSource(doc), new StreamResult(sw));

        String returnedXMLFromSaxParser = sw.toString();

        System.out.println();
        System.out.println("returned XML From SAX Parser/Transformer: \n" + returnedXMLFromSaxParser);

    }

}

Best regards
Michael