JSoup Document.toString() does not generate correct XML-Output (original) (raw)
We tried to configure JSoup as an XML parser and unparser (serializer). However, JSoup does not seem to generate valid output from an XML document that contains an escaped character entity.
Below is a demonstration of the problem.
package at.ac.uibk.jsoup.tests;
import java.io.IOException; import java.io.StringReader; import java.io.StringWriter;
import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult;
import org.jsoup.Jsoup; import org.jsoup.parser.Parser; import org.w3c.dom.Document; import org.xml.sax.InputSource; import org.xml.sax.SAXException;
/**
 * Demonstration program for a bug report about jsoup's XML output.
 *
 * <p>The same XML document is round-tripped twice: once through jsoup (XML parser plus
 * {@code Document.toString()} with XML output syntax) and once through the JDK's
 * {@link DocumentBuilder} / {@link Transformer} pair, so the two serializations can be
 * compared on standard output.
 */
public class JSoupTest {

    /**
     * XML 1.1 sample document fed to both round trips.
     *
     * <p>NOTE(review): the report describes this text as containing an escaped
     * escape-character entity, but no entity reference is present in the literal as it
     * appears here. Presumably it was lost when the report was copied; confirm against the
     * original report before relying on this sample.
     */
    final static String originalXML = "<?xml version=\"1.1\" encoding=\"UTF-8\"?>\r\n"
            + "<SomeText>This is an escaped escape-character: </SomeText>";

    /**
     * Runs the jsoup round trip first, then the JDK round trip.
     *
     * @param args unused
     * @throws SAXException if the JDK parser rejects the sample document
     * @throws IOException if reading the sample document fails
     * @throws ParserConfigurationException if no {@link DocumentBuilder} can be created
     * @throws TransformerException if serializing the DOM fails
     */
    public static void main(String[] args)
            throws SAXException, IOException, ParserConfigurationException, TransformerException {
        parseXMLWithJSoup();
        parseXMLInternal();
    }

    /**
     * Parses the sample with jsoup's XML parser, prints the serialized result, then parses
     * that result again and prints the second serialization.
     */
    private static void parseXMLWithJSoup() {
        System.out.println();
        System.out.println("------------------- incorrect unparsing with JSOUP ------------------- ");
        System.out.println();
        System.out.println("original XML with escaped escape-character:\n " + originalXML);

        org.jsoup.nodes.Document document = parseAsXml(originalXML);
        String returnedXMLFromJSoupParser = document.toString();
        System.out.println();
        System.out.println("returned XMLFromJSoupParser No escaped escape character: \n " + returnedXMLFromJSoupParser);

        org.jsoup.nodes.Document document2 = parseAsXml(returnedXMLFromJSoupParser);
        // Serialize the re-parsed document. The original serialized `document` again here,
        // so the "reparsed" output was just a repeat of the first result.
        String returned2ndXMLFromJSoupParser = document2.toString();
        System.out.println();
        System.out.println("returned reparsed result XMLFromJSoupParser: " + returned2ndXMLFromJSoupParser);
    }

    /**
     * Parses {@code xml} with jsoup's XML parser (empty base URI) and configures the
     * document's output settings for XML syntax, an indent amount of 2, and pretty printing.
     *
     * @param xml the XML text to parse
     * @return the parsed document with its output settings configured
     */
    private static org.jsoup.nodes.Document parseAsXml(String xml) {
        org.jsoup.nodes.Document parsed = Jsoup.parse(xml, "", Parser.xmlParser());
        parsed.outputSettings()
                .syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml)
                .indentAmount(2)
                .prettyPrint(true);
        return parsed;
    }

    /**
     * Parses the sample with the JDK's {@link DocumentBuilder} and serializes the resulting
     * DOM with a {@link Transformer} obtained from {@code TransformerFactory.newTransformer()},
     * printing the input and the output.
     *
     * @throws SAXException if the parser rejects the sample document
     * @throws IOException if reading the sample document fails
     * @throws ParserConfigurationException if no {@link DocumentBuilder} can be created
     * @throws TransformerException if serializing the DOM fails
     */
    public static void parseXMLInternal()
            throws SAXException, IOException, ParserConfigurationException, TransformerException {
        System.out.println();
        System.out.println("----------------------- correct unparsing ---------------------- ");
        System.out.println();
        System.out.println("original XML with escaped escape-character:\n " + originalXML);

        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        InputSource input = new InputSource(new StringReader(originalXML));
        Document doc = builder.parse(input);

        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer trans = tf.newTransformer();
        StringWriter sw = new StringWriter();
        trans.transform(new DOMSource(doc), new StreamResult(sw));

        String returnedXMLFromSaxParser = sw.toString();
        System.out.println();
        System.out.println("returned XML From SAX Parser/Transformer: \n" + returnedXMLFromSaxParser);
    }
}
Best regards
Michael