Unicode breaks xml serialization (original) (raw)
The parsed HTML is clearly malformed, but I would expect the output to be well-formed XML once jsoup has re-serialized it.
- The serialized output contains non-ASCII Unicode characters in tag names, which contradicts the configured output charset:
document.outputSettings().charset("ASCII");
Version: 1.13.1
import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Entities; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import java.io.IOException; import java.io.StringReader;
public class Test2 { public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException { Document document = Jsoup.parse(" <div id="emid"> <p\u226F\u0322\u0329\u032B\u0320\u0309\u030A\u0366\u0364\u036D\u030A..\u0337\u0359\u036F\u030A\u033D\u0313\u0346\u0309\u036B.\u0347\u032A\u0367\u0305\u0301>\n < p=""> \n </p\u226F\u0322\u0329\u032B\u0320\u0309\u030A\u0366\u0364\u036D\u030A..\u0337\u0359\u036F\u030A\u033D\u0313\u0346\u0309\u036B.\u0347\u032A\u0367\u0305\u0301><> \n "); document.outputSettings().syntax(Document.OutputSettings.Syntax.xml); document.outputSettings().escapeMode(Entities.EscapeMode.xhtml); document.outputSettings().prettyPrint(true); document.outputSettings().charset("ASCII"); String html = document.html();
System.out.println(html);
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser = factory.newSAXParser();
saxParser.parse(new InputSource(new StringReader(html)), new DefaultHandler() {
@Override
public void warning(SAXParseException e) throws SAXException {
e.printStackTrace();
}
@Override
public void error(SAXParseException e) throws SAXException {
e.printStackTrace();
}
@Override
public void fatalError(SAXParseException e) throws SAXException {
e.printStackTrace();
}
});
}}
output:
<html>
<head></head>
<body>
<div id="emid"> <p≯...>
< p="">
</p≯...><>
</div>
</body>
</html>
org.xml.sax.SAXParseException; lineNumber: 4; columnNumber: 21; Element type "p" must be followed by either attribute specifications, ">" or "/>".
at org.apache.xerces.util.ErrorHandlerWrapper.createSAXParseException(Unknown Source)
at org.apache.xerces.util.ErrorHandlerWrapper.fatalError(Unknown Source)
at org.apache.xerces.impl.XMLErrorReporter.reportError(Unknown Source)
at org.apache.xerces.impl.XMLErrorReporter.reportError(Unknown Source)
at org.apache.xerces.impl.XMLErrorReporter.reportError(Unknown Source)
at org.apache.xerces.impl.XMLScanner.reportFatalError(Unknown Source)
at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanStartElement(Unknown Source)
at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl$FragmentContentDispatcher.dispatch(Unknown Source)
at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source)
at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source)
at org.apache.xerces.jaxp.SAXParserImpl$JAXPSAXParser.parse(Unknown Source)
at org.apache.xerces.jaxp.SAXParserImpl.parse(Unknown Source)
at Test2.main(Test2.java:31)
Exception in thread "main" org.xml.sax.SAXParseException; lineNumber: 4; columnNumber: 21; Element type "p" must be followed by either attribute specifications, ">" or "/>".
at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source)
at org.apache.xerces.jaxp.SAXParserImpl$JAXPSAXParser.parse(Unknown Source)
at org.apache.xerces.jaxp.SAXParserImpl.parse(Unknown Source)
at Test2.main(Test2.java:31)
Process finished with exit code 1