11 March 2009

Escape illegal characters with JAXB XML serialization

The XML 1.0 specification says that some characters are illegal in XML (http://www.w3.org/TR/REC-xml/).

When performing my JAXB marshal I had an ASCII control character in my Java object. This character was written into the XML file and everything looked okay... until I tried to run an XSLT transformation. My transformer engine could not transform the XML because of the character.

I searched the internet and found this thread: http://www.nabble.com/Escaping-illegal-characters-during-marshalling-td20090044.html. The code in that thread can escape special characters. I've made a small modification to the code so that it works better: I changed it so that it works with only the JDK API, and added the UTF-8 parameter so that it can also handle latin-1 characters regardless of which locale the Java VM is executing in.

There is one drawback with this: you don't get nicely indented output. All the marshaller.setProperty(...) calls will probably not work anymore.

I'm a bit disappointed that the Sun implementation of the JAXB marshaller cannot handle this problem by itself.

import java.util.HashSet;

import javax.xml.namespace.NamespaceContext;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;

/**
 * Delegating {@link XMLStreamWriter} that replaces characters which are
 * illegal in XML 1.0 with {@link #substitute} before they reach the wrapped
 * writer.
 *
 * <p>The characters treated as illegal are the C0 control characters other
 * than tab (U+0009), line feed (U+000A) and carriage return (U+000D), plus
 * the non-characters U+FFFE and U+FFFF. Substitution is applied to attribute
 * values, character data, CDATA sections, comments and processing-instruction
 * data. Names, namespaces, DTDs and entity references are passed through
 * unchanged.
 *
 * <p>Surrogate code units are deliberately left alone: a valid surrogate pair
 * may be split across two {@code writeCharacters(char[], int, int)} calls, so
 * it cannot be validated reliably one chunk at a time.
 *
 * @author Erik van Zijst (small change by Lennart Schedin)
 */
public class EscapingXMLStreamWriter implements XMLStreamWriter {

    /** Replacement written in place of every illegal character. */
    public static final char substitute = '\uFFFD';

    /** The writer that receives all (sanitised) output. */
    private final XMLStreamWriter writer;

    /**
     * Wraps the given writer.
     *
     * @param writer the writer to delegate to; must not be {@code null}
     * @throws IllegalArgumentException if {@code writer} is {@code null}
     */
    public EscapingXMLStreamWriter(XMLStreamWriter writer) {
        if (null == writer) {
            throw new IllegalArgumentException("null");
        }
        this.writer = writer;
    }

    /**
     * Tells whether the given character must be replaced.
     *
     * @param c the character to test
     * @return {@code true} for C0 controls other than tab, line feed and
     *         carriage return, and for U+FFFE and U+FFFF
     */
    private static boolean isIllegal(char c) {
        if (c < 0x20) {
            return c != '\t' && c != '\n' && c != '\r';
        }
        return c == '\uFFFE' || c == '\uFFFF';
    }

    /**
     * Substitutes all illegal characters in the given string by the value of
     * {@link #substitute}. If no illegal characters are found, no copy is
     * made and the given string is returned as is.
     *
     * @param string the text to sanitise; may be {@code null}
     * @return the sanitised text, or {@code null} if {@code string} was
     *         {@code null}
     */
    private static String escapeCharacters(String string) {
        if (string == null) {
            // Let the delegate decide how to treat a missing value.
            return null;
        }
        char[] copy = null;
        for (int i = 0; i < string.length(); i++) {
            if (isIllegal(string.charAt(i))) {
                if (copy == null) {
                    // Copy lazily, only once the first illegal char is seen.
                    copy = string.toCharArray();
                }
                copy[i] = substitute;
            }
        }
        return copy == null ? string : new String(copy);
    }

    @Override
    public void writeStartElement(String s) throws XMLStreamException {
        writer.writeStartElement(s);
    }

    @Override
    public void writeStartElement(String s, String s1) throws XMLStreamException {
        writer.writeStartElement(s, s1);
    }

    @Override
    public void writeStartElement(String s, String s1, String s2)
            throws XMLStreamException {
        writer.writeStartElement(s, s1, s2);
    }

    @Override
    public void writeEmptyElement(String s, String s1) throws XMLStreamException {
        writer.writeEmptyElement(s, s1);
    }

    @Override
    public void writeEmptyElement(String s, String s1, String s2)
            throws XMLStreamException {
        writer.writeEmptyElement(s, s1, s2);
    }

    @Override
    public void writeEmptyElement(String s) throws XMLStreamException {
        writer.writeEmptyElement(s);
    }

    @Override
    public void writeEndElement() throws XMLStreamException {
        writer.writeEndElement();
    }

    @Override
    public void writeEndDocument() throws XMLStreamException {
        writer.writeEndDocument();
    }

    @Override
    public void close() throws XMLStreamException {
        writer.close();
    }

    @Override
    public void flush() throws XMLStreamException {
        writer.flush();
    }

    /** Writes an attribute after sanitising its value. */
    @Override
    public void writeAttribute(String localName, String value) throws XMLStreamException {
        writer.writeAttribute(localName, escapeCharacters(value));
    }

    /** Writes an attribute after sanitising its value. */
    @Override
    public void writeAttribute(String prefix, String namespaceUri, String localName, String value)
            throws XMLStreamException {
        writer.writeAttribute(prefix, namespaceUri, localName, escapeCharacters(value));
    }

    /** Writes an attribute after sanitising its value. */
    @Override
    public void writeAttribute(String namespaceUri, String localName, String value)
            throws XMLStreamException {
        writer.writeAttribute(namespaceUri, localName, escapeCharacters(value));
    }

    @Override
    public void writeNamespace(String s, String s1) throws XMLStreamException {
        writer.writeNamespace(s, s1);
    }

    @Override
    public void writeDefaultNamespace(String s) throws XMLStreamException {
        writer.writeDefaultNamespace(s);
    }

    /** Writes a comment after sanitising its text. */
    @Override
    public void writeComment(String s) throws XMLStreamException {
        // Comment text is subject to the same character restrictions as content.
        writer.writeComment(escapeCharacters(s));
    }

    @Override
    public void writeProcessingInstruction(String s) throws XMLStreamException {
        writer.writeProcessingInstruction(s);
    }

    /** Writes a processing instruction after sanitising its data (not its target). */
    @Override
    public void writeProcessingInstruction(String s, String s1)
            throws XMLStreamException {
        writer.writeProcessingInstruction(s, escapeCharacters(s1));
    }

    /** Writes a CDATA section after sanitising its text. */
    @Override
    public void writeCData(String s) throws XMLStreamException {
        writer.writeCData(escapeCharacters(s));
    }

    @Override
    public void writeDTD(String s) throws XMLStreamException {
        writer.writeDTD(s);
    }

    @Override
    public void writeEntityRef(String s) throws XMLStreamException {
        writer.writeEntityRef(s);
    }

    @Override
    public void writeStartDocument() throws XMLStreamException {
        writer.writeStartDocument();
    }

    @Override
    public void writeStartDocument(String s) throws XMLStreamException {
        writer.writeStartDocument(s);
    }

    @Override
    public void writeStartDocument(String s, String s1)
            throws XMLStreamException {
        writer.writeStartDocument(s, s1);
    }

    /** Writes character data after sanitising it. */
    @Override
    public void writeCharacters(String s) throws XMLStreamException {
        writer.writeCharacters(escapeCharacters(s));
    }

    /**
     * Writes the given slice of character data after sanitising it. The
     * caller's array is never modified; a copy of the slice is made only when
     * it actually contains an illegal character.
     *
     * @param chars the source array
     * @param start index of the first character to write
     * @param len   number of characters to write
     */
    @Override
    public void writeCharacters(char[] chars, int start, int len)
            throws XMLStreamException {
        char[] copy = null;
        for (int i = 0; i < len; i++) {
            if (isIllegal(chars[start + i])) {
                if (copy == null) {
                    copy = new char[len];
                    System.arraycopy(chars, start, copy, 0, len);
                }
                copy[i] = substitute;
            }
        }
        if (copy == null) {
            writer.writeCharacters(chars, start, len);
        } else {
            writer.writeCharacters(copy, 0, len);
        }
    }

    @Override
    public String getPrefix(String s) throws XMLStreamException {
        return writer.getPrefix(s);
    }

    @Override
    public void setPrefix(String s, String s1) throws XMLStreamException {
        writer.setPrefix(s, s1);
    }

    @Override
    public void setDefaultNamespace(String s) throws XMLStreamException {
        writer.setDefaultNamespace(s);
    }

    @Override
    public void setNamespaceContext(NamespaceContext namespaceContext)
            throws XMLStreamException {
        writer.setNamespaceContext(namespaceContext);
    }

    @Override
    public NamespaceContext getNamespaceContext() {
        return writer.getNamespaceContext();
    }

    @Override
    public Object getProperty(String s) throws IllegalArgumentException {
        return writer.getProperty(s);
    }
}


Here is the test class:
import static org.junit.Assert.*;

import java.io.ByteArrayOutputStream;
import java.nio.charset.Charset;

import javax.xml.bind.*;
import javax.xml.bind.annotation.*;
import javax.xml.stream.*;

import org.junit.Test;

/**
 * Checks that marshalling through {@link EscapingXMLStreamWriter} replaces an
 * illegal control character while leaving a latin-1 character untouched.
 *
 * @author Lennart Schedin
 */
public class EscapeJaxb {

    @Test
    public void testEvilXml() throws Exception {
        // Marshal into an in-memory buffer, routing all output through the filter.
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        Marshaller marshaller = JAXBContext.newInstance(EvilXml.class).createMarshaller();
        XMLStreamWriter rawWriter =
                XMLOutputFactory.newInstance().createXMLStreamWriter(buffer, "UTF-8");
        marshaller.marshal(new EvilXml(), new EscapingXMLStreamWriter(rawWriter));

        assertEquals(59, buffer.size());

        // Expected document: the raw content with the BEL character swapped
        // for the substitute; the latin-1 character must survive unchanged.
        StringBuilder expected = new StringBuilder();
        expected.append("<?xml version=\"1.0\" ?><evilXml>");
        expected.append(new EvilXml().content);
        expected.append("</evilXml>");
        String expectedXml = expected.toString()
                .replace('\u0007', EscapingXMLStreamWriter.substitute);

        String actualXml = new String(buffer.toByteArray(), Charset.forName("UTF8"));
        assertEquals(expectedXml, actualXml);
    }

    /** Minimal JAXB root element whose value contains an XML-illegal character. */
    @XmlRootElement
    public static class EvilXml {
        /* Illegal control ASCII character and a latin-1 A with a ring above */
        @XmlValue
        private String content = "Hello World \u0007 \u00c5";
    }
}

1 comment:

  1. Thanks - just what I was looking for, after coming across several misinformed and slightly useless posts on StackOverflow about escaping invalid characters simply saying 'use a library'. JAXB is a library and is blindly outputting characters in XML that shouldn't be there.

    ReplyDelete