Notebook: Common XML Tasks

My notebooks contain code snippets that I use fairly regularly but not enough to remember off the top of my head.  Google is misleading since the oldest stuff often has the most links.

A quick review: the Document Object Model (DOM) is a parse tree of the entire document. Browsers see their HTML (and XML) content as DOM objects. Many operations are easiest in a DOM, but this must be weighed against the comparatively heavy resources a DOM requires versus a SAX parser.

A SAX (Simple API for XML) parser is a much lighter object than a DOM, and is a lot more powerful than you might expect. It does, however, require thinking in streaming mode.

DOM methods:

import java.io.StringWriter;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;

private static final DocumentBuilderFactory dbFactory;
private static final TransformerFactory tFactory;

// static initialization
// The factories are created once and shared by every call below.
// NOTE(review): JAXP expects newDocumentBuilder() and newTransformer() to be
// safe to call concurrently as long as nobody is reconfiguring the factory at
// the same time -- confirm for the JAXP implementation in use.
// The DocumentBuilder and Transformer objects they produce are not
// thread-safe, so obtain a fresh one per use.
static {
   try {
      dbFactory = DocumentBuilderFactory.newInstance();
      tFactory = TransformerFactory.newInstance();
   } catch (FactoryConfigurationError e) {
      // unable to get a document builder factory
      throw new ExceptionInInitializerError(e);
   } catch (TransformerFactoryConfigurationError e) {
      // unable to get a transformer factory; report it the same way as the
      // document builder failure above
      throw new ExceptionInInitializerError(e);
   }
}

/**
 * Build a new, empty DOM document.
 * <p>
 * A fresh DocumentBuilder is requested from the shared factory on every
 * call and is discarded once the document has been created.
 *
 * @return a newly created, empty Document
 * @throws ParserConfigurationException if the factory cannot supply a
 *         DocumentBuilder
 */
public Document newDocument() throws ParserConfigurationException {
   return dbFactory.newDocumentBuilder().newDocument();
}

/**
 * Serialize a DOM to a String.
 * <p>
 * You can do this "more efficiently" by walking the DOM yourself and
 * writing to a StringBuilder but that is usually not as robust
 * as using a standard DOM serializer.  For instance, how do you handle
 * namespaces?  UTF-16?  Knowing that you need to break a CDATA section
 * into multiple pieces because there's a "]]>" in the content?
 * <p>
 * The output keeps the XML declaration, declares UTF-8 as its encoding and
 * is not indented.  Because the result is a Java String, a caller that
 * later writes it out as bytes should encode it as UTF-8 so that the bytes
 * agree with the declaration.
 *
 * @param doc the document to serialize
 * @return the serialized XML text
 * @throws TransformerConfigurationException if a Transformer cannot be
 *         created
 * @throws TransformerException if the transformation itself fails
 */
public String serialize(Document doc) throws TransformerConfigurationException, TransformerException {
   StringWriter out = new StringWriter();
   // newTransformer() with no stylesheet argument gives a transformer that
   // copies the source to the result.
   Transformer tf = tFactory.newTransformer();
   tf.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
   tf.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
   tf.setOutputProperty(OutputKeys.INDENT, "no");
   // Optional: list element names whose text should be written as CDATA.
   //tf.setOutputProperty(OutputKeys.CDATA_SECTION_ELEMENTS, "content");
   tf.transform(new DOMSource(doc), new StreamResult(out));

   return out.toString();
}

SAX methods:

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.ext.DefaultHandler2;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * Sample class that handles SAX callback methods.
 */
private static class MyHandler extends DefaultHandler2 {
   // parse a document
   // a SAXParseException gives line and column number.
   void parse(File file) throws SAXException, IOException {
   Reader r = null;
   try {
      r = new FileReader(file);
      Handler h = new MyHandler();
      XMLReader reader = XMLReaderFactory.createXMLReader();
      reader.setContentHandler(h);
      reader.parse(new InputSource(new XmlEntityReader(r)));
   } finally {
      if (r != null) {
         r.close();
      }
   }
}

Leave your Comment

Blue Taste Theme created by Jabox