I'd like to split a big XML file into many parts using StAX technology, based on a split-basis node.
The problem is when split-basis nodes are juxtaposed (no whitespace, no tab and no breakline between them). Those seems not be read by the parser while transform instruction. However when I put in comment transform instruction, those nodes are correctly read and outputted into the console.
Bellow XML sample. Split-basis is AB node.
< ?xml version="1.0" encoding="UTF-8"?>
< root>
< AB Id="1">< BC attB="valB1">b1< /BC>< CD attC="valC1">< EF attE="valD1">c1< /EF>< /CD>< /AB>< AB Id="2">< BC attB="valB2">b2< /BC>< CD attC="valC2">< EF attE="valD2">c2< /EF>< /CD>< /AB>
< AB Id="3">
< BC attB="valB3">b3< /BC>
< CD attC="valC3">
< EF attE="valD3">c3< /EF>
< /CD>
< /AB>
< /root>
The expected output should be 3 files named Part_1.xml, Part_2.xml and Part_3.xml. Each file should respectively contain < AB Id="1"> and its sub-tags, < AB Id="2"> and its sub-tags and < AB Id="3"> and its sub-tags. All of them should also have < root > node parent.
Unfortunately, I only obtain Part_1.xml and Part_2.xml files. Inside Part_1.xml I get < AB id = "1"> and its sub-tags. But inside Part_2.xml, I get < AB id = "3"> and its sub-tags instead of < AB id = "2"> and its sub-tags. < AB id = "2"> and its sub-tags are not written.
When I put in comment transformer.transform(staxs, staxr); line. StreamReader reads correclty < AB id = "2">.
code:
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.stream.events.XMLEvent;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stax.StAXResult;
import javax.xml.transform.stax.StAXSource;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
@Component
final class Split {
private static final Logger LOG = LoggerFactory.getLogger(Split.class);
private static final String inputXmlFile = "Big xml file to split. Above a dummy sample";
private static final String pathToOutputFolder = System.getProperty("java.io.tmpdir");
private static final String[] parentTags = new String[] {"<root>"};
private static final String splitTag = "<AB>";
private static final String chunkNumber = "1";
public void run() {
final Transformer transformer;
XMLStreamReader xsr = null;
XMLStreamWriter xsw = null;
try {
transformer = TransformerFactory.newInstance().newTransformer();
final XMLInputFactory xif = XMLInputFactory.newInstance();
xsr = xif.createXMLStreamReader(new FileInputStream(inputXmlFile));
Short fileNumber = 0;
Short dataRepetitions = 0;
xsw = write(pathToOutputFolder, ++fileNumber, parentTags);
int tagCount = 0;
while (xsr.hasNext()) {
xsr.next();
if (xsr.getEventType() == XMLEvent.START_ELEMENT) {
tagCount++;
System.out.println("Tag _" + tagCount + ": " + xsr.getLocalName());
if (xsr.getLocalName().equals(splitTag)) {
System.out.println(xsr.getLocalName() + ": [" + xsr.getAttributeLocalName(0) + ", " + xsr.getAttributeValue(0) + "]");
if (dataRepetitions.equals(1)) {
xsw.flush();
xsw.writeEndDocument();
xsw.close();
xsw = write(pathToOutputFolder, ++fileNumber, parentTags);
dataRepetitions = 0;
}
final StAXSource staxs = new StAXSource(xsr);
final StAXResult staxr = new StAXResult(xsw);
transformer.transform(staxs, staxr);
dataRepetitions++;
}
}
}
} catch (final TransformerException | FileNotFoundException | XMLStreamException e) {
throw new SplitXmlRuntimeException(e.getMessage());
} finally {
try {
xsr.close();
if (xsw != null) {
xsw.flush();
xsw.writeEndDocument();
xsw.close();
}
} catch (final XMLStreamException e) {
LOG.error(e.getMessage());
}
}
}
private XMLStreamWriter write(final String pathToOutputFolder, final Short fileNumber, final String[] rootTags) throws XMLStreamException, FileNotFoundException {
XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();
XMLStreamWriter writer = xmlOutputFactory.createXMLStreamWriter(new FileOutputStream(new File(pathToOutputFolder, "Part_" + fileNumber), true));
writer.writeStartDocument();
for (final String s : rootTags) {
writer.writeStartElement(s);
}
return writer;
}
}