I'm trying to replace special characters (German umlauts) in XML files using StAX.
I'm able to achieve this when the input and output XML files are different, but the processing time for a 100 MB file is about 10 minutes. I think this is due to the I/O operations, which take time because the new file is written line by line.
Is it possible to read the XML file and write the replaced strings back to the same file through StAX, which would save the extra I/O operation?
Any help or clue would be very helpful.
public class StAXXMLFileDemo {
// Shared factory used by main to build the replacement character-data events.
static XMLEventFactory m_eventFactory = XMLEventFactory.newInstance();
// Strings searched for in character data; index-aligned with replacementList.
static String[] searchList = { "Ä", "ä", "Ö", "ö", "Ü", "ü", "ß" };
// ASCII substitute for the entry at the same index of searchList.
static String[] replacementList = { "Ae", "ae", "Oe", "oe", "Ue", "ue", "ss" };
/**
 * Command-line entry point: copies every {@code *.xml} file found directly in the input
 * directory into the output directory, replacing each entry of {@code searchList} with the
 * matching entry of {@code replacementList} in character data.
 *
 * <p>Nothing happens unless exactly two arguments are given and both name existing
 * directories. The output directory is cleaned before processing starts. A failure on one
 * file is printed to stderr and processing continues with the next file.
 *
 * @param args {@code args[0]} is the input directory, {@code args[1]} the output directory
 */
public static void main(String[] args) {
    if (args.length != 2) {
        return;
    }
    File inputfileDirectory = new File(args[0]);
    File outputfileDirectory = new File(args[1]);

    // Validate both paths BEFORE deleting anything, so a mistyped input path does not
    // wipe the output directory and a non-directory output path is never passed to
    // cleanDirectory.
    // NOTE(review): if both arguments name the same directory, the clean step below
    // removes the input files as well - confirm callers never do that.
    if (!inputfileDirectory.isDirectory() || !outputfileDirectory.isDirectory()) {
        return;
    }

    try {
        FileUtils.cleanDirectory(outputfileDirectory);
    } catch (IOException e1) {
        // Best effort: stale files may remain, but conversion can still proceed.
        e1.printStackTrace();
        System.out.println("Exception in deleting output directory files");
    }

    File[] files = inputfileDirectory.listFiles();
    if (files == null) {
        // listFiles() returns null when the directory cannot be read.
        return;
    }

    // The factories are loop-invariant; creating them once avoids repeating the
    // provider lookup for every file.
    XMLInputFactory factory = XMLInputFactory.newInstance();
    XMLOutputFactory outFactory = XMLOutputFactory.newInstance();

    for (File file : files) {
        if (file.isDirectory() || !file.getAbsolutePath().endsWith(".xml")) {
            continue;
        }
        File target = new File(outputfileDirectory, file.getName());
        try {
            transliterateFile(file, target, factory, outFactory);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (XMLStreamException e) {
            e.printStackTrace();
        }
    }
}

/**
 * Streams one XML file through StAX into {@code target}, rewriting character data with
 * {@code searchList}/{@code replacementList} and passing every other event through as is.
 *
 * @param source        XML file to read (decoded as UTF-8)
 * @param target        file to create or overwrite (encoded as UTF-8)
 * @param inputFactory  factory used to create the event reader
 * @param outputFactory factory used to create the event writer
 * @throws IOException        if either file cannot be opened or closed
 * @throws XMLStreamException if the input cannot be parsed or the output cannot be written
 */
private static void transliterateFile(File source, File target,
        XMLInputFactory inputFactory, XMLOutputFactory outputFactory)
        throws IOException, XMLStreamException {
    // The StAX reader/writer are not AutoCloseable, so the file streams themselves are
    // owned by try-with-resources and get closed even when parsing fails half-way.
    // Fully-qualified names are used for the buffered wrappers so that no new import
    // is required.
    try (InputStream in = new java.io.BufferedInputStream(new FileInputStream(source));
            OutputStream out = new java.io.BufferedOutputStream(new FileOutputStream(target))) {
        XMLEventReader eventReader = inputFactory.createXMLEventReader(in, "UTF-8");
        XMLEventWriter eventWriter = outputFactory.createXMLEventWriter(out, "UTF-8");

        while (eventReader.hasNext()) {
            XMLEvent event = eventReader.nextEvent();
            if (event.getEventType() == XMLStreamConstants.CHARACTERS) {
                // getData() is the documented accessor for the text; the format of
                // toString() is left to the StAX implementation.
                String text = event.asCharacters().getData();
                // One pass is enough: no replacement contains a search string, so the
                // repeated variant could never find anything on a second pass.
                String replaced = StringUtils.replaceEach(text, searchList, replacementList);
                eventWriter.add(m_eventFactory.createCharacters(replaced));
            } else {
                // Everything else (including comments, DTD and END_DOCUMENT) is copied
                // unchanged so that nothing is dropped from the output.
                eventWriter.add(event);
            }
        }

        eventWriter.flush();
        eventWriter.close();
        eventReader.close();
    }
}