Our department has inherited code that uses Apache PDFBox 1.8.x or earlier, and we are in the process of migrating it to Apache PDFBox 2.0.x. Parts of the code use the TextNormalize class, but I cannot find any mention of it in the 2.0 javadocs. I also can't find any mention of it in the Migration to PDFBox 2.0.0 guide.
I can't seem to find any information on how this class has changed or on what replaces TextNormalize. Does anyone have any suggestions on how it should be replaced for Apache PDFBox 2.0?
We create a TextNormalize object in the constructor of our class that extends PDFStreamEngine, and the only place it is used is in the code that merges/inserts diacritics:
/**
 * Merge a single character TextPosition into the current object.
 * This is to be used only for cases where we have a diacritic that
 * overlaps an existing TextPosition. In a graphical display, we could
 * overlay them, but for text extraction we need to merge them. Use the
 * contains() method to test if two objects overlap.
 * <p>
 * The characters of this TextPosition are walked left to right, using the
 * widths array to compute each character's horizontal extent, and the
 * diacritic is inserted next to the first character it is judged to belong
 * to. A diacritic whose text is longer than one character is ignored. If the
 * text has more characters than there are width entries, the walk stops at
 * the last character that has a width and any remaining characters are not
 * considered.
 *
 * @param diacritic TextPosition to merge into the current TextPosition.
 * @param normalize Instance of TextNormalize class to be used to normalize diacritic
 */
public void mergeDiacritic(TextPosition diacritic, TextNormalize normalize)
{
    if (diacritic.getCharacter().length() > 1)
    {
        return;
    }

    float diacXStart = diacritic.getXDirAdj();
    float diacXEnd = diacXStart + diacritic.widths[0];

    float currCharXStart = getXDirAdj();

    int strLen = str.length();
    boolean wasAdded = false;

    for (int i = 0; i < strLen && !wasAdded; i++)
    {
        // The loop is driven by the text length but indexes the widths array.
        // Stop instead of throwing ArrayIndexOutOfBoundsException when the text
        // has more characters than widths (presumably a ligature, where one
        // glyph maps to several characters -- TODO confirm against callers).
        if (i >= widths.length)
        {
            break;
        }

        float currCharXEnd = currCharXStart + widths[i];

        /*
         * This is the case where there is an overlap of the diacritic character with
         * the current character and the previous character. If there is no previous
         * character, the diacritic is attached to the current one.
         */
        if (diacXStart < currCharXStart && diacXEnd <= currCharXEnd)
        {
            if (i == 0)
            {
                insertDiacritic(i, diacritic, normalize);
            }
            else
            {
                // Attach to whichever neighbour has the larger fraction of its
                // own width covered by the diacritic; ties go to the current one.
                float distanceOverlapping1 = diacXEnd - currCharXStart;
                float percentage1 = distanceOverlapping1 / widths[i];

                float distanceOverlapping2 = currCharXStart - diacXStart;
                float percentage2 = distanceOverlapping2 / widths[i - 1];

                if (percentage1 >= percentage2)
                {
                    insertDiacritic(i, diacritic, normalize);
                }
                else
                {
                    insertDiacritic(i - 1, diacritic, normalize);
                }
            }
            wasAdded = true;
        }
        // The diacritic completely covers this character, so we assume that
        // this is the character the diacritic belongs to.
        else if (diacXStart < currCharXStart && diacXEnd > currCharXEnd)
        {
            insertDiacritic(i, diacritic, normalize);
            wasAdded = true;
        }
        // The diacritic modifies this character because it is completely
        // contained within the character's width.
        else if (diacXStart >= currCharXStart && diacXEnd <= currCharXEnd)
        {
            insertDiacritic(i, diacritic, normalize);
            wasAdded = true;
        }
        // Last character in the TextPosition, so the diacritic is attached to it.
        else if (diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == (strLen - 1))
        {
            insertDiacritic(i, diacritic, normalize);
            wasAdded = true;
        }

        // Advance to the start of the next character. This uses the width of the
        // character at the loop index, even when an insertion has just occurred
        // (the loop ends in that case, so the value is no longer used).
        currCharXStart += widths[i];
    }
}
and
/**
 * Inserts the diacritic TextPosition into the str of this TextPosition
 * and updates the widths array to include the extra character width.
 * <p>
 * The normalized diacritic is given a width of 0, and the base character
 * keeps its original width. For a base character with right-to-left
 * directionality the diacritic is placed before it; otherwise it is placed
 * after it.
 *
 * @param i index in str of the character the diacritic is attached to
 * @param diacritic The diacritic TextPosition
 * @param normalize Instance of TextNormalize class to be used to normalize diacritic
 */
private void insertDiacritic(int i, TextPosition diacritic, TextNormalize normalize)
{
    /* We add the diacritic to the right or left of the character
     * depending on the direction of the character. Note that this
     * is only required because the text is currently stored in
     * presentation order and not in logical order.
     */
    char baseChar = str.charAt(i);
    int dir = Character.getDirectionality(baseChar);
    boolean rightToLeft = (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
            || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
            || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
            || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE);

    // A local, single-threaded buffer: StringBuilder avoids StringBuffer's locking.
    StringBuilder buf = new StringBuilder(str.length() + 1);
    buf.append(str, 0, i);

    // One extra slot for the diacritic; entries before index i are unchanged.
    float[] widths2 = new float[widths.length + 1];
    System.arraycopy(widths, 0, widths2, 0, i);

    String normalizedDiacritic = normalize.normalizeDiac(diacritic.getCharacter());

    if (rightToLeft)
    {
        // diacritic first, then the base character
        buf.append(normalizedDiacritic);
        widths2[i] = 0;
        buf.append(baseChar);
        widths2[i + 1] = widths[i];
    }
    else
    {
        // base character first, then the diacritic
        buf.append(baseChar);
        widths2[i] = widths[i];
        buf.append(normalizedDiacritic);
        widths2[i + 1] = 0;
    }

    // Copy the rest of the string and shift the remaining widths one slot right.
    buf.append(str, i + 1, str.length());
    System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);

    str = buf.toString();
    widths = widths2;
}