Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PDFBOX-4531: Reverse the order of decomposed Arabic letters #154

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
Original file line number Diff line number Diff line change
Expand Up @@ -1928,8 +1928,17 @@ private String normalizeWord(String word)
else
{
// Trim because some decompositions have an extra space, such as U+FC5E
builder.append(Normalizer
.normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim());
String normalized = Normalizer.normalize(
word.substring(q, q + 1), Normalizer.Form.NFKC).trim();

// Arabic Presentation Forms-A from FB50 to FDFF and
// Arabic Presentation Forms-B from FE70 to FEFF
if (0xFB50 <= c && normalized.length() > 1)
{
// Reverse the order of decomposed Arabic letters
normalized = new StringBuilder(normalized).reverse().toString();
}
builder.append(normalized);
}
p = q + 1;
}
Expand Down Expand Up @@ -1963,7 +1972,7 @@ private StringBuilder normalizeAdd(List<WordWithTextPositions> normalized,
else
{
TextPosition text = item.getTextPosition();
lineBuilder.append(text.getUnicode());
lineBuilder.append(text.getVisuallyOrderedUnicode());
wordPositions.add(text);
}
return lineBuilder;
Expand Down
29 changes: 29 additions & 0 deletions pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,35 @@ public String getUnicode()
return unicode;
}

/**
 * Returns the text of {@link #getUnicode()} in visual order, i.e. the order in
 * which the characters appear on screen when read from left to right.
 * <p>
 * A single glyph may map to several Unicode characters stored in logical order
 * (the order in which they would be typed). For Arabic that order runs from
 * right to left, so whenever the text holds more than one code point and at
 * least one of them has Arabic right-to-left directionality, the whole string
 * is returned reversed. In every other case the text is returned unchanged.
 *
 * @return The string on the screen in visual order.
 */
public String getVisuallyOrderedUnicode()
{
    String logical = getUnicode();
    int charLength = logical.length();

    // Empty text or a lone code point reads the same in either direction,
    // so there is nothing to reverse regardless of its directionality.
    if (logical.codePointCount(0, charLength) <= 1)
    {
        return logical;
    }

    // Walk the string code point by code point (not char by char) so that
    // supplementary characters are classified as a whole.
    int offset = 0;
    while (offset < charLength)
    {
        int cp = logical.codePointAt(offset);
        if (Character.getDirectionality(cp) == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
        {
            // StringBuilder.reverse() keeps surrogate pairs intact.
            return new StringBuilder(logical).reverse().toString();
        }
        offset += Character.charCount(cp);
    }
    return logical;
}

/**
* Return the internal PDF character codes of the glyphs in this text.
*
Expand Down