apache · noureldin-eg · Feb 9, 2023
diff --git a/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java b/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
@@ -1928,8 +1928,17 @@ private String normalizeWord(String word)
                 else
                 {
                     // Trim because some decompositions have an extra space, such as U+FC5E
-                    builder.append(Normalizer
-                            .normalize(word.substring(q, q + 1), Normalizer.Form.NFKC).trim());
+                    String normalized = Normalizer.normalize(
+                            word.substring(q, q + 1), Normalizer.Form.NFKC).trim();
+
+                    // Arabic Presentation Forms-A from FB50 to FDFF and
+                    // Arabic Presentation Forms-B from FE70 to FEFF
+                    if (0xFB50 <= c && normalized.length() > 1)
+                    {
+                        // Reverse the order of decomposed Arabic letters
+                        normalized = new StringBuilder(normalized).reverse().toString();
+                    }
+                    builder.append(normalized);
                 }
                 p = q + 1;
             }
@@ -1963,7 +1972,7 @@ private StringBuilder normalizeAdd(List<WordWithTextPositions> normalized,
         else
         {
             TextPosition text = item.getTextPosition();
-            lineBuilder.append(text.getUnicode());
+            lineBuilder.append(text.getVisuallyOrderedUnicode());
             wordPositions.add(text);
         }
         return lineBuilder;

diff --git a/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java b/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java
@@ -170,6 +170,35 @@ public String getUnicode()
         return unicode;
     }
 
+    /**
+     * Returns the text of {@link #getUnicode()} in visual order, that is, in the order
+     * it reads on screen from left to right. One glyph may stand for several Arabic
+     * characters kept in logical (typing) order; such text is returned reversed.
+     *
+     * @return The string on the screen in visual order.
+     */
+    public String getVisuallyOrderedUnicode()
+    {
+        final String glyphText = getUnicode();
+        // A lone code point (or an empty string) reads the same in both directions
+        if (glyphText.codePointCount(0, glyphText.length()) < 2)
+        {
+            return glyphText;
+        }
+        int offset = 0;
+        while (offset < glyphText.length())
+        {
+            final int current = glyphText.codePointAt(offset);
+            if (Character.getDirectionality(current)
+                    == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
+            {
+                return new StringBuilder(glyphText).reverse().toString();
+            }
+            offset += Character.charCount(current);
+        }
+        return glyphText;
+    }
+
     /**
      * Return the internal PDF character codes of the glyphs in this text.
      *