-
-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
40e4d56
commit f3fde36
Showing
4 changed files
with
66 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
72 changes: 52 additions & 20 deletions
72
OsmAnd-java/src/main/java/net/osmand/util/ArabicNormalizer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,35 +1,67 @@ | ||
package net.osmand.util; | ||
|
||
import java.text.Normalizer; | ||
import java.util.regex.Pattern; | ||
|
||
/**
 * Utility methods for detecting and removing presentation-only characters in
 * Arabic text: short-vowel diacritics (U+064B..U+0652), the kashida/tatweel
 * elongation mark (U+0640) and Arabic-Indic digits (U+0660..U+0669).
 *
 * <p>All methods are stateless and operate on UTF-16 {@code char} values; every
 * character handled here lies in the Basic Multilingual Plane.
 */
public class ArabicNormalizer {

    /** First and last code units of the diacritic range that is stripped. */
    private static final char FIRST_DIACRITIC = '\u064B';
    private static final char LAST_DIACRITIC = '\u0652';

    /** Arabic-Indic digit zero; the ten digits are contiguous from here. */
    private static final char ARABIC_ZERO = '\u0660';
    private static final char ARABIC_NINE = '\u0669';

    /** Kashida (tatweel), a purely typographic elongation character. */
    private static final char KASHIDA = '\u0640';

    /**
     * Tells whether {@code text} starts with a character from the Arabic Unicode
     * block and contains at least one diacritic, Arabic-Indic digit or kashida.
     *
     * @param text the text to inspect; may be {@code null}
     * @return {@code true} if the first character is Arabic and any character is
     *         one that {@link #normalize(String)} would remove or replace;
     *         {@code false} otherwise, including for {@code null} or empty input
     */
    public static boolean isSpecialArabic(String text) {
        if (text == null || text.isEmpty()) {
            return false;
        }
        if (!startsWithArabic(text)) {
            return false;
        }
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (isDiacritic(c) || isArabicDigit(c) || isKashida(c)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes diacritics and kashida from {@code text} and then converts
     * Arabic-Indic digits to ASCII digits.
     *
     * <p>Digits are converted only when the first character that remains after
     * stripping belongs to the Arabic Unicode block; otherwise the stripped text
     * is returned with its digits untouched.
     *
     * @param text the text to normalize; may be {@code null}
     * @return the normalized text; {@code null} or empty input is returned as is,
     *         and input made up solely of diacritics/kashida yields {@code ""}
     */
    public static String normalize(String text) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        StringBuilder stripped = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (!isDiacritic(c) && !isKashida(c)) {
                stripped.append(c);
            }
        }
        return replaceDigits(stripped.toString());
    }

    /**
     * Replaces every Arabic-Indic digit with its ASCII counterpart, provided the
     * text starts with a character from the Arabic Unicode block.
     *
     * @param text the text to process; may be {@code null} or empty
     * @return the text with digits replaced, or {@code text} itself when it is
     *         {@code null}, empty, or does not start with an Arabic character
     */
    private static String replaceDigits(String text) {
        // The empty check matters: stripping can consume the whole input, and
        // charAt(0) on "" would throw StringIndexOutOfBoundsException.
        if (text == null || text.isEmpty()) {
            return text;
        }
        if (!startsWithArabic(text)) {
            return text;
        }
        char[] chars = text.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            if (isArabicDigit(chars[i])) {
                // The digits are contiguous, so the offset from ARABIC_ZERO is the value.
                chars[i] = (char) ('0' + (chars[i] - ARABIC_ZERO));
            }
        }
        return String.valueOf(chars);
    }

    /** Returns whether the first character of a non-empty string is in the Arabic block. */
    private static boolean startsWithArabic(String text) {
        return Character.UnicodeBlock.of(text.charAt(0)) == Character.UnicodeBlock.ARABIC;
    }

    /** Returns whether {@code c} lies in the diacritic range U+064B..U+0652. */
    private static boolean isDiacritic(char c) {
        return c >= FIRST_DIACRITIC && c <= LAST_DIACRITIC;
    }

    /** Returns whether {@code c} is an Arabic-Indic digit, U+0660..U+0669. */
    private static boolean isArabicDigit(char c) {
        return c >= ARABIC_ZERO && c <= ARABIC_NINE;
    }

    /** Returns whether {@code c} is the kashida character, U+0640. */
    private static boolean isKashida(char c) {
        return c == KASHIDA;
    }
}