Skip to content

Commit

Permalink
Add arabic normalizer for search
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanPyrohivskyi committed Jan 14, 2025
1 parent 6c6489e commit 40e4d56
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 3 deletions.
14 changes: 12 additions & 2 deletions OsmAnd-java/src/main/java/net/osmand/CollatorStringMatcher.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package net.osmand;

import net.osmand.util.ArabicNormalizer;

import java.util.Locale;


Expand Down Expand Up @@ -55,9 +57,17 @@ public Collator getCollator() {
/**
 * Tests a candidate name against this matcher.
 *
 * @param name the candidate string to test
 * @return the result of the static {@code cmatches} check run with this
 *         instance's collator, search part and matching mode
 */
public boolean matches(String name) {
	final boolean matched = cmatches(collator, name, part, mode);
	return matched;
}



/**
 * Checks whether {@code fullName} matches {@code part} under {@code mode},
 * either exactly as given or after {@code ArabicNormalizer.normalize} has
 * been applied to it.
 *
 * @param collator collator handed through to the mode-specific comparison
 * @param fullName name to test; a {@code null} name is passed to the
 *                 underlying comparison unchanged and never normalized
 * @param part     search fragment to look for
 * @param mode     matching strategy
 * @return {@code true} if the raw name or its normalized form matches
 */
public static boolean cmatches(Collator collator, String fullName, String part, StringMatcherMode mode){
	// Try the name as stored first: when it already matches there is no
	// reason to pay for normalization at all.
	if (cmatchInternal(collator, fullName, part, mode)) {
		return true;
	}
	// normalize(null) returns null, so calling fullName.equals(...) on a
	// null name would throw; there is nothing further to try in that case.
	if (fullName == null) {
		return false;
	}
	String withoutDiacritic = ArabicNormalizer.normalize(fullName);
	// A second comparison is only useful when normalization changed the text.
	return !fullName.equals(withoutDiacritic)
			&& cmatchInternal(collator, withoutDiacritic, part, mode);
}

private static boolean cmatchInternal(Collator collator, String fullName, String part, StringMatcherMode mode){
switch (mode) {
case CHECK_CONTAINS:
return ccontains(collator, fullName, part);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import net.osmand.data.QuadRect;
import net.osmand.osm.AbstractPoiType;
import net.osmand.util.Algorithms;
import net.osmand.util.ArabicNormalizer;
import net.osmand.util.LocationParser;
import net.osmand.util.MapUtils;

Expand Down Expand Up @@ -802,7 +803,12 @@ public boolean matches(Collection<String> map) {

/**
 * Tests {@code name} against the wrapped matcher {@code sm}, first as given
 * and then, if that fails, in its {@code ArabicNormalizer.normalize} form.
 *
 * @param name candidate name; a {@code null} name is only passed to
 *             {@code sm} unchanged and is never normalized
 * @return {@code true} if either the raw or the normalized name matches
 */
@Override
public boolean matches(String name) {
	// The raw name is tried first so normalization is skipped on a direct hit.
	// (A leftover unconditional "return sm.matches(name);" used to sit here
	// and made the normalization fallback below unreachable.)
	if (sm.matches(name)) {
		return true;
	}
	// Guard before name.equals(...): normalize(null) returns null.
	if (name == null) {
		return false;
	}
	String normalized = ArabicNormalizer.normalize(name);
	// Only retry when normalization actually produced a different string.
	return !name.equals(normalized) && sm.matches(normalized);
}

}
Expand Down
35 changes: 35 additions & 0 deletions OsmAnd-java/src/main/java/net/osmand/util/ArabicNormalizer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package net.osmand.util;

import java.text.Normalizer;
import java.util.regex.Pattern;

/**
 * Folds Arabic text into a simplified form so that search can match names
 * regardless of vowel marks, hamza carriers and tatweel.
 *
 * <p>Only characters from the Arabic Unicode blocks are altered. Text in any
 * other script (Latin accents, Devanagari vowel signs, Hebrew points, ...)
 * passes through unchanged apart from the final whitespace trim. An earlier
 * version decomposed the whole string (NFD) and removed every non-spacing
 * mark, which silently rewrote non-Arabic names as well.
 */
public class ArabicNormalizer {

    /** Tatweel / kashida (U+0640), a purely typographic elongation character. */
    private static final String KASHIDA = "\u0640";

    /**
     * Returns {@code text} with Arabic diacritics and kashida removed and
     * with hamza/alef/yeh/teh-marbuta variants folded to their base letters.
     *
     * @param text input string, may be {@code null}
     * @return the folded string with leading and trailing whitespace trimmed,
     *         or {@code null} when {@code text} is {@code null}
     */
    public static String normalize(String text) {
        if (text == null) {
            return null;
        }

        StringBuilder folded = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            // Drop harakat, shadda, sukun, combining hamza/madda and similar marks.
            if (!isArabicDiacritic(ch)) {
                folded.append(foldLetter(ch));
            }
        }

        // Trim first, then strip kashida: same order as the previous implementation.
        return folded.toString().trim().replace(KASHIDA, "");
    }

    /** True for non-spacing marks located in the Arabic Unicode blocks. */
    private static boolean isArabicDiacritic(char ch) {
        return Character.getType(ch) == Character.NON_SPACING_MARK && isInArabicBlock(ch);
    }

    /** True for Arabic (U+0600-06FF), Arabic Supplement (U+0750-077F) and Arabic Extended-A (U+08A0-08FF). */
    private static boolean isInArabicBlock(char ch) {
        return (ch >= 0x0600 && ch <= 0x06FF)
                || (ch >= 0x0750 && ch <= 0x077F)
                || (ch >= 0x08A0 && ch <= 0x08FF);
    }

    /** Maps a precomposed or variant Arabic letter to its base letter; other characters are returned as is. */
    private static char foldLetter(char ch) {
        switch (ch) {
            case '\u0622': // alef with madda above
            case '\u0623': // alef with hamza above
            case '\u0625': // alef with hamza below
                return '\u0627'; // alef
            case '\u0624': // waw with hamza above
                return '\u0648'; // waw
            case '\u0626': // yeh with hamza above
            case '\u0649': // alef maksura
                return '\u064A'; // yeh
            case '\u0629': // teh marbuta
                return '\u0647'; // heh
            // The next three were previously folded implicitly by NFD decomposition.
            case '\u06C0': // heh with yeh above
                return '\u06D5'; // ae
            case '\u06C2': // heh goal with hamza above
                return '\u06C1'; // heh goal
            case '\u06D3': // yeh barree with hamza above
                return '\u06D2'; // yeh barree
            default:
                return ch;
        }
    }

}

0 comments on commit 40e4d56

Please sign in to comment.