Skip to content

Commit

Permalink
Really tramping down on miscellaneous codes, etc. noise. mostly from …
Browse files Browse the repository at this point in the history
…POSTAL.
mubaldino committed May 13, 2024
1 parent 448deab commit 66a37d9
Showing 8 changed files with 63 additions and 29 deletions.
17 changes: 16 additions & 1 deletion src/main/java/org/opensextant/extractors/geo/PlaceCandidate.java
Original file line number Diff line number Diff line change
@@ -177,6 +177,17 @@ public Collection<PlaceCandidate> getRelated() {
return related.values();
}

/**
 * Reports whether any related candidate has a chosen place.
 *
 * @return true if {@code related} is non-null and at least one of its values
 *         returns a non-null result from {@code getChosenPlace()}; false otherwise
 */
public boolean hasResolvedRelated() {
    // No related candidates were ever recorded.
    if (related == null) {
        return false;
    }
    for (PlaceCandidate other : related.values()) {
        if (other.getChosenPlace() != null) {
            return true;
        }
    }
    return false;
}

// ---- the getters and setters ---------
//

@@ -928,6 +939,10 @@ public void linkGeography(String slot, Place geo) {
linkedGeography.put(slot, geo);
}

/**
 * Reports whether any geography has been linked to this candidate, in any slot.
 *
 * @return true when {@code linkedGeography} is non-null and not empty
 */
public boolean hasLinkedGeography() {
    if (linkedGeography == null) {
        return false;
    }
    return !linkedGeography.isEmpty();
}

public boolean hasLinkedGeography(String slot) {
if (linkedGeography == null) {
return false;
@@ -1016,7 +1031,7 @@ public boolean hasPostal() {
}
if (geo.getPlace().isPostal()) {
hasPostal = true;
return hasPostal;
return true;
}
}
return hasPostal;
Original file line number Diff line number Diff line change
@@ -37,7 +37,7 @@ public void evaluate(List<PlaceCandidate> names) {

for (PlaceCandidate name : names) {
// We do not want mixed case acronym/code/abbreviation matches.
if (name.isCountry){
if (name.isCountry) {
if (!name.isUpper() && name.getLength() < 4 && !(name.hasCJKText() || name.hasMiddleEasternText())) {
// Just looking at country codes -- we'll only consider upper case codes if they are short.
name.setFilteredOut(true); /* TODO: possibly leave as filtered-in */
Original file line number Diff line number Diff line change
@@ -49,7 +49,7 @@ public class MajorPlaceRule extends GeocodeRule {
public static final String ADMIN = "MajorPlace.Admin";
public static final String POP = "MajorPlace.Population";
public static final String MENTIONED_COUNTRY = "MajorPlace.InCountry";
private Map<String, Integer> popStats;
private final Map<String, Integer> popStats;
private static final int GEOHASH_RESOLUTION = 5;
private static final int POP_MIN = 50000;

@@ -168,7 +168,7 @@ public void evaluate(final PlaceCandidate name, final Place geo) {
// IFF no countries are mentioned, Capitals are good proxies for country.
inferCountry(geo);
ev = new PlaceEvidence(geo, CAPITAL, weight + 2.0);
} else if (geo.isAdmin1()) {
} else if (geo.isAdmin1() && (!geo.isCode() || name.hasLinkedGeography())) {
ev = new PlaceEvidence(geo, ADMIN, weight);
inferBoundary(name.getNDTextnorm(), geo);
} else if (popStats != null && geo.isPopulated()) {
Original file line number Diff line number Diff line change
@@ -27,13 +27,12 @@
import org.opensextant.util.GeonamesUtility;
import static org.opensextant.extractors.geo.rules.RuleTool.hasOnlyDefaultRules;

// TODO: expand from pairs to 2-4 tuples of related geographic hierarchy, e.g., City, State, Country, etc.

/**
* A rule that associates a CODE with a NAME, when the pattern
* "NAME, CODE" appears within N characters of each other.
* If CODE.adm1 == NAME.adm1 and CODE is an ADM1 boundary, then flag this as
* significant.
*
* TODO: expand from pairs to 2-4 tuples of related geographic hierarchy, e.g., City, State, Country, etc.
* If CODE.adm1 == NAME.adm1 and CODE is an ADM1 boundary, then flag this as significant.
* @author ubaldino
*/
public class NameCodeRule extends GeocodeRule {
@@ -138,8 +137,9 @@ private PairValidation validMatch(PlaceCandidate nm) {

return validation;
}
private boolean canIgnore(PlaceCandidate mention){
if (ignoreShortLowercase(mention)){

private boolean canIgnore(PlaceCandidate mention) {
if (ignoreShortLowercase(mention)) {
return true;
}
// Remarked mention
@@ -334,7 +334,7 @@ public void evaluate(final List<PlaceCandidate> names) {
if (name.isFilteredOut()) {
continue;
}
if (name.hasCJKText()){
if (name.hasCJKText()) {
continue;
}

@@ -561,7 +561,7 @@ private void updateNameCodePair(PlaceCandidate n, PlaceCandidate code, Place cod
//
for (ScoredPlace nameGeoScore : n.getPlaces()) {
Place nameGeo = nameGeoScore.getPlace();
if (nameGeo.isSame(codeGeo)) {
if (nameGeo.getFeatureCode().equals(codeGeo.getFeatureCode()) || nameGeo.isSame(codeGeo)) {
continue; /* Ignore choosing same location for repeated names */
}
if (!(nameGeo.isPopulated() || nameGeo.isAdministrative() || nameGeo.isSpot())) {
Original file line number Diff line number Diff line change
@@ -71,7 +71,7 @@ public static boolean isValidAbbreviation(String s) {
@Override
public void evaluate(List<PlaceCandidate> names) {
for (PlaceCandidate p : names) {
if (p.isValid() || p.getTokens() == null) {
if (p.isValid() || p.getTokens() == null || p.isFilteredOut()) {
// isValid: this place was marked by other rules as valid
// tokens: in general trivial geo name references (continents) are not analyzed
// and tokens may be null.
@@ -85,7 +85,7 @@ public void evaluate(List<PlaceCandidate> names) {
continue;
}

if (p.hasMiddleEasternText() || p.hasCJKText()){
if (p.hasMiddleEasternText() || p.hasCJKText()) {
continue;
}

@@ -162,7 +162,6 @@ public void evaluate(List<PlaceCandidate> names) {


/** Names of places should have a ratio of about N=5 chars to non-chars.
*
* "A BC" 3:1 filtered out.
* "AB CD" 4:1 filtered out.
* "AB BCD" 5:1 possibly acceptable.
@@ -186,7 +185,7 @@ public static boolean assessPhraseDensity(TextMatch p) {
*/
public static boolean assessPhraseDensity(String name, int charRatio) {
int nonAlpha = TextUtils.countNonText(name);
if (nonAlpha==0){
if (nonAlpha == 0) {
return true;
}
return ((name.length() - nonAlpha) / nonAlpha) >= charRatio;
Original file line number Diff line number Diff line change
@@ -58,7 +58,7 @@ public static boolean complementaryPostal(final Place geo1, final Place geo2) {
private boolean exceedsInnerSpanPunctuation(String buf, PlaceCandidate p1, PlaceCandidate p2) {
int x1 = p1.end < p2.start ? p1.end : p2.end;
int x2 = p1.end < p2.start ? p2.start : p1.start;
if (x1 >=x2){
if (x1 >= x2) {
return false;
}
String span = buf.substring(x1, x2);
12 changes: 0 additions & 12 deletions src/main/java/org/opensextant/output/Transforms.java
Original file line number Diff line number Diff line change
@@ -18,7 +18,6 @@
import org.opensextant.extractors.xtemporal.DateMatch;
import org.opensextant.processing.Parameters;
import org.opensextant.util.GeodeticUtility;
import org.opensextant.util.GeonamesUtility;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -418,17 +417,6 @@ public static JsonObject toJSON(final List<TextMatch> matches, final Parameters
node.put("method", resolvedPlace.getMethod());
} else if (jobParams.tag_places || jobParams.tag_postal) {

// IF Caller is not asking for "codes" output....
if (!jobParams.tag_codes) {
boolean qualified = place.isDerived() || place.isValid();
// Filter out non-Postal codes if user is not requesting "codes" to be listed.
if (resolvedPlace.isCode() && !qualified && !GeonamesUtility.isPostal(resolvedPlace)) {
/* Given a bare token ' MA ' not attached to another term,
* this would be considered just a code. Caller must add "codes" to request to get these.
*/
continue;
}
}
/*
* Conf = 20 or greater to be geocoded.
*/
Original file line number Diff line number Diff line change
@@ -5,15 +5,18 @@

import jodd.json.JsonObject;
import org.json.JSONException;
import org.opensextant.data.Place;
import org.opensextant.data.TextInput;
import org.opensextant.extraction.Extractor;
import org.opensextant.extraction.TextMatch;
import org.opensextant.extractors.geo.PlaceCandidate;
import org.opensextant.extractors.geo.PlaceGeocoder;
import org.opensextant.extractors.geo.PostalGeocoder;
import org.opensextant.extractors.xtemporal.XTemporal;
import org.opensextant.output.Transforms;
import org.opensextant.processing.Parameters;
import org.opensextant.processing.RuntimeTools;
import org.opensextant.util.GeonamesUtility;
import org.opensextant.xlayer.server.TaggerResource;
import org.restlet.Context;
import org.restlet.data.CharacterSet;
@@ -177,6 +180,7 @@ public Representation process(TextInput input, Parameters jobParams) {
if (isDebug()) {
debug(String.format("CURRENT MEM USAGE(K)=%d", RuntimeTools.reportMemory()));
}
filter(matches, jobParams);
/*
* transform matches as JSON output.
*/
@@ -188,6 +192,34 @@ public Representation process(TextInput input, Parameters jobParams) {
}
}

/**
 * Marks noisy "code" matches as filtered out before the matches are transformed to output.
 *
 * Only one condition is handled currently: when {@code jobParams.tag_codes} is false,
 * a {@link PlaceCandidate} whose chosen place is a code is filtered out if either
 * <ul>
 * <li>it is neither derived nor valid, and its chosen place is not postal
 *     (a bare CODE; although resolved, it is likely noise), or</li>
 * <li>it is a short name with no resolved related candidate
 *     (e.g., "AB CD" or "MA VA", where CODE1 resolved but the related CODE2 did not).</li>
 * </ul>
 * Matches are flagged in place via {@code setFilteredOut(true)}; nothing is removed from the list.
 *
 * @param matches   matches to inspect
 * @param jobParams request parameters; only {@code tag_codes} is consulted here
 */
private void filter(List<TextMatch> matches, Parameters jobParams) {
    for (TextMatch match : matches) {
        // Caller asked for codes to be listed, so nothing is suppressed.
        if (jobParams.tag_codes) {
            continue;
        }
        if (!(match instanceof PlaceCandidate)) {
            continue;
        }
        PlaceCandidate candidate = (PlaceCandidate) match;
        Place chosen = candidate.getChosenPlace();
        // Only inferred places that are codes are evaluated.
        if (chosen == null || !chosen.isCode()) {
            continue;
        }

        boolean qualified = candidate.isDerived() || candidate.isValid();
        // Non-postal code that no other evidence qualified.
        boolean unqualifiedNonPostal = !qualified && !GeonamesUtility.isPostal(chosen);
        if (unqualifiedNonPostal || (candidate.isShortName() && !candidate.hasResolvedRelated())) {
            candidate.setFilteredOut(true);
        }
    }
}

/**
* Format matches as JSON
*

0 comments on commit 66a37d9

Please sign in to comment.