Skip to content

Commit

Permalink
Merge branch 'master' of github.com:OpenSextant/Xponents
Browse files Browse the repository at this point in the history
Merge 3.6.6 back to python working
* 'master' of github.com:OpenSextant/Xponents:
  URL pattern is host:port, so caller must add path...
  Sonatype Nexus staging compat
  year
  Maven: removed checkstyle requirement; updated Maven plugins
  Maven: removed checkstyle requirement; updated Maven plugins
  Rules - final cleanup on rules and optimizing where abbreviation detection occurs.
  PlaceGeocoder: Testing;  add flag to emit lower case and codes
  CORE: TextUtils now has abbreviation detector
  Version updates, reformatting.
  reformat docs
  Py3.11 run of pydoc
  Java17 run of API docs
  logging patches
  Testing resources
  java17 javadoc doclint
  3.6.6: noise handling in various language modes
  Python: FlexPat performance tuning -- flag omissions to avoid long repetitive loops with pattern matching
  UTIL: detect unicode in Arabic or Hebrew easily
  ver 3.6.6 bump -- address some snyk dependency patches
  • Loading branch information
mubaldino committed Jan 17, 2024
2 parents 4016aa9 + 9e5251d commit ef19464
Show file tree
Hide file tree
Showing 490 changed files with 67,385 additions and 112,286 deletions.
26 changes: 10 additions & 16 deletions Core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.opensextant</groupId>
<artifactId>opensextant-xponents-core</artifactId>
<version>3.6.5</version>
<version>3.6.6</version>
<packaging>jar</packaging>
<name>OpenSextant Xponents Core API</name>
<description>An information extraction toolkit focused on geography and temporal entities</description>
Expand Down Expand Up @@ -34,9 +34,8 @@
</developer>
</developers>
<properties>
<checkstyle.skip>true</checkstyle.skip>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<slf4j.version>2.0.6</slf4j.version>
<slf4j.version>2.0.9</slf4j.version>
</properties>
<!-- OSS Sonatype instructions: list repositories -->
<distributionManagement>
Expand Down Expand Up @@ -83,7 +82,7 @@
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.4.4</version>
<version>1.4.14</version>
<scope>test</scope>
</dependency>
</dependencies>
Expand Down Expand Up @@ -203,12 +202,12 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.10.1</version>
<version>3.12.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>3.4.0</version>
<version>3.6.3</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
Expand All @@ -218,12 +217,12 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
<version>3.0.0</version>
<version>3.1.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.2.2</version>
<version>3.3.0</version>
<executions>
<execution>
<id>attach-tests</id>
Expand All @@ -233,20 +232,15 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-checkstyle-plugin</artifactId>
<version>3.1.2</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.0.0-M6</version>
<version>3.2.3</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>3.0.0-M5</version>
<version>3.0.1</version>
</plugin>
<plugin>
<groupId>org.sonarsource.scanner.maven</groupId>
Expand Down Expand Up @@ -411,7 +405,7 @@
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.8</version>
<version>1.6.13</version>
<extensions>true</extensions>
<configuration>
<serverId>ossrh</serverId>
Expand Down
104 changes: 100 additions & 4 deletions Core/src/main/java/org/opensextant/util/TextUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ public static boolean hasIrregularPunctuation(String t) {
public static int countIrregularPunctuation(String t) {
int count = 0;
Matcher m = commonPunct.matcher(t);
while(m.find()){
while (m.find()) {
++count;
}
return count;
Expand Down Expand Up @@ -129,6 +129,31 @@ public static final boolean isLatin(String data) {
return isLatin;
}

/**
 * Reports whether the text contains at least one Arabic-script or Hebrew letter.
 * Scanning stops at the first match. ASCII characters and anything that is not a
 * letter (digits, punctuation, whitespace, combining marks) are skipped.
 *
 * <p>Recognized Unicode blocks: Arabic, Arabic Supplement, Arabic Extended-A,
 * Arabic Presentation Forms-A, Arabic Presentation Forms-B and Hebrew.
 * Other "Middle Eastern" scripts are not detected yet.
 *
 * @param data text to inspect; null is treated like empty text
 * @return true if any letter belongs to one of the recognized blocks, false otherwise
 */
public static final boolean hasMiddleEasternText(String data) {
    if (data == null) {
        return false;
    }
    for (int i = 0; i < data.length(); ++i) {
        char c = data.charAt(i);
        // Non-letters and ASCII do not count.
        if (isASCII(c) || !Character.isLetter(c)) {
            continue;
        }

        Character.UnicodeBlock blk = Character.UnicodeBlock.of(c);
        if (blk == Character.UnicodeBlock.ARABIC
                || blk == Character.UnicodeBlock.ARABIC_SUPPLEMENT
                || blk == Character.UnicodeBlock.ARABIC_EXTENDED_A
                // Presentation forms show up in text extracted from rendered documents.
                || blk == Character.UnicodeBlock.ARABIC_PRESENTATION_FORMS_A
                || blk == Character.UnicodeBlock.ARABIC_PRESENTATION_FORMS_B
                || blk == Character.UnicodeBlock.HEBREW) {
            return true;
        }
    }
    return false;
}

/**
* Helpful hints on parsing Unicode phrases. Reference:
* http://www.rgagnon.com/javadetails/java-0456.html
Expand Down Expand Up @@ -448,14 +473,14 @@ public static final boolean isNumeric(final String v) {
}

char ch0 = v.charAt(0);
if (!(Character.isDigit(ch0) || ch0 == '.' || ch0 == '-' || ch0 == '+' )){
if (!(Character.isDigit(ch0) || ch0 == '.' || ch0 == '-' || ch0 == '+')) {
return false;
}
for (char ch : v.toCharArray()) {
/*
* Is the character in .-+Ee or SPACE?
*/
if (ch == '.' || ch == ',' || ch == '-' || ch == '+' || ch == 'e' || ch == 'E' || ch==' ') {
if (ch == '.' || ch == ',' || ch == '-' || ch == '+' || ch == 'e' || ch == 'E' || ch == ' ') {
continue;
}
if (!Character.isDigit(ch)) {
Expand Down Expand Up @@ -766,6 +791,7 @@ public static String b2hex(byte[] barr) {
* @return hash for the data
* @deprecated not MD5 specific. Use #b2hex() instead
*/
@Deprecated
public static String md5_id(byte[] digest) {
// Thanks to javacream:
// create hex string from the 16-byte hash
Expand Down Expand Up @@ -1009,6 +1035,76 @@ public static String normalizeAbbreviation(String word) {
return word.replace(".", "");
}

/**
 * Case-sensitive abbreviation check; shorthand for
 * {@code isAbbreviation(txt, true)}.
 *
 * @see #isAbbreviation(String, boolean)
 * @param txt candidate phrase
 * @return the result of the two-argument check with case sensitivity enabled
 */
public static boolean isAbbreviation(String txt) {
    final boolean caseSensitive = true;
    return isAbbreviation(txt, caseSensitive);
}

/** Longest trimmed text, in characters, that {@link #isAbbreviation(String, boolean)} will accept. */
public static final int ABBREV_MAX_LEN = 15;

/**
 * Decides whether a short phrase looks like a dotted abbreviation or acronym.
 * Examples:
 * <pre>
 *   A.B.    okay (minimum form)
 *   A.b.    okay
 *   A. b.   okay
 *   A.9.    okay (digits are allowed after the first letter)
 *   A.b     not okay (does not end with a period)
 *   .       not okay (does not start with a letter)
 * </pre>
 *
 * Rules, applied to the trimmed text:
 * <ul>
 * <li>Starts with a letter; that letter must be upper case when {@code useCase} is true.</li>
 * <li>Ends with a period.</li>
 * <li>At most {@link #ABBREV_MAX_LEN} characters long, interior spaces included.</li>
 * <li>Contains only ASCII letters, ASCII digits, periods and spaces.</li>
 * <li>If there are spaces, there are at least two periods, and never more spaces than periods.</li>
 * </ul>
 *
 * @param orig    candidate phrase; null or blank yields false
 * @param useCase true to require an upper case first letter
 * @return true if the phrase satisfies all of the rules above
 */
public static boolean isAbbreviation(String orig, boolean useCase) {
    if (orig == null) {
        return false;
    }
    String txt = orig.trim();
    int len = txt.length();
    if (len == 0 || len > ABBREV_MAX_LEN) {
        return false;
    }

    char first = txt.charAt(0);
    // A leading letter is required in both modes; previously "." or "1.2."
    // slipped through when case checking was turned off.
    if (!Character.isLetter(first)) {
        return false;
    }
    if (useCase && !Character.isUpperCase(first)) {
        return false;
    }
    if (txt.charAt(len - 1) != '.') {
        return false;
    }

    // Have to iterate through all chars
    int periods = 0;
    int spaces = 0;
    for (int i = 0; i < len; ++i) {
        char c = txt.charAt(i);
        if (!isASCII(c)) {
            return false;
        }
        if (c == '.') {
            ++periods;
        } else if (c == ' ') {
            ++spaces;
        } else if (!(Character.isLetter(c) || Character.isDigit(c))) {
            // Phrase contains other than A-Z, 0-9, . and SP
            return false;
        }
    }

    // "Sent End." style fragments: spaces with a single period, or more gaps than periods.
    if ((0 < spaces && periods < 2) || periods < spaces) {
        return false;
    }
    return true;
}

/**
* Supports Phoneticizer utility from OpenSextant v1.x Remove diacritics from a
* phrase
Expand Down Expand Up @@ -1689,7 +1785,7 @@ public static int countNonText(final String t) {

int nonText = 0;
for (char c : t.toCharArray()) {
if (!Character.isLetter(c) && Character.isDigit(c) && Character.isWhitespace(c)) {
if (!Character.isLetter(c) && !Character.isDigit(c)) {
++nonText;
}
}
Expand Down
43 changes: 38 additions & 5 deletions Core/src/test/java/TestTextUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,13 @@ public void testDigests() {
String test = "a Ö ø Ø é å Å 杨寨 5 ! ē M ē ā";
String test_id = TextUtils.text_id(test);
assertEquals("35efe2bea1868a02530b012180d2f7e6f949040b", test_id);
} catch (Exception err){
fail("Algs? "+err.getMessage());
} catch (Exception err) {
fail("Algs? " + err.getMessage());
}
}

@Test
public void testPunct(){
public void testPunct() {
String testText = "Eat at Bob\"s | Country Bunker";
assertTrue(TextUtils.hasIrregularPunctuation(testText));
assertEquals(2, TextUtils.countIrregularPunctuation(testText));
Expand Down Expand Up @@ -107,7 +107,7 @@ public void testScriptDetection() {
assertTrue(TextUtils.isLatin("O a b c d O"));

String t2 = Unimap.replaceDiacritics(t_original);
if (!t2.equals(t_remapped)){
if (!t2.equals(t_remapped)) {
fail("Diacritics not replaced!");
}
}
Expand All @@ -120,6 +120,13 @@ public void testLanguageCodes() {
assertEquals("French", TextUtils.getLanguage("FRENCH").getName());
}

// Checks TextUtils.hasMiddleEasternText on mixed Arabic-script/ASCII text, on pointed Hebrew text,
// and on ASCII-only text.
@Test
public void testMidEastLanguages() {
// Arabic-script letters mixed with a zero-width non-joiner escape, ASCII letters and digits.
assertTrue(TextUtils.hasMiddleEasternText("تشییع پیکر سردار شهید سید رض\u200Cالسلام آغازABC 111 "));
// Hebrew letters carrying vowel points.
assertTrue(TextUtils.hasMiddleEasternText("עִבְרִית"));
// ASCII digits and letters only.
assertFalse(TextUtils.hasMiddleEasternText("1 2 3 4 Z Y X "));
}

@Test
public void testCase() {

Expand Down Expand Up @@ -179,7 +186,12 @@ public void testCase() {
}

@Test
public void testNumerics(){
public void testCharCounting() {
    // Expect exactly one non-text character: the single space between the two words.
    final int nonText = TextUtils.countNonText("bob bob");
    assertEquals(1, nonText);
}

@Test
public void testNumerics() {
// Valid number patterns.
assertTrue(TextUtils.isNumeric("5.67E2"));
assertTrue(TextUtils.isNumeric("+5.67E2"));
Expand All @@ -190,4 +202,25 @@ public void testNumerics(){
assertFalse(TextUtils.isNumeric("E5.672"));
assertFalse(TextUtils.isNumeric("abcdef"));
}

// Exercises TextUtils.isAbbreviation(String), the case-sensitive form.
@Test
public void testAbbreviations() {
    assertTrue(TextUtils.isAbbreviation("A.B."));
    assertTrue(TextUtils.isAbbreviation("A. B."));
    // Surrounding whitespace is trimmed before the checks.
    assertTrue(TextUtils.isAbbreviation("A. B. "));
    assertTrue(TextUtils.isAbbreviation("A.b."));
    assertTrue(TextUtils.isAbbreviation("A.9."));
    assertTrue(TextUtils.isAbbreviation("Bs.As."));
    assertTrue(TextUtils.isAbbreviation("Miss."));
    assertTrue(TextUtils.isAbbreviation("Ms."));

    // NOT Abbreviations:
    // Three interior spaces but only two periods: more spaces than periods is rejected.
    // (This case previously repeated the exact literal asserted true above.)
    assertFalse(TextUtils.isAbbreviation("A.   B. "));
    assertFalse(TextUtils.isAbbreviation("Sent End. New Sent"));
    assertFalse(TextUtils.isAbbreviation("Sent End. New Sent or Long Sentence"));
    assertFalse(TextUtils.isAbbreviation("1.2.23"));
    assertFalse(TextUtils.isAbbreviation("A.B"));
    assertFalse(TextUtils.isAbbreviation("A.@."));
    assertFalse(TextUtils.isAbbreviation("$1.000R"));
}
}
12 changes: 6 additions & 6 deletions Examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
<artifactId>opensextant-xponents-examples</artifactId>
<properties>
<checkstyle.skip>true</checkstyle.skip>
<slf4j.version>2.0.6</slf4j.version>
<xponents.version>3.6.5</xponents.version>
<xponents-core.version>3.6.5</xponents-core.version>
<xtext.version>3.6.5</xtext.version>
<slf4j.version>2.0.9</slf4j.version>
<xponents.version>3.6.6</xponents.version>
<xponents-core.version>3.6.6</xponents-core.version>
<xtext.version>3.6.6</xtext.version>
</properties>
<dependencies>
<dependency>
Expand Down Expand Up @@ -77,7 +77,7 @@
<dependency>
<groupId>org.apache.groovy</groupId>
<artifactId>groovy</artifactId>
<version>4.0.11</version>
<version>4.0.17</version>
<scope>runtime</scope>
</dependency>
<dependency>
Expand All @@ -99,7 +99,7 @@
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.4.4</version>
<version>1.4.14</version>
<scope>runtime</scope>
</dependency>
</dependencies>
Expand Down
2 changes: 1 addition & 1 deletion NOTICE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@


Copyright 2013-2021 MITRE Corporation. All Rights Reserved.
Copyright 2013-2023 MITRE Corporation. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
Loading

0 comments on commit ef19464

Please sign in to comment.