From b0c6f3a48f115d4f08babd490345a74b63d2380d Mon Sep 17 00:00:00 2001 From: Martin Wiesner Date: Thu, 11 Jul 2024 09:19:27 +0200 Subject: [PATCH] Reduce compile and runtime dependencies in Similarity Component (#157) Reduce compile and runtime dependency in Similarity Component --- opennlp-similarity/pom.xml | 992 +++++++++--------- .../MachineTranslationWrapper.java | 22 +- .../tools/doc_classifier/DocClassifier.java | 43 +- ...sifierTrainingSetMultilingualExtender.java | 3 +- .../DocClassifierTrainingSetVerifier.java | 18 +- .../tools/textsimilarity/ParseTreeChunk.java | 3 +- 6 files changed, 511 insertions(+), 570 deletions(-) diff --git a/opennlp-similarity/pom.xml b/opennlp-similarity/pom.xml index bb8aa6eb..5f3029de 100644 --- a/opennlp-similarity/pom.xml +++ b/opennlp-similarity/pom.xml @@ -12,41 +12,41 @@ language governing permissions and limitations under the License. --> - 4.0.0 - - org.apache.opennlp - opennlp-sandbox - 2.3.4-SNAPSHOT - + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + 4.0.0 + + org.apache.opennlp + opennlp-sandbox + 2.3.4-SNAPSHOT + - opennlp-similarity - 2.3.4-SNAPSHOT - jar + opennlp-similarity + 2.3.4-SNAPSHOT + jar - Apache OpenNLP Tool Similarity distribution - - - 1.0.0-M2.1 - 1.14.3-1.5.10 - 1.5.10 - 0.3.26-1.5.10 - + Apache OpenNLP Similarity distribution - - - central - Maven Central Repository - https://repo1.maven.org/maven2 - - - billylieurance-net - https://www.billylieurance.net/maven2 - - false - - - + + 1.0.0-M2.1 + 1.14.3-1.5.10 + 1.5.10 + 0.3.26-1.5.10 + + + + + central + Maven Central Repository + https://repo1.maven.org/maven2 + + + billylieurance-net + https://www.billylieurance.net/maven2 + + false + + + @@ -84,501 +84,467 @@ - - - org.apache.opennlp - opennlp-tools - - - - org.slf4j - slf4j-api - - - - org.apache.logging.log4j - log4j-api - test - - - org.apache.logging.log4j - log4j-core - test - - - org.apache.logging.log4j - log4j-slf4j2-impl - test - + + + org.apache.opennlp + opennlp-tools + - - org.junit.jupiter - junit-jupiter-api - + + org.slf4j + slf4j-api + - - org.junit.jupiter - junit-jupiter-engine - + + commons-lang + commons-lang + + + commons-codec + commons-codec + + + commons-collections + commons-collections + + + org.apache.commons + commons-math3 + + + org.json + json + 20240303 + + + org.apache.tika + tika-app + 2.9.2 + + + net.sf.opencsv + opencsv + 2.3 + - - org.junit.jupiter - junit-jupiter-params - + + org.apache.solr + solr-core + 8.11.3 + + + org.apache.hadoop + * + + + org.eclipse.jetty + * + + + org.eclipse.jetty.http2 + * + + + - - commons-lang - commons-lang - - - commons-codec - commons-codec - - - commons-logging - commons-logging - - - commons-collections - commons-collections - - - org.apache.commons - commons-math3 - + + javax.mail + mail + 1.4.7 + + + com.restfb + restfb + 1.49.0 + - - org.json - json - 20240303 - - - org.apache.tika - tika-app - 2.9.2 - - - net.sf.opencsv - opencsv - 2.3 - + + net.billylieurance.azuresearch + azure-bing-search-java + 0.13.0 + - - org.apache.solr - solr-core - 8.11.3 - - - - org.apache.httpcomponents - httpclient - - - org.apache.httpcomponents - httpclient-cache - - - org.apache.httpcomponents - httpcore - - - org.apache.httpcomponents - httpmime - - - org.apache.httpcomponents - fluent-hc - + + edu.mit + jverbnet + 1.2.0.1 + + + ch.qos.logback + logback-core + + + ch.qos.logback + logback-classic + + + + org.slf4j + log4j-over-slf4j + + + - - org.jgrapht - jgrapht-jdk1.5 - 0.7.3 - - - de.jollyday - jollyday - 0.5.10 - - - jgraph - jgraph - 5.13.0.0 - - - javax.mail - mail - 1.4.7 - - - com.restfb - restfb - 1.49.0 - - - com.memetix - microsoft-translator-java-api - 0.6.2 - + + org.docx4j + docx4j + 6.1.2 + + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + - - net.billylieurance.azuresearch - azure-bing-search-java - 0.13.0 - + + org.deeplearning4j + deeplearning4j-core + ${dl4j.version} + + + + org.bytedeco + openblas-platform + + + org.bytedeco + hdf5-platform + + + + org.datavec + datavec-data-image + + + + + org.deeplearning4j + deeplearning4j-ui + ${dl4j.version} + + + org.deeplearning4j + deeplearning4j-nlp + ${dl4j.version} + - - edu.mit - jverbnet - 1.2.0.1 - - - ch.qos.logback - logback-core - - - ch.qos.logback - logback-classic - - - - org.slf4j - log4j-over-slf4j - - - - - - org.docx4j - docx4j - 6.1.2 - - - - org.slf4j - slf4j-log4j12 - - - log4j - log4j - - - + + org.bytedeco + openblas + ${openblas.version} + + + org.bytedeco + javacpp + ${javacpp.version} + - - org.deeplearning4j - deeplearning4j-core - ${dl4j.version} - - - - org.bytedeco - openblas-platform - - - org.bytedeco - hdf5-platform - - - - org.datavec - datavec-data-image - - - - - org.deeplearning4j - deeplearning4j-ui - ${dl4j.version} - - - org.deeplearning4j - deeplearning4j-nlp - ${dl4j.version} - + + + org.junit.jupiter + junit-jupiter-api + + + org.junit.jupiter + junit-jupiter-engine + + + org.junit.jupiter + junit-jupiter-params + + + org.apache.logging.log4j + log4j-api + test + + + org.apache.logging.log4j + log4j-core + test + + + org.apache.logging.log4j + log4j-slf4j2-impl + test + + - - org.bytedeco - openblas - ${openblas.version} - - - org.bytedeco - javacpp - ${javacpp.version} - - + + + platform-win-x64 + + + Windows + x64 + + + + + org.bytedeco + javacpp + ${javacpp.version} + windows-x86_64 + runtime + + + org.bytedeco + openblas + ${openblas.version} + windows-x86_64 + runtime + + + org.bytedeco + hdf5 + ${hdf5.version} + windows-x86_64 + runtime + + + + + platform-win-x86 + + + Windows + x86 + + + + + org.bytedeco + javacpp + ${javacpp.version} + windows-x86 + runtime + + + org.bytedeco + openblas + ${openblas.version} + windows-x86 + runtime + + + org.bytedeco + hdf5 + ${hdf5.version} + windows-x86 + runtime + + + + + platform-linux-x64 + + + unix + Linux + amd64 + + + + + org.bytedeco + javacpp + ${javacpp.version} + linux-x86_64 + runtime + + + org.bytedeco + openblas + ${openblas.version} + linux-x86_64 + runtime + + + org.bytedeco + hdf5 + ${hdf5.version} + linux-x86_64 + runtime + + + + + platform-macosx-x64 + + + Mac + x64 + + + + + org.bytedeco + javacpp + ${javacpp.version} + macosx-x86_64 + runtime + + + org.bytedeco + openblas + ${openblas.version} + macosx-x86_64 + runtime + + + org.bytedeco + hdf5 + ${hdf5.version} + macosx-x86_64 + runtime + + + + + platform-macosx-aarch64 + + + mac + aarch64 + + + + + org.bytedeco + javacpp + ${javacpp.version} + macosx-arm64 + runtime + + + org.bytedeco + openblas + ${openblas.version} + macosx-arm64 + runtime + + + + + + - - - platform-win-x64 - - - Windows - x64 - - - - - org.bytedeco - javacpp - ${javacpp.version} - windows-x86_64 - runtime - - - org.bytedeco - openblas - ${openblas.version} - windows-x86_64 - runtime - - - org.bytedeco - hdf5 - ${hdf5.version} - windows-x86_64 - runtime - - - - - platform-win-x86 - - - Windows - x86 - - - - - org.bytedeco - javacpp - ${javacpp.version} - windows-x86 - runtime - - - org.bytedeco - openblas - ${openblas.version} - windows-x86 - runtime - - - org.bytedeco - hdf5 - ${hdf5.version} - windows-x86 - runtime - - - - - platform-linux-x64 - - - unix - Linux - amd64 - - - - - org.bytedeco - javacpp - ${javacpp.version} - linux-x86_64 - runtime - - - org.bytedeco - openblas - ${openblas.version} - linux-x86_64 - runtime - - - org.bytedeco - hdf5 - ${hdf5.version} - linux-x86_64 - runtime - - - - - platform-macosx-x64 - - - Mac - x64 - - - - - org.bytedeco - javacpp - ${javacpp.version} - macosx-x86_64 - runtime - - - org.bytedeco - openblas - ${openblas.version} - macosx-x86_64 - runtime - - - org.bytedeco - hdf5 - ${hdf5.version} - macosx-x86_64 - runtime - - - - - platform-macosx-aarch64 - - - mac - aarch64 - - - - - org.bytedeco - javacpp - ${javacpp.version} - macosx-arm64 - runtime - - - org.bytedeco - openblas - ${openblas.version} - macosx-arm64 - runtime - - - - - - + + + + org.apache.maven.plugins + maven-compiler-plugin + + ${maven.compiler.source} + ${maven.compiler.target} + -Xlint + + - - - - org.apache.maven.plugins - maven-compiler-plugin - - ${maven.compiler.source} - ${maven.compiler.target} - -Xlint - - + + org.apache.maven.plugins + maven-surefire-plugin + + -Xmx2048m -Dfile.encoding=UTF-8 + ${opennlp.forkCount} + false + false + + **/*IT.java + + + - - org.apache.maven.plugins - maven-surefire-plugin - - -Xmx2048m -Dfile.encoding=UTF-8 - ${opennlp.forkCount} - false - false - - **/*IT.java - - - + + maven-source-plugin + + + create-source-jar + + jar + + package + + + - - maven-source-plugin - - - create-source-jar - - jar - - package - - - - - - maven-antrun-plugin - - - generate checksums for binary artifacts - - run - - verify - - - - - - - - - - - - - - - - - - - - - maven-assembly-plugin - - - src - - single - - package - - - src/main/assembly/assembly.xml - - - - - source-release-assembly - - true - forked-path - - - - + + maven-antrun-plugin + + + generate checksums for binary artifacts + + run + + verify + + + + + + + + + + + + + + + + + + + + + maven-assembly-plugin + + + src + + single + + package + + + src/main/assembly/assembly.xml + + + + + source-release-assembly + + true + forked-path + + + + - - org.sonatype.plugins - nexus-staging-maven-plugin - 1.7.0 - true - - ossrh - https://oss.sonatype.org/ - true - - - - + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.7.0 + true + + ossrh + https://oss.sonatype.org/ + true + + + + \ No newline at end of file diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MachineTranslationWrapper.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MachineTranslationWrapper.java index 2db4f120..8f08443d 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MachineTranslationWrapper.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MachineTranslationWrapper.java @@ -36,6 +36,7 @@ public class MachineTranslationWrapper { public String translate(String sentence, String lang2lang) { if (sentence==null) return null; + String request = TRANSLATOR_URL + sentence.replace(' ','+') + "&langpair="+lang2lang;//"en|es"; try { URL urlC = new URI(request).toURL(); @@ -43,17 +44,18 @@ public String translate(String sentence, String lang2lang) { String line; StringBuilder result = new StringBuilder(); - BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream())); - int count = 0; - while ((line = reader.readLine()) != null) - { - result.append(line); - count++; + try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + int count = 0; + while ((line = reader.readLine()) != null) + { + result.append(line); + count++; + } + JSONObject rootObject = new JSONObject(result.toString()); + JSONObject findObject = rootObject.getJSONObject("responseData"); + String transl = findObject.getString("translatedText"); + return URLDecoder.decode(transl, StandardCharsets.UTF_8); } - JSONObject rootObject = new JSONObject(result.toString()); - JSONObject findObject = rootObject.getJSONObject("responseData"); - String transl = findObject.getString("translatedText"); - return URLDecoder.decode(transl, StandardCharsets.UTF_8); } catch (IOException | URISyntaxException | JSONException e) { e.printStackTrace(); diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java index ccd9f637..41bec16c 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java @@ -29,8 +29,6 @@ import opennlp.tools.textsimilarity.TextProcessor; import org.apache.commons.lang.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; @@ -44,30 +42,25 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.json.JSONObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class DocClassifier { - private static final Log LOGGER = LogFactory.getLog(DocClassifier.class); + private static final Logger LOGGER = LoggerFactory.getLogger(DocClassifier.class); public static final String DOC_CLASSIFIER_KEY = "doc_class"; public static final String RESOURCE_DIR = null; private Map scoredClasses; - public static final Float MIN_TOTAL_SCORE_FOR_CATEGORY = 0.3f; //3.0f; protected static IndexReader indexReader = null; protected static IndexSearcher indexSearcher = null; // resource directory plus the index folder - private static final String INDEX_PATH = RESOURCE_DIR - + ClassifierTrainingSetIndexer.INDEX_PATH; + private static final String INDEX_PATH = RESOURCE_DIR + ClassifierTrainingSetIndexer.INDEX_PATH; // http://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm private static final int MAX_DOCS_TO_USE_FOR_CLASSIFY = 10, // 10 similar - // docs for - // nearest - // neighbor - // settings - + // docs for nearest neighbor settings MAX_CATEG_RESULTS = 2; private static final float BEST_TO_NEX_BEST_RATIO = 2.0f; // to accumulate classif results @@ -112,7 +105,7 @@ public class DocClassifier { } } - public DocClassifier(String inputFilename, JSONObject inputJSON) { + public DocClassifier(String inputFilename) { scoredClasses = new HashMap<>(); } @@ -131,18 +124,15 @@ private List classifySentence(String queryStr) { Query query; try { query = parser.parse(queryStr); - } catch (ParseException e2) { - return results; } TopDocs hits = null; // TopDocs search(Query, int) // Finds the top n hits for query. try { - hits = indexSearcher - .search(query, MAX_DOCS_TO_USE_FOR_CLASSIFY + 2); + hits = indexSearcher.search(query, MAX_DOCS_TO_USE_FOR_CLASSIFY + 2); } catch (IOException e1) { - LOGGER.error("problem searching index \n" + e1); + LOGGER.error("problem searching index \n", e1); } LOGGER.debug("Found " + hits.totalHits + " hits for " + queryStr); int count = 0; @@ -175,8 +165,7 @@ private List classifySentence(String queryStr) { } try { scoredClasses = ValueSortMap.sortMapByValue(scoredClasses, false); - List resultsAll = new ArrayList<>( - scoredClasses.keySet()), resultsAboveThresh = new ArrayList<>(); + List resultsAll = new ArrayList<>(scoredClasses.keySet()), resultsAboveThresh = new ArrayList<>(); for (String key : resultsAll) { if (scoredClasses.get(key) > MIN_TOTAL_SCORE_FOR_CATEGORY) resultsAboveThresh.add(key); @@ -211,15 +200,11 @@ private List classifySentence(String queryStr) { } - - - public static String formClassifQuery(String pageContentReader, int maxRes) { // We want to control which delimiters we substitute. For example '_' & // \n we retain - pageContentReader = pageContentReader.replaceAll("[^A-Za-z0-9 _\\n]", - ""); + pageContentReader = pageContentReader.replaceAll("[^A-Za-z0-9 _\\n]", ""); Scanner in = new Scanner(pageContentReader); in.useDelimiter("\\s+"); @@ -258,11 +243,9 @@ public void close() { } } - /* * Main entry point for classifying sentences */ - public List getEntityOrClassFromText(String content) { List sentences = TextProcessor.splitToSentences(content); @@ -284,7 +267,6 @@ public List getEntityOrClassFromText(String content) { LOGGER.debug(sentence + " => " + classifResults); } } - } catch (Exception e) { LOGGER.error("Problem classifying sentence\n " + e); } @@ -294,11 +276,10 @@ public List getEntityOrClassFromText(String content) { aggrResults = localCats.getFrequentTags(); - LOGGER.debug(localCats.getFrequentTags()); + LOGGER.debug(localCats.getFrequentTags().toString()); } catch (Exception e) { - LOGGER.error("Problem aggregating search results\n" + e); + LOGGER.error("Problem aggregating search results\n", e); } return aggrResults; } - } diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java index 90501ade..29a51072 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java @@ -33,7 +33,6 @@ import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringUtils; -import org.json.JSONObject; /* * This utility gets 'training_corpus' as input and creates a new version of training_corpus with verified files. @@ -56,7 +55,7 @@ public class DocClassifierTrainingSetMultilingualExtender { public DocClassifierTrainingSetMultilingualExtender(String resource) { - classifier = new DocClassifier("", new JSONObject()); + classifier = new DocClassifier(""); } private final int FRAGMENT_LENGTH = 500; diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java index 4da160a5..95c2b276 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java @@ -26,33 +26,28 @@ import org.apache.commons.io.FileUtils; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; -import org.json.JSONObject; /* * This utility gets 'training_corpus' as input and creates a new version of training_corpus with verified files. * Verified => classified by existing training set as only belonging to its target category, no other categories, not empty. */ public class DocClassifierTrainingSetVerifier { + + private static final int FRAGMENT_LENGTH = 500; public static String projectHome = new File(".").getAbsolutePath(); - public static String resourceDir = new File(".").getAbsolutePath().replace("/.", "") + "/src/main/resources"; + public static String resourceDir = projectHome.replace("/.", "") + "/src/main/resources"; DocClassifier classifier; private String sourceDir = null, destinationDir = null; - protected final ArrayList queue = new ArrayList<>(); - protected final Tika tika = new Tika(); - public DocClassifierTrainingSetVerifier(String resource) { - - - classifier = new DocClassifier("", new JSONObject()); + public DocClassifierTrainingSetVerifier(String resource) { + classifier = new DocClassifier(""); } - private static final int FRAGMENT_LENGTH = 500; protected void addFiles(File file) { - try { if (!file.exists()) { System.out.println(file + " does not exist."); @@ -90,8 +85,7 @@ public void processDirectory(String fileName) throws IOException { //if (f.getName().indexOf(".html")<0) //continue; - classifier = new DocClassifier("", new JSONObject()); - + classifier = new DocClassifier(""); content = tika.parseToString(f); diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java index 409172b6..82242730 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java @@ -421,11 +421,10 @@ public Boolean equalsTo(ParseTreeChunk ch) { } public boolean equals(ParseTreeChunk ch) { - List lems = ch.getLemmas(); - List poss = ch.POSs; return ListUtils.isEqualList(ch.getLemmas(), this.lemmas) && ListUtils.isEqualList(ch.getPOSs(), this.POSs); } + @Override public String toString() { StringBuilder buf = new StringBuilder(" ["); if (mainPOS != null)