Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OPENNLP-1633 - Remove dependency towards jackson-databind in opennlp-dl #676

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,3 @@ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

============================================================================

jackson-databind
https://github.com/FasterXML/jackson-databind
The Apache Software License, Version 2.0
7 changes: 0 additions & 7 deletions opennlp-brat-annotator/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,6 @@
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
<scope>runtime</scope>
<exclusions>
<!-- Byte-Buddy became a dependency by accident - TODO remove it with update version > 2.17.0 -->
<exclusion>
<groupId>net.bytebuddy</groupId>
<artifactId>byte-buddy</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
Expand Down
12 changes: 0 additions & 12 deletions opennlp-dl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,6 @@
<artifactId>onnxruntime</artifactId>
<version>${onnxruntime.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
<exclusions>
<!-- Byte-Buddy became a dependency by accident - TODO remove it with update version > 2.17.0 -->
<exclusion>
<groupId>net.bytebuddy</groupId>
<artifactId>byte-buddy</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,41 @@
package opennlp.dl.doccat;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DocumentCategorizerConfig {
public record DocumentCategorizerConfig(Map<String, String> id2label) {

private Map<String, String> id2label;
private static final Pattern ID_TO_LABEL_PATTERN =
Pattern.compile("\"id2label\"\\s*:\\s*\\{(.*?)\\}", Pattern.DOTALL);
private static final Pattern ENTRY_PATTERN =
Pattern.compile("\"([^\"]+)\"\\s*:\\s*\"(.*?)\"");

public Map<String, String> getId2label() {
/**
 * Exposes the id-to-label mapping held by this record without allowing callers
 * to change it.
 *
 * @return An unmodifiable view of the {@code id2label} map. Mutating calls on the
 *         returned map throw {@link UnsupportedOperationException}.
 */
@Override
public Map<String, String> id2label() {
  // Wrap on every access so the backing map is never handed out directly.
  final Map<String, String> readOnlyView = Collections.unmodifiableMap(this.id2label);
  return readOnlyView;
}

public void setId2label(Map<String, String> id2label) {
this.id2label = id2label;
}
/**
 * Creates a {@link DocumentCategorizerConfig} from the textual content of a model
 * configuration.
 * <p>
 * Only the first {@code "id2label": { ... }} section found by
 * {@code ID_TO_LABEL_PATTERN} is evaluated. Every {@code "key": "value"} pair that
 * {@code ENTRY_PATTERN} finds inside that section becomes one map entry. If no such
 * section is present, the resulting configuration holds an empty map.
 *
 * @param json The configuration text to scan. Must not be {@code null}.
 * @return A {@link DocumentCategorizerConfig} holding the extracted entries.
 * @throws NullPointerException Thrown if {@code json} is {@code null}.
 */
public static DocumentCategorizerConfig fromJson(String json) {
  Objects.requireNonNull(json, "json must not be null");

  final Map<String, String> labelsById = new HashMap<>();
  final Matcher sectionMatcher = ID_TO_LABEL_PATTERN.matcher(json);

  // No id2label section at all: hand back a configuration without entries.
  if (!sectionMatcher.find()) {
    return new DocumentCategorizerConfig(labelsById);
  }

  // Group 1 is the text between the braces of the id2label section.
  final Matcher pairMatcher = ENTRY_PATTERN.matcher(sectionMatcher.group(1));
  while (pairMatcher.find()) {
    labelsById.put(pairMatcher.group(1), pairMatcher.group(2));
  }

  return new DocumentCategorizerConfig(labelsById);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.io.File;
import java.io.IOException;
import java.nio.LongBuffer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.HashMap;
Expand All @@ -36,8 +37,6 @@
import ai.onnxruntime.OrtEnvironment;
import ai.onnxruntime.OrtException;
import ai.onnxruntime.OrtSession;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -67,16 +66,15 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor
/**
* Instantiates a {@link DocumentCategorizer document categorizer} using ONNX models.
*
* @param model The ONNX model file.
* @param vocabulary The model file's vocabulary file.
* @param categories The categories.
* @param model The ONNX model file.
* @param vocabulary The model file's vocabulary file.
* @param categories The categories.
* @param classificationScoringStrategy Implementation of {@link ClassificationScoringStrategy} used
* to calculate the classification scores given the score of each
* individual document part.
* @param inferenceOptions {@link InferenceOptions} to control the inference.
*
* @param inferenceOptions {@link InferenceOptions} to control the inference.
* @throws OrtException Thrown if the {@code model} cannot be loaded.
* @throws IOException Thrown if errors occurred loading the {@code model} or {@code vocabulary}.
* @throws IOException Thrown if errors occurred loading the {@code model} or {@code vocabulary}.
*/
public DocumentCategorizerDL(File model, File vocabulary, Map<Integer, String> categories,
ClassificationScoringStrategy classificationScoringStrategy,
Expand All @@ -102,21 +100,21 @@ public DocumentCategorizerDL(File model, File vocabulary, Map<Integer, String> c
/**
* Instantiates a {@link DocumentCategorizer document categorizer} using ONNX models.
*
* @param model The ONNX model file.
* @param vocabulary The model file's vocabulary file.
* @param config The model's config file. The file will be used to determine the classification categories.
* @param model The ONNX model file.
* @param vocabulary The model file's vocabulary file.
* @param config The model's config file. The file will be used to
* determine the classification categories.
* @param classificationScoringStrategy Implementation of {@link ClassificationScoringStrategy} used
* to calculate the classification scores given the score of each
* individual document part.
* @param inferenceOptions {@link InferenceOptions} to control the inference.
*
* @param inferenceOptions {@link InferenceOptions} to control the inference.
* @throws OrtException Thrown if the {@code model} cannot be loaded.
* @throws IOException Thrown if errors occurred loading the {@code model} or {@code vocabulary}.
* @throws IOException Thrown if errors occurred loading the {@code model} or {@code vocabulary}.
*/
public DocumentCategorizerDL(File model, File vocabulary, File config,
ClassificationScoringStrategy classificationScoringStrategy,
InferenceOptions inferenceOptions)
throws IOException, OrtException {
throws IOException, OrtException {

this.env = OrtEnvironment.getEnvironment();

Expand Down Expand Up @@ -175,7 +173,7 @@ public double[] categorize(String[] strings) {
logger.error("Unload to perform document classification inference", ex);
}

return new double[]{};
return new double[] {};

}

Expand Down Expand Up @@ -315,6 +313,7 @@ private List<Tokens> tokenize(final String text) {

/**
* Applies softmax to an array of values.
*
* @param input An array of values.
* @return The output array.
*/
Expand Down Expand Up @@ -346,18 +345,12 @@ private int maxIndex(double[] arr) {
}

private Map<Integer, String> readCategoriesFromFile(File config) throws IOException {

final String json = new String(Files.readAllBytes(config.toPath()));

final ObjectMapper objectMapper = new ObjectMapper();
objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

final DocumentCategorizerConfig documentCategorizerConfig =
objectMapper.readValue(json, DocumentCategorizerConfig.class);
DocumentCategorizerConfig.fromJson(Files.readString(config.toPath(), StandardCharsets.UTF_8));

final Map<Integer, String> categories = new HashMap<>();
for (final String key : documentCategorizerConfig.getId2label().keySet()) {
categories.put(Integer.valueOf(key), documentCategorizerConfig.getId2label().get(key));
for (final String key : documentCategorizerConfig.id2label().keySet()) {
categories.put(Integer.valueOf(key), documentCategorizerConfig.id2label().get(key));
}

return categories;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.dl.doccat;

import java.util.Map;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;


/**
 * Tests for {@link DocumentCategorizerConfig#fromJson(String)}, covering formatted
 * and compact input, input without an {@code id2label} section, empty input and
 * non-numeric ids.
 */
public class DocumentCategorizerConfigTest {

  /** The labels every populated fixture maps its five ids to, in id order. */
  private static final String[] STAR_LABELS = {
      "1 star", "2 stars", "3 stars", "4 stars", "5 stars"
  };

  /** Formatted configuration with numeric ids {@code "0"} to {@code "4"}. */
  @Test
  public void testId2LabelsFromJsonPrettyValid() {
    final String prettyConfig = """
        {
        "_num_labels": 5,
        "architectures": [
        "BertForSequenceClassification"
        ],
        "attention_probs_dropout_prob": 0.1,
        "directionality": "bidi",
        "finetuning_task": "sentiment-analysis",
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "hidden_size": 768,
        "id2label": {
        "0": "1 star",
        "1": "2 stars",
        "2": "3 stars",
        "3": "4 stars",
        "4": "5 stars"
        },
        "initializer_range": 0.02,
        "intermediate_size": 3072,
        "label2id": {
        "1 star": 0,
        "2 stars": 1,
        "3 stars": 2,
        "4 stars": 3,
        "5 stars": 4
        },
        "layer_norm_eps": 1e-12,
        "max_position_embeddings": 512,
        "model_type": "bert",
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "output_past": true,
        "pad_token_id": 0,
        "pooler_fc_size": 768,
        "pooler_num_attention_heads": 12,
        "pooler_num_fc_layers": 3,
        "pooler_size_per_head": 128,
        "pooler_type": "first_token_transform",
        "type_vocab_size": 2,
        "vocab_size": 105879
        }
        """;

    assertStarLabels(DocumentCategorizerConfig.fromJson(prettyConfig), "");
  }

  /** Compact configuration wrapped across lines, some breaks falling inside key names. */
  @Test
  public void testId2LabelsFromJsonUglyValid() {
    final String compactConfig = """
        {"_num_labels":5,"architectures":["BertForSequenceClassification"],"attention_probs_
        dropout_prob":0.1,"directionality":"bidi","finetuning_task":"sentiment-analysis",
        "hidden_act":"gelu","hidden_dropout_prob":0.1,"hidden_size":768,"id2label":{"0":"1 star",
        "1":"2 stars","2":"3 stars","3":"4 stars","4":"5 stars"},"initializer_range":0.02,
        "intermediate_size":3072,"label2id":{"1 star":0,"2 stars":1,"3 stars":2,"4 stars":3,"5
        stars":4},"layer_norm_eps":1e-12,"max_position_embeddings":512,"model_type":"bert",
        "num_attention_heads":12,"num_hidden_layers":12,"output_past":true,"pad_token_id":0,"
        pooler_fc_size":768,"pooler_num_attention_heads":12,"pooler_num_fc_layers":3,
        "pooler_size_per_head":128,"pooler_type":"first_token_transform","type_vocab_size":2,
        "vocab_size":105879}
        """;

    assertStarLabels(DocumentCategorizerConfig.fromJson(compactConfig), "");
  }

  /** Configuration that has no {@code id2label} section at all. */
  @Test
  public void testId2LabelsFromJsonNoValues() {
    final String configWithoutLabels = """
        {"_num_labels":5,"architectures":["BertForSequenceClassification"],"attention_probs
        _dropout_prob":0.1,"directionality":"bidi","finetuning_task":"sentiment-analysis",
        "hidden_act":"gelu","hidden_dropout_prob":0.1,"hidden_size":768,"layer_norm_eps":1e-12,
        "max_position_embeddings":512,"model_type":"bert",
        "num_attention_heads":12,"num_hidden_layers":12,"output_past":true,"pad_token_id":0,
        "pooler_fc_size":768,"pooler_num_attention_heads":12,"pooler_num_fc_layers":3,
        "pooler_size_per_head":128,"pooler_type":"first_token_transform","type_vocab_size":2,
        "vocab_size":105879}
        """;

    assertNoLabels(DocumentCategorizerConfig.fromJson(configWithoutLabels));
  }

  /** The empty string is accepted and yields a configuration without labels. */
  @Test
  public void testId2LabelsFromJsonEmptyInput() {
    assertNoLabels(DocumentCategorizerConfig.fromJson(""));
  }

  /** Formatted configuration whose ids are the non-numeric {@code "a0"} to {@code "a4"}. */
  @Test
  public void testId2LabelsFromJsonPrettyIdIsNotANumberValid() {
    final String prettyConfig = """
        {
        "_num_labels": 5,
        "architectures": [
        "BertForSequenceClassification"
        ],
        "attention_probs_dropout_prob": 0.1,
        "directionality": "bidi",
        "finetuning_task": "sentiment-analysis",
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "hidden_size": 768,
        "id2label": {
        "a0": "1 star",
        "a1": "2 stars",
        "a2": "3 stars",
        "a3": "4 stars",
        "a4": "5 stars"
        },
        "initializer_range": 0.02,
        "intermediate_size": 3072,
        "label2id": {
        "1 star": "a0",
        "2 stars": "a1",
        "3 stars": "a2",
        "4 stars": "a3",
        "5 stars": "a4"
        },
        "layer_norm_eps": 1e-12,
        "max_position_embeddings": 512,
        "model_type": "bert",
        "num_attention_heads": 12,
        "num_hidden_layers": 12,
        "output_past": true,
        "pad_token_id": 0,
        "pooler_fc_size": 768,
        "pooler_num_attention_heads": 12,
        "pooler_num_fc_layers": 3,
        "pooler_size_per_head": 128,
        "pooler_type": "first_token_transform",
        "type_vocab_size": 2,
        "vocab_size": 105879
        }
        """;

    assertStarLabels(DocumentCategorizerConfig.fromJson(prettyConfig), "a");
  }

  /**
   * Asserts that {@code config} is present and maps the ids {@code keyPrefix + 0}
   * through {@code keyPrefix + 4} to the five star labels, and nothing else.
   *
   * @param config The parsed configuration under test.
   * @param keyPrefix The text each id starts with; empty for purely numeric ids.
   */
  private static void assertStarLabels(DocumentCategorizerConfig config, String keyPrefix) {
    assertNotNull(config);
    final Map<String, String> labels = config.id2label();
    assertEquals(STAR_LABELS.length, labels.size());
    for (int i = 0; i < STAR_LABELS.length; i++) {
      assertEquals(STAR_LABELS[i], labels.get(keyPrefix + i));
    }
  }

  /**
   * Asserts that {@code config} is present and carries no id-to-label entries.
   *
   * @param config The parsed configuration under test.
   */
  private static void assertNoLabels(DocumentCategorizerConfig config) {
    assertNotNull(config);
    assertEquals(0, config.id2label().size());
  }
}