Skip to content

Commit 124aa8b

Browse files
committed
OPENNLP-1633 - Remove dependency towards jackson-databind in opennlp-dl module
1 parent 74c7d52 commit 124aa8b

File tree

6 files changed

+243
-56
lines changed

6 files changed

+243
-56
lines changed

NOTICE

-6
Original file line number | Diff line number | Diff line change
@@ -93,9 +93,3 @@ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
9393
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
9494
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
9595
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
96-
97-
============================================================================
98-
99-
jackson-databind
100-
https://github.com/FasterXML/jackson-databind
101-
The Apache Software License, Version 2.0

opennlp-brat-annotator/pom.xml

-7
Original file line number | Diff line number | Diff line change
@@ -61,13 +61,6 @@
6161
<artifactId>jackson-databind</artifactId>
6262
<version>${jackson.version}</version>
6363
<scope>runtime</scope>
64-
<exclusions>
65-
<!-- Byte-Buddy became a dependency by accident - TODO remove it with update version > 2.17.0 -->
66-
<exclusion>
67-
<groupId>net.bytebuddy</groupId>
68-
<artifactId>byte-buddy</artifactId>
69-
</exclusion>
70-
</exclusions>
7164
</dependency>
7265

7366
<dependency>

opennlp-dl/pom.xml

-12
Original file line number | Diff line number | Diff line change
@@ -41,18 +41,6 @@
4141
<artifactId>onnxruntime</artifactId>
4242
<version>${onnxruntime.version}</version>
4343
</dependency>
44-
<dependency>
45-
<groupId>com.fasterxml.jackson.core</groupId>
46-
<artifactId>jackson-databind</artifactId>
47-
<version>${jackson.version}</version>
48-
<exclusions>
49-
<!-- Byte-Buddy became a dependency by accident - TODO remove it with update version > 2.17.0 -->
50-
<exclusion>
51-
<groupId>net.bytebuddy</groupId>
52-
<artifactId>byte-buddy</artifactId>
53-
</exclusion>
54-
</exclusions>
55-
</dependency>
5644
<dependency>
5745
<groupId>org.slf4j</groupId>
5846
<artifactId>slf4j-api</artifactId>

opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java

+29-6
Original file line number | Diff line number | Diff line change
@@ -18,18 +18,41 @@
1818
package opennlp.dl.doccat;
1919

2020
import java.util.Collections;
21+
import java.util.HashMap;
2122
import java.util.Map;
23+
import java.util.Objects;
24+
import java.util.regex.Matcher;
25+
import java.util.regex.Pattern;
2226

23-
public class DocumentCategorizerConfig {
27+
public record DocumentCategorizerConfig(Map<String, String> id2label) {
2428

25-
private Map<String, String> id2label;
29+
private static final Pattern ID_TO_LABEL_PATTERN =
30+
Pattern.compile("\"id2label\"\\s*:\\s*\\{(.*?)\\}", Pattern.DOTALL);
31+
private static final Pattern ENTRY_PATTERN =
32+
Pattern.compile("\"([^\"]+)\"\\s*:\\s*\"(.*?)\"");
2633

27-
public Map<String, String> getId2label() {
34+
@Override
35+
public Map<String, String> id2label() {
2836
return Collections.unmodifiableMap(id2label);
2937
}
3038

31-
public void setId2label(Map<String, String> id2label) {
32-
this.id2label = id2label;
33-
}
39+
public static DocumentCategorizerConfig fromJson(String json) {
40+
Objects.requireNonNull(json, "json must not be null");
41+
42+
final Map<String, String> id2label = new HashMap<>();
43+
final Matcher matcher = ID_TO_LABEL_PATTERN.matcher(json);
44+
45+
if (matcher.find()) {
46+
final String id2labelContent = matcher.group(1);
47+
final Matcher entryMatcher = ENTRY_PATTERN.matcher(id2labelContent);
3448

49+
while (entryMatcher.find()) {
50+
final String key = entryMatcher.group(1);
51+
final String value = entryMatcher.group(2);
52+
id2label.put(key, value);
53+
}
54+
}
55+
56+
return new DocumentCategorizerConfig(id2label);
57+
}
3558
}

opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java

+18-25
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
import java.io.File;
2121
import java.io.IOException;
2222
import java.nio.LongBuffer;
23+
import java.nio.charset.StandardCharsets;
2324
import java.nio.file.Files;
2425
import java.util.Arrays;
2526
import java.util.HashMap;
@@ -36,8 +37,6 @@
3637
import ai.onnxruntime.OrtEnvironment;
3738
import ai.onnxruntime.OrtException;
3839
import ai.onnxruntime.OrtSession;
39-
import com.fasterxml.jackson.databind.DeserializationFeature;
40-
import com.fasterxml.jackson.databind.ObjectMapper;
4140
import org.slf4j.Logger;
4241
import org.slf4j.LoggerFactory;
4342

@@ -67,16 +66,15 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor
6766
/**
6867
* Instantiates a {@link DocumentCategorizer document categorizer} using ONNX models.
6968
*
70-
* @param model The ONNX model file.
71-
* @param vocabulary The model file's vocabulary file.
72-
* @param categories The categories.
69+
* @param model The ONNX model file.
70+
* @param vocabulary The model file's vocabulary file.
71+
* @param categories The categories.
7372
* @param classificationScoringStrategy Implementation of {@link ClassificationScoringStrategy} used
7473
* to calculate the classification scores given the score of each
7574
* individual document part.
76-
* @param inferenceOptions {@link InferenceOptions} to control the inference.
77-
*
75+
* @param inferenceOptions {@link InferenceOptions} to control the inference.
7876
* @throws OrtException Thrown if the {@code model} cannot be loaded.
79-
* @throws IOException Thrown if errors occurred loading the {@code model} or {@code vocabulary}.
77+
* @throws IOException Thrown if errors occurred loading the {@code model} or {@code vocabulary}.
8078
*/
8179
public DocumentCategorizerDL(File model, File vocabulary, Map<Integer, String> categories,
8280
ClassificationScoringStrategy classificationScoringStrategy,
@@ -102,21 +100,21 @@ public DocumentCategorizerDL(File model, File vocabulary, Map<Integer, String> c
102100
/**
103101
* Instantiates a {@link DocumentCategorizer document categorizer} using ONNX models.
104102
*
105-
* @param model The ONNX model file.
106-
* @param vocabulary The model file's vocabulary file.
107-
* @param config The model's config file. The file will be used to determine the classification categories.
103+
* @param model The ONNX model file.
104+
* @param vocabulary The model file's vocabulary file.
105+
* @param config The model's config file. The file will be used to
106+
* determine the classification categories.
108107
* @param classificationScoringStrategy Implementation of {@link ClassificationScoringStrategy} used
109108
* to calculate the classification scores given the score of each
110109
* individual document part.
111-
* @param inferenceOptions {@link InferenceOptions} to control the inference.
112-
*
110+
* @param inferenceOptions {@link InferenceOptions} to control the inference.
113111
* @throws OrtException Thrown if the {@code model} cannot be loaded.
114-
* @throws IOException Thrown if errors occurred loading the {@code model} or {@code vocabulary}.
112+
* @throws IOException Thrown if errors occurred loading the {@code model} or {@code vocabulary}.
115113
*/
116114
public DocumentCategorizerDL(File model, File vocabulary, File config,
117115
ClassificationScoringStrategy classificationScoringStrategy,
118116
InferenceOptions inferenceOptions)
119-
throws IOException, OrtException {
117+
throws IOException, OrtException {
120118

121119
this.env = OrtEnvironment.getEnvironment();
122120

@@ -175,7 +173,7 @@ public double[] categorize(String[] strings) {
175173
logger.error("Unload to perform document classification inference", ex);
176174
}
177175

178-
return new double[]{};
176+
return new double[] {};
179177

180178
}
181179

@@ -315,6 +313,7 @@ private List<Tokens> tokenize(final String text) {
315313

316314
/**
317315
* Applies softmax to an array of values.
316+
*
318317
* @param input An array of values.
319318
* @return The output array.
320319
*/
@@ -346,18 +345,12 @@ private int maxIndex(double[] arr) {
346345
}
347346

348347
private Map<Integer, String> readCategoriesFromFile(File config) throws IOException {
349-
350-
final String json = new String(Files.readAllBytes(config.toPath()));
351-
352-
final ObjectMapper objectMapper = new ObjectMapper();
353-
objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
354-
355348
final DocumentCategorizerConfig documentCategorizerConfig =
356-
objectMapper.readValue(json, DocumentCategorizerConfig.class);
349+
DocumentCategorizerConfig.fromJson(Files.readString(config.toPath(), StandardCharsets.UTF_8));
357350

358351
final Map<Integer, String> categories = new HashMap<>();
359-
for (final String key : documentCategorizerConfig.getId2label().keySet()) {
360-
categories.put(Integer.valueOf(key), documentCategorizerConfig.getId2label().get(key));
352+
for (final String key : documentCategorizerConfig.id2label().keySet()) {
353+
categories.put(Integer.valueOf(key), documentCategorizerConfig.id2label().get(key));
361354
}
362355

363356
return categories;
opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerConfigTest.java

+196
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,196 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package opennlp.dl.doccat;
18+
19+
import java.util.Map;
20+
21+
import org.junit.jupiter.api.Test;
22+
23+
import static org.junit.jupiter.api.Assertions.assertEquals;
24+
import static org.junit.jupiter.api.Assertions.assertNotNull;
25+
26+
27+
public class DocumentCategorizerConfigTest {
28+
29+
@Test
30+
public void testId2LabelsFromJsonPrettyValid() {
31+
final String json = """
32+
{
33+
"_num_labels": 5,
34+
"architectures": [
35+
"BertForSequenceClassification"
36+
],
37+
"attention_probs_dropout_prob": 0.1,
38+
"directionality": "bidi",
39+
"finetuning_task": "sentiment-analysis",
40+
"hidden_act": "gelu",
41+
"hidden_dropout_prob": 0.1,
42+
"hidden_size": 768,
43+
"id2label": {
44+
"0": "1 star",
45+
"1": "2 stars",
46+
"2": "3 stars",
47+
"3": "4 stars",
48+
"4": "5 stars"
49+
},
50+
"initializer_range": 0.02,
51+
"intermediate_size": 3072,
52+
"label2id": {
53+
"1 star": 0,
54+
"2 stars": 1,
55+
"3 stars": 2,
56+
"4 stars": 3,
57+
"5 stars": 4
58+
},
59+
"layer_norm_eps": 1e-12,
60+
"max_position_embeddings": 512,
61+
"model_type": "bert",
62+
"num_attention_heads": 12,
63+
"num_hidden_layers": 12,
64+
"output_past": true,
65+
"pad_token_id": 0,
66+
"pooler_fc_size": 768,
67+
"pooler_num_attention_heads": 12,
68+
"pooler_num_fc_layers": 3,
69+
"pooler_size_per_head": 128,
70+
"pooler_type": "first_token_transform",
71+
"type_vocab_size": 2,
72+
"vocab_size": 105879
73+
}
74+
""";
75+
76+
final DocumentCategorizerConfig config = DocumentCategorizerConfig.fromJson(json);
77+
assertNotNull(config);
78+
final Map<String, String> map = config.id2label();
79+
assertEquals(5, map.size());
80+
assertEquals("1 star", map.get("0"));
81+
assertEquals("2 stars", map.get("1"));
82+
assertEquals("3 stars", map.get("2"));
83+
assertEquals("4 stars", map.get("3"));
84+
assertEquals("5 stars", map.get("4"));
85+
}
86+
87+
@Test
88+
public void testId2LabelsFromJsonUglyValid() {
89+
final String json = """
90+
{"_num_labels":5,"architectures":["BertForSequenceClassification"],"attention_probs_
91+
dropout_prob":0.1,"directionality":"bidi","finetuning_task":"sentiment-analysis",
92+
"hidden_act":"gelu","hidden_dropout_prob":0.1,"hidden_size":768,"id2label":{"0":"1 star",
93+
"1":"2 stars","2":"3 stars","3":"4 stars","4":"5 stars"},"initializer_range":0.02,
94+
"intermediate_size":3072,"label2id":{"1 star":0,"2 stars":1,"3 stars":2,"4 stars":3,"5
95+
stars":4},"layer_norm_eps":1e-12,"max_position_embeddings":512,"model_type":"bert",
96+
"num_attention_heads":12,"num_hidden_layers":12,"output_past":true,"pad_token_id":0,"
97+
pooler_fc_size":768,"pooler_num_attention_heads":12,"pooler_num_fc_layers":3,
98+
"pooler_size_per_head":128,"pooler_type":"first_token_transform","type_vocab_size":2,
99+
"vocab_size":105879}
100+
""";
101+
102+
final DocumentCategorizerConfig config = DocumentCategorizerConfig.fromJson(json);
103+
assertNotNull(config);
104+
final Map<String, String> map = config.id2label();
105+
assertEquals(5, map.size());
106+
assertEquals("1 star", map.get("0"));
107+
assertEquals("2 stars", map.get("1"));
108+
assertEquals("3 stars", map.get("2"));
109+
assertEquals("4 stars", map.get("3"));
110+
assertEquals("5 stars", map.get("4"));
111+
}
112+
113+
@Test
114+
public void testId2LabelsFromJsonNoValues() {
115+
final String json = """
116+
{"_num_labels":5,"architectures":["BertForSequenceClassification"],"attention_probs
117+
_dropout_prob":0.1,"directionality":"bidi","finetuning_task":"sentiment-analysis",
118+
"hidden_act":"gelu","hidden_dropout_prob":0.1,"hidden_size":768,"layer_norm_eps":1e-12,
119+
"max_position_embeddings":512,"model_type":"bert",
120+
"num_attention_heads":12,"num_hidden_layers":12,"output_past":true,"pad_token_id":0,
121+
"pooler_fc_size":768,"pooler_num_attention_heads":12,"pooler_num_fc_layers":3,
122+
"pooler_size_per_head":128,"pooler_type":"first_token_transform","type_vocab_size":2,
123+
"vocab_size":105879}
124+
""";
125+
126+
final DocumentCategorizerConfig config = DocumentCategorizerConfig.fromJson(json);
127+
assertNotNull(config);
128+
assertEquals(0, config.id2label().size());
129+
}
130+
131+
@Test
132+
public void testId2LabelsFromJsonEmptyInput() {
133+
final String json = "";
134+
final DocumentCategorizerConfig config = DocumentCategorizerConfig.fromJson(json);
135+
assertNotNull(config);
136+
assertEquals(0, config.id2label().size());
137+
}
138+
139+
@Test
140+
public void testId2LabelsFromJsonPrettyIdIsNotANumberValid() {
141+
final String json = """
142+
{
143+
"_num_labels": 5,
144+
"architectures": [
145+
"BertForSequenceClassification"
146+
],
147+
"attention_probs_dropout_prob": 0.1,
148+
"directionality": "bidi",
149+
"finetuning_task": "sentiment-analysis",
150+
"hidden_act": "gelu",
151+
"hidden_dropout_prob": 0.1,
152+
"hidden_size": 768,
153+
"id2label": {
154+
"a0": "1 star",
155+
"a1": "2 stars",
156+
"a2": "3 stars",
157+
"a3": "4 stars",
158+
"a4": "5 stars"
159+
},
160+
"initializer_range": 0.02,
161+
"intermediate_size": 3072,
162+
"label2id": {
163+
"1 star": "a0",
164+
"2 stars": "a1",
165+
"3 stars": "a2",
166+
"4 stars": "a3",
167+
"5 stars": "a4"
168+
},
169+
"layer_norm_eps": 1e-12,
170+
"max_position_embeddings": 512,
171+
"model_type": "bert",
172+
"num_attention_heads": 12,
173+
"num_hidden_layers": 12,
174+
"output_past": true,
175+
"pad_token_id": 0,
176+
"pooler_fc_size": 768,
177+
"pooler_num_attention_heads": 12,
178+
"pooler_num_fc_layers": 3,
179+
"pooler_size_per_head": 128,
180+
"pooler_type": "first_token_transform",
181+
"type_vocab_size": 2,
182+
"vocab_size": 105879
183+
}
184+
""";
185+
186+
final DocumentCategorizerConfig config = DocumentCategorizerConfig.fromJson(json);
187+
assertNotNull(config);
188+
final Map<String, String> map = config.id2label();
189+
assertEquals(5, map.size());
190+
assertEquals("1 star", map.get("a0"));
191+
assertEquals("2 stars", map.get("a1"));
192+
assertEquals("3 stars", map.get("a2"));
193+
assertEquals("4 stars", map.get("a3"));
194+
assertEquals("5 stars", map.get("a4"));
195+
}
196+
}

0 commit comments

Comments
 (0)