Skip to content

Commit e91ceb1

Browse files
authored
OPENNLP-1661: Fix custom models being wiped from OpenNLP user.home directory (#704)
- deletes AbstractDownloadUtilTest.java removing historical code that wiped models - adds package-private DownloadUtil#existsModel(..) method to check models for certain language exist locally - adds to and adjusts related test classes
1 parent f4de6c2 commit e91ceb1

File tree

4 files changed

+90
-109
lines changed

4 files changed

+90
-109
lines changed

opennlp-tools/src/main/java/opennlp/tools/util/DownloadUtil.java

+49-11
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,45 @@ public enum ModelType {
7777
System.getProperty("OPENNLP_DOWNLOAD_BASE_URL", "https://dlcdn.apache.org/opennlp/");
7878
private static final String MODEL_URI_PATH =
7979
System.getProperty("OPENNLP_DOWNLOAD_MODEL_PATH", "models/ud-models-1.2/");
80+
private static final String OPENNLP_DOWNLOAD_HOME = "OPENNLP_DOWNLOAD_HOME";
8081

8182
private static Map<String, Map<ModelType, String>> availableModels;
8283

84+
/**
85+
* Checks if a model of the specified {@code modelType} has been downloaded already
86+
* for a particular {@code language}.
87+
*
88+
* @param language The ISO language code of the requested model.
89+
* @param modelType The {@link DownloadUtil.ModelType type} of model.
90+
* @return {@code true} if a model exists locally, {@code false} otherwise.
91+
* @throws IOException Thrown if IO errors occurred or the computed hash sum
92+
* of an associated, local model file was incorrect.
93+
*/
94+
static boolean existsModel(String language, ModelType modelType) throws IOException {
95+
Map<ModelType, String> modelsByLanguage = getAvailableModels().get(language);
96+
if (modelsByLanguage == null) {
97+
return false;
98+
} else {
99+
final String url = modelsByLanguage.get(modelType);
100+
if (url != null) {
101+
final Path homeDirectory = getDownloadHome();
102+
final String filename = url.substring(url.lastIndexOf("/") + 1);
103+
final Path localFile = homeDirectory.resolve(filename);
104+
boolean exists;
105+
if (Files.exists(localFile)) {
106+
// if this does not throw the requested model is valid!
107+
validateModel(new URL(url + ".sha512"), localFile);
108+
exists = true;
109+
} else {
110+
exists = false;
111+
}
112+
return exists;
113+
} else {
114+
return false;
115+
}
116+
}
117+
}
118+
83119
/**
84120
* Triggers a download for the specified {@link DownloadUtil.ModelType}.
85121
*
@@ -94,7 +130,7 @@ public static <T extends BaseModel> T downloadModel(String language, ModelType m
94130
Class<T> type) throws IOException {
95131

96132
if (getAvailableModels().containsKey(language)) {
97-
final String url = (getAvailableModels().get(language).get(modelType));
133+
final String url = getAvailableModels().get(language).get(modelType);
98134
if (url != null) {
99135
return downloadModel(new URL(url), type);
100136
}
@@ -119,8 +155,7 @@ public static <T extends BaseModel> T downloadModel(String language, ModelType m
119155
*/
120156
public static <T extends BaseModel> T downloadModel(URL url, Class<T> type) throws IOException {
121157

122-
final Path homeDirectory = Paths.get(System.getProperty("OPENNLP_DOWNLOAD_HOME",
123-
System.getProperty("user.home"))).resolve(".opennlp");
158+
final Path homeDirectory = getDownloadHome();
124159

125160
if (!Files.isDirectory(homeDirectory)) {
126161
try {
@@ -131,20 +166,17 @@ public static <T extends BaseModel> T downloadModel(URL url, Class<T> type) thro
131166
}
132167

133168
final String filename = url.toString().substring(url.toString().lastIndexOf("/") + 1);
134-
final Path localFile = Paths.get(homeDirectory.toString(), filename);
169+
final Path localFile = homeDirectory.resolve(filename);
135170

136171
if (!Files.exists(localFile)) {
137-
logger.debug("Downloading model from {} to {}.", url, localFile);
172+
logger.debug("Downloading model to {}.", localFile);
138173

139174
try (final InputStream in = url.openStream()) {
140175
Files.copy(in, localFile, StandardCopyOption.REPLACE_EXISTING);
141176
}
142-
143177
validateModel(new URL(url + ".sha512"), localFile);
144-
145178
logger.debug("Download complete.");
146179
} else {
147-
System.out.println("Model file already exists. Skipping download.");
148180
logger.debug("Model file '{}' already exists. Skipping download.", filename);
149181
}
150182

@@ -167,7 +199,7 @@ public static Map<String, Map<ModelType, String>> getAvailableModels() {
167199
}
168200

169201
/**
170-
* Validates the downloaded model.
202+
* Validates a downloaded model via the specified {@link Path downloadedModel path}.
171203
*
172204
* @param sha512 the url to get the sha512 hash
173205
* @param downloadedModel the model file to check
@@ -187,8 +219,8 @@ private static void validateModel(URL sha512, Path downloadedModel) throws IOExc
187219
// Validate SHA512 checksum
188220
final String actualChecksum = calculateSHA512(downloadedModel);
189221
if (!actualChecksum.equalsIgnoreCase(expectedChecksum)) {
190-
throw new IOException("SHA512 checksum validation failed. Expected: "
191-
+ expectedChecksum + ", but got: " + actualChecksum);
222+
throw new IOException("SHA512 checksum validation failed for " + downloadedModel.getFileName() +
223+
". Expected: " + expectedChecksum + ", but got: " + actualChecksum);
192224
}
193225
}
194226

@@ -198,6 +230,7 @@ private static String calculateSHA512(Path file) throws IOException {
198230
try (InputStream fis = Files.newInputStream(file);
199231
DigestInputStream dis = new DigestInputStream(fis, digest)) {
200232
byte[] buffer = new byte[4096];
233+
//noinspection StatementWithEmptyBody
201234
while (dis.read(buffer) != -1) {
202235
// Reading the file to update the digest
203236
}
@@ -217,6 +250,11 @@ private static String byteArrayToHexString(byte[] bytes) {
217250
}
218251
}
219252

253+
private static Path getDownloadHome() {
254+
return Paths.get(System.getProperty(OPENNLP_DOWNLOAD_HOME,
255+
System.getProperty("user.home"))).resolve(".opennlp");
256+
}
257+
220258
@Internal
221259
static class DownloadParser {
222260

opennlp-tools/src/test/java/opennlp/tools/util/AbstractDownloadUtilTest.java

-79
This file was deleted.

opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilDownloadTwiceTest.java

+15-15
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
import static org.junit.jupiter.api.Assertions.assertEquals;
3636

3737
@EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
38-
public class DownloadUtilDownloadTwiceTest extends AbstractDownloadUtilTest {
38+
public class DownloadUtilDownloadTwiceTest {
3939

4040
/*
4141
* Programmatic change to debug log to ensure that we can see log messages to
@@ -60,24 +60,24 @@ public static void cleanup() {
6060

6161
@Test
6262
public void testDownloadModelTwice() throws IOException {
63+
String lang = "de";
64+
DownloadUtil.ModelType type = DownloadUtil.ModelType.SENTENCE_DETECTOR;
65+
6366
try (LogCaptor logCaptor = LogCaptor.forClass(DownloadUtil.class)) {
64-
65-
DownloadUtil.downloadModel("de",
66-
DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
67-
68-
assertEquals(2, logCaptor.getDebugLogs().size());
69-
checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "Download complete.");
67+
boolean alreadyDownloaded = DownloadUtil.existsModel(lang, type);
68+
DownloadUtil.downloadModel(lang, type, SentenceModel.class);
69+
70+
if (! alreadyDownloaded) {
71+
assertEquals(2, logCaptor.getDebugLogs().size());
72+
checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "Download complete.");
73+
} else {
74+
assertEquals(1, logCaptor.getDebugLogs().size());
75+
checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "already exists. Skipping download.");
76+
}
7077
logCaptor.clearLogs();
7178

7279
// try to download again
73-
DownloadUtil.downloadModel("de",
74-
DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
75-
assertEquals(1, logCaptor.getDebugLogs().size());
76-
checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "already exists. Skipping download.");
77-
logCaptor.clearLogs();
78-
79-
DownloadUtil.downloadModel("de",
80-
DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
80+
DownloadUtil.downloadModel(lang, type, SentenceModel.class);
8181
assertEquals(1, logCaptor.getDebugLogs().size());
8282
checkDebugLogsContainMessageFragment(logCaptor.getDebugLogs(), "already exists. Skipping download.");
8383
logCaptor.clearLogs();

opennlp-tools/src/test/java/opennlp/tools/util/DownloadUtilTest.java

+26-4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.net.URL;
2222
import java.util.stream.Stream;
2323

24+
import org.junit.jupiter.api.Test;
2425
import org.junit.jupiter.params.ParameterizedTest;
2526
import org.junit.jupiter.params.provider.Arguments;
2627
import org.junit.jupiter.params.provider.MethodSource;
@@ -32,11 +33,12 @@
3233
import opennlp.tools.tokenize.TokenizerModel;
3334

3435
import static org.junit.jupiter.api.Assertions.assertEquals;
36+
import static org.junit.jupiter.api.Assertions.assertFalse;
3537
import static org.junit.jupiter.api.Assertions.assertNotNull;
3638
import static org.junit.jupiter.api.Assertions.assertThrows;
3739
import static org.junit.jupiter.api.Assertions.assertTrue;
3840

39-
public class DownloadUtilTest extends AbstractDownloadUtilTest {
41+
public class DownloadUtilTest {
4042

4143
@ParameterizedTest(name = "Verify \"{0}\" sentence model")
4244
@ValueSource(strings = {"en", "fr", "de", "it", "nl", "bg", "ca", "cs", "da", "el",
@@ -62,13 +64,33 @@ public void testDownloadModelByURL(String language, URL url) throws IOException
6264
assertTrue(model.isLoadedFromSerialized());
6365
}
6466

67+
@Test
68+
@EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
69+
public void testExistsModel() throws IOException {
70+
final String lang = "en";
71+
final DownloadUtil.ModelType type = DownloadUtil.ModelType.SENTENCE_DETECTOR;
72+
// Prepare
73+
SentenceModel model = DownloadUtil.downloadModel(lang, type, SentenceModel.class);
74+
assertNotNull(model);
75+
assertEquals(lang, model.getLanguage());
76+
// Test
77+
assertTrue(DownloadUtil.existsModel(lang, type));
78+
}
79+
80+
@ParameterizedTest
81+
@NullAndEmptySource
82+
@ValueSource(strings = {"xy", "\t", "\n"})
83+
@EnabledWhenCDNAvailable(hostname = "dlcdn.apache.org")
84+
public void testExistsModelInvalid(String input) throws IOException {
85+
assertFalse(DownloadUtil.existsModel(input, DownloadUtil.ModelType.SENTENCE_DETECTOR));
86+
}
87+
6588
@ParameterizedTest(name = "Detect invalid input: \"{0}\"")
6689
@NullAndEmptySource
6790
@ValueSource(strings = {" ", "\t", "\n"})
6891
public void testDownloadModelInvalid(String input) {
69-
assertThrows(IOException.class, () -> DownloadUtil.downloadModel(
70-
input, DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class),
71-
"Invalid model");
92+
assertThrows(IOException.class, () -> DownloadUtil.downloadModel(input,
93+
DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class), "Invalid model");
7294
}
7395

7496
private static final DownloadUtil.ModelType MT_TOKENIZER = DownloadUtil.ModelType.TOKENIZER;

0 commit comments

Comments
 (0)