diff --git a/pom.xml b/pom.xml
index 9feab14f..ddcea8c4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -77,7 +77,7 @@
UTF-8
17
17
- 3.0.1
+ 3.0.3
3.0.0
3.5.0
2.4.0
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/AsyncCollectionReader.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/AsyncCollectionReader.java
index 9776c364..89e12549 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/AsyncCollectionReader.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/AsyncCollectionReader.java
@@ -781,6 +781,7 @@ public static ConcurrentLinkedQueue removeIfInTarget(ConcurrentLinkedQue
addFilesToConcurrentList(targetDir, targetEnding, targetFilePaths);
}
System.out.println("Found " + targetFilePaths.size() + " files in target location");
+ System.out.println("Source location has: " + paths.size());
List cleanList = new ArrayList<>();
if (!targetFilePaths.isEmpty()) {
diff --git a/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestGoogleSERPReader.java b/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestGoogleSERPReader.java
index 9865cf7a..7290d0ef 100644
--- a/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestGoogleSERPReader.java
+++ b/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestGoogleSERPReader.java
@@ -26,6 +26,8 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.zip.GZIPOutputStream;
+import java.util.List;
+import java.util.ArrayList;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
@@ -59,7 +61,16 @@ public void testSpacy() throws Exception {
DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
new DUUIFileReader(
sourceLocation.toString(),
- "xmi.gz"
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
)
);
@@ -108,68 +119,94 @@ public void testSpacy() throws Exception {
@Test
public void testReader2() throws ParserConfigurationException, IOException, UIMAException, SAXException {
- Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/tasks/assessment_urls_html_Funkmast.csv");
- try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) {
- long counter = 0;
- boolean skipFirstLine = true;
- String line;
- while ((line = reader.readLine()) != null) {
- try {
- counter += 1;
- if (counter % 50 == 0) {
- System.out.println(counter);
+ List<String> tasks = new ArrayList<>(); // task names; each one selects a per-task "_v2.csv" export in the loop below
+ tasks.add("Gruene-Sosse");
+ tasks.add("Hitzestift");
+ tasks.add("Tetra-Pak");
+ tasks.add("Medizin-Atmung");
+ tasks.add("Medizin-Kreislauf");
+ tasks.add("Medizin-Mittelohr");
+ tasks.add("Nudging-Aufgabe");
+ tasks.add("Piloten-Streik-Aufgabe");
+ tasks.add("Startup-Aufgabe");
+ tasks.add("Start-Up-Aufgabe");
+ tasks.add("Windpark-Aufgabe");
+
+ for (String task : tasks) {
+ Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/tasks/assessment_urls_html_" + task + "_v2.csv");
+ try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) {
+ long counter = 0;
+ long countNew = 0;
+ long countExists = 0;
+ boolean skipFirstLine = true;
+ String line;
+ while ((line = reader.readLine()) != null) {
+ try {
+ counter += 1;
+ if (counter % 50 == 0) {
+ System.out.println(counter);
+ }
+
+ if (skipFirstLine) {
+ skipFirstLine = false;
+ continue;
+ }
+
+ line = line.trim();
+ String[] fields = line.split("\t", -1);
+
+ String url = fields[7];
+ if (!url.contains("google.com/search") && !url.contains("google.de/search")) {
+ continue;
+ }
+
+ String user = fields[9];
+ String session = fields[4];
+ String html = fields[10];
+
+ String title = html + ".html.gz";
+ String docId = user + "/" + session + "/" + title;
+ String collectionId = "file:/storage/projects/CORE/azure/core-edutec-fileshare/html/";
+ String docBaseUri = collectionId;
+ String docUri = docBaseUri + docId;
+
+ Path filename = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/html/" + docId);
+ Path output = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/html_xmi_google_serps/" + docId + ".xmi.gz");
+ if (Files.exists(output)) {
+ countExists++;
+ continue;
+ }
+
+ JCas jCas = HTMLGoogleSERPLoader.load(filename, null);
+
+ DocumentMetaData dmd = new DocumentMetaData(jCas);
+ dmd.setDocumentTitle(title);
+ dmd.setDocumentId(docId);
+ dmd.setDocumentUri(docUri);
+ dmd.setCollectionId(collectionId);
+ dmd.setDocumentBaseUri(docBaseUri);
+ dmd.addToIndexes();
+
+ Files.createDirectories(output.getParent());
+ try(GZIPOutputStream outputStream = new GZIPOutputStream(Files.newOutputStream(output))) {
+ XMLSerializer xmlSerializer = new XMLSerializer(outputStream, true);
+ xmlSerializer.setOutputProperty(OutputKeys.VERSION, "1.1");
+ xmlSerializer.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.toString());
+ XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null);
+ xmiCasSerializer.serialize(jCas.getCas(), xmlSerializer.getContentHandler());
+ }
+
+ countNew++;
}
-
- if (skipFirstLine) {
- skipFirstLine = false;
- continue;
- }
-
- line = line.trim();
- String[] fields = line.split(",", -1);
-
- String url = fields[7];
- //if (!url.contains("google.com/search") && !url.contains("google.de/search")) {
- if (!url.contains("google.de/search")) {
- continue;
- }
-
- String user = fields[9];
- String session = fields[4];
- String html = fields[10];
-
- String title = html + ".html.gz";
- String docId = user + "/" + session + "/" + title;
- String collectionId = "file:/storage/projects/CORE/azure/core-edutec-fileshare/html/";
- String docBaseUri = collectionId;
- String docUri = docBaseUri + docId;
-
- Path filename = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/html/" + docId);
- JCas jCas = HTMLGoogleSERPLoader.load(filename, null);
-
- DocumentMetaData dmd = new DocumentMetaData(jCas);
- dmd.setDocumentTitle(title);
- dmd.setDocumentId(docId);
- dmd.setDocumentUri(docUri);
- dmd.setCollectionId(collectionId);
- dmd.setDocumentBaseUri(docBaseUri);
- dmd.addToIndexes();
-
- Path output = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/html_xmi_google_serps/" + docId + ".xmi.gz");
- Files.createDirectories(output.getParent());
- try(GZIPOutputStream outputStream = new GZIPOutputStream(Files.newOutputStream(output))) {
- XMLSerializer xmlSerializer = new XMLSerializer(outputStream, true);
- xmlSerializer.setOutputProperty(OutputKeys.VERSION, "1.1");
- xmlSerializer.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.toString());
- XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null);
- xmiCasSerializer.serialize(jCas.getCas(), xmlSerializer.getContentHandler());
+ catch (Exception e) {
+ e.printStackTrace();
}
+ }
+ System.out.println("Count " + counter);
+ System.out.println(" New: " + countNew);
+ System.out.println(" Exists: " + countExists);
}
- catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
+ }
}
@Test
diff --git a/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestReadabilityReader.java b/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestReadabilityReader.java
old mode 100644
new mode 100755
index c3883ba7..6ff18683
--- a/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestReadabilityReader.java
+++ b/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestReadabilityReader.java
@@ -25,7 +25,10 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
import java.util.zip.GZIPOutputStream;
+import java.util.List;
+import java.util.ArrayList;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
@@ -50,17 +53,84 @@ public void testSimple() throws ParserConfigurationException, IOException, UIMAE
}
}
+ @Test
+ public void testSRLC07() throws Exception {
+ Path sourceLocation = Paths.get("/storage/projects/CORE/projects2/B05_C07_Corpus/texts_xmi_1_spacy/");
+ Path targetLocation = Paths.get("/storage/projects/CORE/projects2/B05_C07_Corpus/texts_xmi_2_srl_ht/");
+ int scale = 10;
+ // Runs the srl_cuda_1024 and heideltime_ext images over the "html.gz.xmi.gz" files found in sourceLocation.
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
+ new DUUIFileReader(
+ sourceLocation.toString(),
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
+ )
+ );
+
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true)
+ .withWorkers(scale)
+ .withLuaContext(new DUUILuaContext().withJsonLibrary());
+
+ DUUIUIMADriver uimaDriver = new DUUIUIMADriver();
+ DUUISwarmDriver swarmDriver = new DUUISwarmDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+ composer.addDriver(uimaDriver, swarmDriver, dockerDriver);
+ // Pipeline order: srl_cuda_1024 first, then heideltime_ext; both via the Swarm driver, constrained to host "isengart".
+ DUUIPipelineComponent componentSrl = new DUUISwarmDriver
+ .Component("docker.texttechnologylab.org/srl_cuda_1024:latest")
+ .withScale(scale)
+ .withConstraintHost("isengart")
+ .build();
+ composer.add(componentSrl);
+
+ DUUIPipelineComponent componentHeideltime = new DUUISwarmDriver
+ .Component("docker.texttechnologylab.org/heideltime_ext:0.3")
+ .withScale(scale)
+ .withConstraintHost("isengart")
+ .build();
+ composer.add(componentHeideltime);
+ // Write each processed CAS to targetLocation as pretty-printed, GZIP-compressed XMI 1.1 (overwrite enabled).
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(),
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"
+ )).build());
+
+ composer.run(processor, "srl_ht");
+ composer.shutdown();
+ }
+
@Test
public void testSpacy() throws Exception {
Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi");
Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy");
- int scale = 10;
+ int scale = 50;
DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
new DUUIFileReader(
sourceLocation.toString(),
- "xmi.gz"
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
)
);
@@ -84,7 +154,8 @@ public void testSpacy() throws Exception {
DUUISegmentationStrategyByAnnotation strategy = new DUUISegmentationStrategyByAnnotation()
.withSegmentationClass(Paragraph.class)
.withMaxAnnotationsPerSegment(1)
- .withMaxCharsPerSegment(1000000);
+ .withMaxCharsPerSegment(1000000)
+ .withPrintStatistics(false);
DUUIPipelineComponent componentSpacy = new DUUISwarmDriver.Component("docker.texttechnologylab.org/duui-spacy:0.4.3")
//DUUIPipelineComponent componentSpacy = new DUUISwarmDriver.Component("docker.texttechnologylab.org/duui-spacy-de_core_news_lg:0.4.1")
@@ -107,17 +178,20 @@ public void testSpacy() throws Exception {
}
@Test
- public void testReader2() throws ParserConfigurationException, IOException, UIMAException, SAXException {
- Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/tasks/assessment_urls_texts_Nudging-Aufgabe.csv");
- try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) {
- long counter = 0;
- boolean skipFirstLine = true;
- String line;
- while ((line = reader.readLine()) != null) {
+ public void testReaderAlle() throws ParserConfigurationException, IOException, UIMAException, SAXException {
+ Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/texts_t0_2024_07_29.csv");
+ System.out.println(listFile);
+ try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) {
+ long countNew = 0;
+ long countExists = 0;
+ long counter = 0;
+ boolean skipFirstLine = true;
+ String line;
+ while ((line = reader.readLine()) != null) {
try {
counter += 1;
- if (counter % 50 == 0) {
- System.out.println(counter);
+ if (counter % 1000 == 0) {
+ System.out.println("C07: " + counter);
}
if (skipFirstLine) {
@@ -126,11 +200,14 @@ public void testReader2() throws ParserConfigurationException, IOException, UIMA
}
line = line.trim();
- String[] fields = line.split(",", -1);
+ String[] fields = line.split("\t", -1);
- String user = fields[9];
- String session = fields[4];
- String html = fields[10];
+ String user = fields[12];
+ if (user.startsWith("\"")) {
+ user = user.substring(1, user.length()-1);
+ }
+ String session = fields[9];
+ String html = fields[14];
String title = html + ".html.gz";
String docId = user + "/" + session + "/" + title;
@@ -140,8 +217,10 @@ public void testReader2() throws ParserConfigurationException, IOException, UIMA
Path filename = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts/" + docId);
Path output = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi/" + docId + ".xmi.gz");
+ System.out.println(output);
if (Files.exists(output)) {
System.out.println("exists: " + output.toString());
+ countExists++;
continue;
}
@@ -163,12 +242,242 @@ public void testReader2() throws ParserConfigurationException, IOException, UIMA
XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null);
xmiCasSerializer.serialize(jCas.getCas(), xmlSerializer.getContentHandler());
}
+
+ countNew++;
}
catch (Exception e) {
e.printStackTrace();
}
- }
- }
+ }
+
+ System.out.println("Count " + counter);
+ System.out.println(" New: " + countNew);
+ System.out.println(" Exists: " + countExists);
+ }
+ }
+ @Test
+ public void testExportC07() throws ParserConfigurationException, IOException, UIMAException, SAXException {
+ Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/assessment_urls_texts_4_c07.csv");
+ System.out.println(listFile);
+ try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) {
+ long countOk = 0;
+ long countError = 0;
+ long counter = 0;
+ boolean skipFirstLine = true;
+ String line;
+ while ((line = reader.readLine()) != null) {
+ try {
+ if (skipFirstLine) {
+ skipFirstLine = false;
+ continue;
+ }
+
+ counter += 1;
+ if (counter % 1000 == 0) {
+ System.out.println("C07: " + counter);
+ }
+
+ line = line.trim();
+ String[] fields = line.split("\t", -1);
+
+ String user = fields[11];
+ String session = fields[6];
+ String html = fields[12];
+
+ String title = html + ".html.gz.xmi.gz";
+ String docId = user + "/" + session + "/" + title;
+ Path input = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy/" + docId);
+ //System.out.println(input);
+
+ if (Files.exists(input)) {
+ countOk++;
+
+ String docIdOut = user + "_" + session + "_" + title;
+ Path output = Paths.get("/storage/projects/CORE/projects2/B05_C07_Corpus/texts_xmi_1_spacy/" + docIdOut);
+ if (!Files.exists(output)) {
+ //System.out.println(output);
+ Files.copy(input, output, StandardCopyOption.COPY_ATTRIBUTES);
+ }
+ }
+ else {
+ countError++;
+ }
+ }
+ catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ System.out.println("Count : " + counter);
+ System.out.println(" Ok: " + countOk);
+ System.out.println(" Error: " + countError);
+ }
+ }
+
+ @Test
+ public void testReaderC07() throws ParserConfigurationException, IOException, UIMAException, SAXException {
+ Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/assessment_urls_texts_4_c07.csv");
+ System.out.println(listFile);
+ try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) {
+ long countNew = 0;
+ long countExists = 0;
+ long counter = 0;
+ boolean skipFirstLine = true;
+ String line;
+ while ((line = reader.readLine()) != null) {
+ try {
+ counter += 1;
+ if (counter % 1000 == 0) {
+ System.out.println("C07: " + counter);
+ }
+
+ if (skipFirstLine) {
+ skipFirstLine = false;
+ continue;
+ }
+
+ line = line.trim();
+ String[] fields = line.split("\t", -1);
+
+ String user = fields[11];
+ String session = fields[6];
+ String html = fields[12];
+
+ String title = html + ".html.gz";
+ String docId = user + "/" + session + "/" + title;
+ String collectionId = "file:/storage/projects/CORE/azure/core-edutec-fileshare/texts/";
+ String docBaseUri = collectionId;
+ String docUri = docBaseUri + docId;
+
+ Path filename = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts/" + docId);
+ Path output = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi/" + docId + ".xmi.gz");
+ System.out.println(output);
+ if (Files.exists(output)) {
+ System.out.println("exists: " + output.toString());
+ countExists++;
+ continue;
+ }
+
+ JCas jCas = HTMLReadabilityLoader.load(filename, null);
+
+ DocumentMetaData dmd = new DocumentMetaData(jCas);
+ dmd.setDocumentTitle(title);
+ dmd.setDocumentId(docId);
+ dmd.setDocumentUri(docUri);
+ dmd.setCollectionId(collectionId);
+ dmd.setDocumentBaseUri(docBaseUri);
+ dmd.addToIndexes();
+
+ Files.createDirectories(output.getParent());
+ try(GZIPOutputStream outputStream = new GZIPOutputStream(Files.newOutputStream(output))) {
+ XMLSerializer xmlSerializer = new XMLSerializer(outputStream, true);
+ xmlSerializer.setOutputProperty(OutputKeys.VERSION, "1.1");
+ xmlSerializer.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.toString());
+ XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null);
+ xmiCasSerializer.serialize(jCas.getCas(), xmlSerializer.getContentHandler());
+ }
+
+ countNew++;
+ }
+ catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ System.out.println("Count " + counter);
+ System.out.println(" New: " + countNew);
+ System.out.println(" Exists: " + countExists);
+ }
+ }
+
+ @Test
+ public void testReader2() throws ParserConfigurationException, IOException, UIMAException, SAXException {
+ List<String> tasks = new ArrayList<>(); // typed list: the loop below iterates it as String
+ tasks.add("Start-Up-Aufgabe");
+ /*tasks.add("Medizin-Kreislauf");
+ tasks.add("Medizin-Mittelohr");
+ tasks.add("Piloten-Streik-Aufgabe");
+ tasks.add("Startup-Aufgabe");
+ tasks.add("Windpark-Aufgabe");
+ tasks.add("Nudging-Aufgabe");
+ tasks.add("Medizin-Atmung");
+ tasks.add("Hitzestift");
+ tasks.add("Gruene-Sosse");
+ tasks.add("Tetra-Pak");*/
+ // For every listed task: read its "_v2.csv" list and convert each referenced HTML file to gzipped XMI, skipping outputs that already exist.
+ for (String task : tasks) {
+ Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/tasks/assessment_urls_texts_" + task + "_v2.csv");
+ System.out.println(listFile);
+ try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) {
+ long countNew = 0;
+ long countExists = 0;
+ long counter = 0;
+ boolean skipFirstLine = true;
+ String line;
+ while ((line = reader.readLine()) != null) {
+ try {
+ counter += 1;
+ if (counter % 1000 == 0) {
+ System.out.println(task + ": " + counter);
+ }
+ // The first line of the file is skipped (presumably the CSV header); note it is still included in counter.
+ if (skipFirstLine) {
+ skipFirstLine = false;
+ continue;
+ }
+
+ line = line.trim();
+ String[] fields = line.split("\t", -1);
+ // NOTE(review): column positions 9/4/10 are assumed to match the "_v2" export layout; confirm against the file header.
+ String user = fields[9];
+ String session = fields[4];
+ String html = fields[10];
+
+ String title = html + ".html.gz";
+ String docId = user + "/" + session + "/" + title;
+ String collectionId = "file:/storage/projects/CORE/azure/core-edutec-fileshare/texts/";
+ String docBaseUri = collectionId;
+ String docUri = docBaseUri + docId;
+
+ Path filename = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts/" + docId);
+ Path output = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi/" + docId + ".xmi.gz");
+ if (Files.exists(output)) {
+ //System.out.println("exists: " + output.toString());
+ countExists++;
+ continue;
+ }
+ // No output yet: load the HTML into a CAS and attach the document metadata built above.
+ JCas jCas = HTMLReadabilityLoader.load(filename, null);
+
+ DocumentMetaData dmd = new DocumentMetaData(jCas);
+ dmd.setDocumentTitle(title);
+ dmd.setDocumentId(docId);
+ dmd.setDocumentUri(docUri);
+ dmd.setCollectionId(collectionId);
+ dmd.setDocumentBaseUri(docBaseUri);
+ dmd.addToIndexes();
+ // Serialize the CAS as GZIP-compressed XMI (XML 1.1, UTF-8), creating parent directories first.
+ Files.createDirectories(output.getParent());
+ try(GZIPOutputStream outputStream = new GZIPOutputStream(Files.newOutputStream(output))) {
+ XMLSerializer xmlSerializer = new XMLSerializer(outputStream, true);
+ xmlSerializer.setOutputProperty(OutputKeys.VERSION, "1.1");
+ xmlSerializer.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.toString());
+ XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null);
+ xmiCasSerializer.serialize(jCas.getCas(), xmlSerializer.getContentHandler());
+ }
+
+ countNew++;
+ }
+ catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ System.out.println("Count " + counter);
+ System.out.println(" New: " + countNew);
+ System.out.println(" Exists: " + countExists);
+ }
+ }
}
@Test
@@ -200,4 +509,587 @@ public void testReader() throws Exception {
composer.run(reader, "readability_html");
}
+
+
+ @Test
+ public void testDDC2() throws Exception {
+ // DONE
+ Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy");
+ Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/topic_ddc2_100");
+ int scale = 20;
+
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
+ new DUUIFileReader(
+ sourceLocation.toString(),
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
+ )
+ );
+
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true)
+ .withWorkers(scale)
+ .withLuaContext(new DUUILuaContext().withJsonLibrary());
+
+ DUUIUIMADriver uimaDriver = new DUUIUIMADriver();
+ DUUISwarmDriver swarmDriver = new DUUISwarmDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+ composer.addDriver(uimaDriver, swarmDriver, dockerDriver);
+
+
+ String ddcVariant = "ddc2_dim100";
+ composer.add(new DUUISwarmDriver.
+ Component("docker.texttechnologylab.org/textimager-duui-ddc-fasttext:latest")
+ .withParameter("ddc_variant", ddcVariant)
+ .withParameter("selection", "text")
+ .withScale(scale)
+ .withConstraintHost("isengart")
+ .build()
+ );
+
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(),
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"
+ )).build());
+
+ composer.run(processor, "spacy_plus");
+ composer.shutdown();
+ }
+
+ @Test
+ public void testTopicCardiffnlp() throws Exception {
+ // (NOT) DONE
+ Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy");
+ Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/topic_cardiffnlp");
+ int scale = 2;
+
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
+ new DUUIFileReader(
+ sourceLocation.toString(),
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
+ )
+ );
+
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true)
+ .withWorkers(scale)
+ .withLuaContext(new DUUILuaContext().withJsonLibrary());
+
+ DUUIUIMADriver uimaDriver = new DUUIUIMADriver();
+ DUUISwarmDriver swarmDriver = new DUUISwarmDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+ composer.addDriver(uimaDriver, swarmDriver, dockerDriver);
+
+
+ String model = "cardiffnlp/tweet-topic-latest-multi";
+ composer.add(new DUUISwarmDriver.
+ Component("docker.texttechnologylab.org/duui-transformers-topic:latest")
+ .withParameter("model_name", model)
+ .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
+ .withScale(scale)
+ .withConstraintHost("isengart")
+ .build()
+ );
+
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(),
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"
+ )).build());
+
+ composer.run(processor, "spacy_plus");
+ composer.shutdown();
+ }
+
+ @Test
+ public void testGenreClassla() throws Exception {
+ // (NOT) DONE - runs the classla genre classifier over the texts_xmi_spacy files and writes the result to texts_xmi_spacy_plus/genre_classla
+ Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy");
+ Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/genre_classla");
+ int scale = 1;
+ // The reader also receives the target folder and suffix (presumably to skip already-processed files; confirm in DUUIFileReader).
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
+ new DUUIFileReader(
+ sourceLocation.toString(),
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
+ )
+ );
+
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true)
+ .withWorkers(scale)
+ .withLuaContext(new DUUILuaContext().withJsonLibrary());
+ // Register every driver type used below; the classifier component is created through the Kubernetes driver.
+ DUUIUIMADriver uimaDriver = new DUUIUIMADriver();
+ DUUISwarmDriver swarmDriver = new DUUISwarmDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+ DUUIKubernetesDriver kubernetesDriver = new DUUIKubernetesDriver(); // was "new DUUIDockerDriver()": wrong type for this variable
+ composer.addDriver(uimaDriver, swarmDriver, dockerDriver, kubernetesDriver);
+
+
+ // Model identifier handed to the component through its "model_name" parameter.
+ String model = "classla/xlm-roberta-base-multilingual-text-genre-classifier";
+ //composer.add(new DUUISwarmDriver.
+ composer.add(new DUUIKubernetesDriver.
+ Component("docker.texttechnologylab.org/duui-transformers-topic:latest")
+ .withParameter("model_name", model)
+ // .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
+ .withParameter("selection", "text")
+ .withScale(scale)
+ .withConstraintHost("isengart")
+ .withLabels("hostname=isengart")
+ .build()
+ );
+ // Write each processed CAS to targetLocation as pretty-printed, GZIP-compressed XMI 1.1 (overwrite enabled).
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(),
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"
+ )).build());
+
+ composer.run(processor, "spacy_plus");
+ composer.shutdown();
+ }
+
+/*
+ @Test
+ public void testGenreClasslaSents() throws Exception {
+ // (NOT) DONE
+ Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy");
+ Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/genre_classla_sents");
+ int scale = 1;
+
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
+ new DUUIFileReader(
+ sourceLocation.toString(),
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
+ )
+ );
+
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true)
+ .withWorkers(scale)
+ .withLuaContext(new DUUILuaContext().withJsonLibrary());
+
+ DUUIUIMADriver uimaDriver = new DUUIUIMADriver();
+ DUUISwarmDriver swarmDriver = new DUUISwarmDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+ composer.addDriver(uimaDriver, swarmDriver, dockerDriver);
+
+
+ String model = "";
+ model = "classla/xlm-roberta-base-multilingual-text-genre-classifier";
+ composer.add(new DUUISwarmDriver.
+ Component("docker.texttechnologylab.org/duui-transformers-topic:latest")
+ .withParameter("model_name", model)
+ .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
+ // .withParameter("selection", "text")
+ .withScale(scale)
+ .withConstraintHost("isengart")
+ .build()
+ );
+
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(),
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"
+ )).build());
+
+ composer.run(processor, "spacy_plus");
+ composer.shutdown();
+ }
+
+
+ @Test
+ public void testSentimentCardiffnlp() throws Exception {
+ // (NOT) DONE
+ Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy");
+ Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/sentiment_cardiffnlp");
+ int scale = 1;
+
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
+ new DUUIFileReader(
+ sourceLocation.toString(),
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
+ )
+ );
+
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true)
+ .withWorkers(scale)
+ .withLuaContext(new DUUILuaContext().withJsonLibrary());
+
+ DUUIUIMADriver uimaDriver = new DUUIUIMADriver();
+ DUUISwarmDriver swarmDriver = new DUUISwarmDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+ composer.addDriver(uimaDriver, swarmDriver, dockerDriver);
+
+
+ String model = "";
+ model = "cardiffnlp/twitter-xlm-roberta-base-sentiment";
+ composer.add(new DUUISwarmDriver.
+ Component("docker.texttechnologylab.org/duui-transformers-sentiment:latest")
+ .withParameter("model_name", model)
+ .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
+ // .withParameter("selection", "text")
+ .withScale(scale)
+ .withConstraintHost("isengart")
+ .build()
+ );
+
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(),
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"
+ )).build());
+
+ composer.run(processor, "spacy_plus");
+ composer.shutdown();
+ }
+
+
+ @Test
+ public void testSentimentNLPTown() throws Exception {
+ // (NOT) DONE
+ Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy");
+ Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/sentiment_nlptown");
+ int scale = 1;
+
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
+ new DUUIFileReader(
+ sourceLocation.toString(),
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
+ )
+ );
+
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true)
+ .withWorkers(scale)
+ .withLuaContext(new DUUILuaContext().withJsonLibrary());
+
+ DUUIUIMADriver uimaDriver = new DUUIUIMADriver();
+ DUUISwarmDriver swarmDriver = new DUUISwarmDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+ composer.addDriver(uimaDriver, swarmDriver, dockerDriver);
+
+
+ String model = "";
+ model = "nlptown/bert-base-multilingual-uncased-sentiment";
+ composer.add(new DUUISwarmDriver.
+ Component("docker.texttechnologylab.org/duui-transformers-sentiment:latest")
+ .withParameter("model_name", model)
+ .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
+ // .withParameter("selection", "text")
+ .withScale(scale)
+ .withConstraintHost("isengart")
+ .build()
+ );
+
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(),
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"
+ )).build());
+
+ composer.run(processor, "spacy_plus");
+ composer.shutdown();
+ }
+
+ @Test
+ public void testSentimentVader() throws Exception {
+ // (NOT) DONE - reads the texts_xmi_spacy XMI files, adds the textimager-duui-vader-sentiment component and writes gzipped XMI to sentiment_vader.
+ Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy");
+ Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/sentiment_vader");
+ int scale = 1;
+ // Reader over "html.gz.xmi.gz" files in sourceLocation; the target dir and ending are passed as well - presumably to skip inputs already written there (TODO confirm against DUUIFileReader).
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
+ new DUUIFileReader(
+ sourceLocation.toString(),
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
+ )
+ );
+ // Composer worker count and component scale below are both taken from the same scale variable.
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true)
+ .withWorkers(scale)
+ .withLuaContext(new DUUILuaContext().withJsonLibrary());
+ // All three drivers are registered, although only Swarm and UIMA components are added below.
+ DUUIUIMADriver uimaDriver = new DUUIUIMADriver();
+ DUUISwarmDriver swarmDriver = new DUUISwarmDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+ composer.addDriver(uimaDriver, swarmDriver, dockerDriver);
+
+ // Swarm component constrained to host "isengart"; the Sentence type is passed as its "selection" parameter.
+ composer.add(new DUUISwarmDriver.
+ Component("docker.texttechnologylab.org/textimager-duui-vader-sentiment:latest")
+ .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
+ // disabled alternative: .withParameter("selection", "text")
+ .withScale(scale)
+ .withConstraintHost("isengart")
+ .build()
+ );
+ // Last component: XmiWriter into targetLocation - pretty-printed, XMI version "1.1", GZIP compression, overwrite enabled.
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(),
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"
+ )).build());
+ // "spacy_plus" is presumably the run name (shared by the sibling tests); shutdown() is skipped if run() throws, there is no try/finally.
+ composer.run(processor, "spacy_plus");
+ composer.shutdown();
+ }
+
+
+ @Test
+ public void testToxicCitizenlab() throws Exception {
+ // (NOT) DONE - reads the texts_xmi_spacy XMI files, adds the duui-transformers-toxic component with the citizenlab model and writes gzipped XMI to toxicity_citizenlab.
+ Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy");
+ Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/toxicity_citizenlab");
+ int scale = 1;
+ // Reader over "html.gz.xmi.gz" files in sourceLocation; the target dir and ending are passed as well - presumably to skip inputs already written there (TODO confirm against DUUIFileReader).
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
+ new DUUIFileReader(
+ sourceLocation.toString(),
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
+ )
+ );
+ // Composer worker count and component scale below are both taken from the same scale variable.
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true)
+ .withWorkers(scale)
+ .withLuaContext(new DUUILuaContext().withJsonLibrary());
+ // All three drivers are registered, although only Swarm and UIMA components are added below.
+ DUUIUIMADriver uimaDriver = new DUUIUIMADriver();
+ DUUISwarmDriver swarmDriver = new DUUISwarmDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+ composer.addDriver(uimaDriver, swarmDriver, dockerDriver);
+
+ // Model identifier handed to the component as "model_name"; the empty initial value is overwritten on the very next line.
+ String model = "";
+ model = "citizenlab/distilbert-base-multilingual-cased-toxicity";
+ composer.add(new DUUISwarmDriver.
+ Component("docker.texttechnologylab.org/duui-transformers-toxic:latest")
+ .withParameter("model_name", model)
+ .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
+ // disabled alternative: .withParameter("selection", "text")
+ .withScale(scale)
+ .withConstraintHost("isengart")
+ .build()
+ );
+ // Last component: XmiWriter into targetLocation - pretty-printed, XMI version "1.1", GZIP compression, overwrite enabled.
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(),
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"
+ )).build());
+ // "spacy_plus" is presumably the run name (shared by the sibling tests); shutdown() is skipped if run() throws, there is no try/finally.
+ composer.run(processor, "spacy_plus");
+ composer.shutdown();
+ }
+
+
+ @Test
+ public void testToxicEIS() throws Exception {
+ // (NOT) DONE - reads the texts_xmi_spacy XMI files, adds the duui-transformers-toxic component with the EIStakovskii model and writes gzipped XMI to toxicity_eis.
+ Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy");
+ Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/toxicity_eis");
+ int scale = 1;
+ // Reader over "html.gz.xmi.gz" files in sourceLocation; the target dir and ending are passed as well - presumably to skip inputs already written there (TODO confirm against DUUIFileReader).
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
+ new DUUIFileReader(
+ sourceLocation.toString(),
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
+ )
+ );
+ // Composer worker count and component scale below are both taken from the same scale variable.
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true)
+ .withWorkers(scale)
+ .withLuaContext(new DUUILuaContext().withJsonLibrary());
+ // All three drivers are registered, although only Swarm and UIMA components are added below.
+ DUUIUIMADriver uimaDriver = new DUUIUIMADriver();
+ DUUISwarmDriver swarmDriver = new DUUISwarmDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+ composer.addDriver(uimaDriver, swarmDriver, dockerDriver);
+
+ // Model identifier handed to the component as "model_name"; the empty initial value is overwritten on the very next line.
+ String model = "";
+ model = "EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus";
+ composer.add(new DUUISwarmDriver.
+ Component("docker.texttechnologylab.org/duui-transformers-toxic:latest")
+ .withParameter("model_name", model)
+ .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
+ // disabled alternative: .withParameter("selection", "text")
+ .withScale(scale)
+ .withConstraintHost("isengart")
+ .build()
+ );
+ // Last component: XmiWriter into targetLocation - pretty-printed, XMI version "1.1", GZIP compression, overwrite enabled.
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(),
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"
+ )).build());
+ // "spacy_plus" is presumably the run name (shared by the sibling tests); shutdown() is skipped if run() throws, there is no try/finally.
+ composer.run(processor, "spacy_plus");
+ composer.shutdown();
+ }
+
+
+ @Test
+ public void testToxicFredZhang7() throws Exception {
+ // (NOT) DONE - reads the texts_xmi_spacy XMI files, adds the duui-transformers-toxic component with the FredZhang7 model and writes gzipped XMI to toxicity_FredZhang7. Renamed from a second testToxicEIS, which would clash with the EIS test once this block is uncommented.
+ Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy");
+ Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/toxicity_FredZhang7");
+ int scale = 1;
+ // Reader over "html.gz.xmi.gz" files in sourceLocation; the target dir and ending are passed as well - presumably to skip inputs already written there (TODO confirm against DUUIFileReader).
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(
+ new DUUIFileReader(
+ sourceLocation.toString(),
+ "html.gz.xmi.gz",
+ 1,
+ -1,
+ false,
+ "",
+ false,
+ null,
+ -1,
+ targetLocation.toString(),
+ "html.gz.xmi.gz"
+ )
+ );
+ // Composer worker count and component scale below are both taken from the same scale variable.
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true)
+ .withWorkers(scale)
+ .withLuaContext(new DUUILuaContext().withJsonLibrary());
+ // All three drivers are registered, although only Swarm and UIMA components are added below.
+ DUUIUIMADriver uimaDriver = new DUUIUIMADriver();
+ DUUISwarmDriver swarmDriver = new DUUISwarmDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+ composer.addDriver(uimaDriver, swarmDriver, dockerDriver);
+
+
+ // Model identifier handed to the component as "model_name".
+ String model = "FredZhang7/one-for-all-toxicity-v3";
+ composer.add(new DUUISwarmDriver.
+ Component("docker.texttechnologylab.org/duui-transformers-toxic:latest")
+ .withParameter("model_name", model)
+ .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
+ // disabled alternative: .withParameter("selection", "text")
+ .withScale(scale)
+ .withConstraintHost("isengart")
+ .build()
+ );
+ // Last component: XmiWriter into targetLocation - pretty-printed, XMI version "1.1", GZIP compression, overwrite enabled.
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(),
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"
+ )).build());
+ // "spacy_plus" is presumably the run name (shared by the sibling tests); shutdown() is skipped if run() throws, there is no try/finally.
+ composer.run(processor, "spacy_plus");
+ composer.shutdown();
+ }*/
}