From ac396dc644261426701e2121b31e57b917ab5679 Mon Sep 17 00:00:00 2001 From: Daniel Baumartz Date: Wed, 7 Aug 2024 11:55:15 +0200 Subject: [PATCH] readability --- pom.xml | 2 +- .../io/AsyncCollectionReader.java | 1 + .../TestGoogleSERPReader.java | 157 +-- .../TestReadabilityReader.java | 928 +++++++++++++++++- 4 files changed, 1009 insertions(+), 79 deletions(-) mode change 100644 => 100755 src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestReadabilityReader.java diff --git a/pom.xml b/pom.xml index 9feab14f..ddcea8c4 100644 --- a/pom.xml +++ b/pom.xml @@ -77,7 +77,7 @@ UTF-8 17 17 - 3.0.1 + 3.0.3 3.0.0 3.5.0 2.4.0 diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/AsyncCollectionReader.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/AsyncCollectionReader.java index 8496d3c6..2ff31153 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/AsyncCollectionReader.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/AsyncCollectionReader.java @@ -800,6 +800,7 @@ public static ConcurrentLinkedQueue removeIfInTarget(ConcurrentLinkedQue addFilesToConcurrentList(targetDir, targetEnding, targetFilePaths); } System.out.println("Found " + targetFilePaths.size() + " files in target location"); + System.out.println("Source location has: " + paths.size()); List cleanList = new ArrayList<>(); if (!targetFilePaths.isEmpty()) { diff --git a/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestGoogleSERPReader.java b/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestGoogleSERPReader.java index 9865cf7a..7290d0ef 100644 --- a/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestGoogleSERPReader.java +++ b/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestGoogleSERPReader.java @@ -26,6 +26,8 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.zip.GZIPOutputStream; +import java.util.List; +import java.util.ArrayList; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -59,7 +61,16 @@ public void testSpacy() throws Exception { DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( new DUUIFileReader( sourceLocation.toString(), - "xmi.gz" + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" ) ); @@ -108,68 +119,94 @@ public void testSpacy() throws Exception { @Test public void testReader2() throws ParserConfigurationException, IOException, UIMAException, SAXException { - Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/tasks/assessment_urls_html_Funkmast.csv"); - try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) { - long counter = 0; - boolean skipFirstLine = true; - String line; - while ((line = reader.readLine()) != null) { - try { - counter += 1; - if (counter % 50 == 0) { - System.out.println(counter); + List tasks = new ArrayList<>(); + tasks.add("Gruene-Sosse"); + tasks.add("Hitzestift"); + tasks.add("Tetra-Pak"); + tasks.add("Medizin-Atmung"); + tasks.add("Medizin-Kreislauf"); + tasks.add("Medizin-Mittelohr"); + tasks.add("Nudging-Aufgabe"); + tasks.add("Piloten-Streik-Aufgabe"); + tasks.add("Startup-Aufgabe"); + tasks.add("Start-Up-Aufgabe"); + tasks.add("Windpark-Aufgabe"); + + for (String task : tasks) { + Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/tasks/assessment_urls_html_" + task + "_v2.csv"); + try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) { + long counter = 0; + long countNew = 0; + long countExists = 0; + boolean skipFirstLine = true; + String line; + while ((line = reader.readLine()) != null) { + try { + counter += 1; + if (counter % 50 == 0) { + System.out.println(counter); + } + + if (skipFirstLine) { + skipFirstLine = false; + continue; + } + + line = line.trim(); + String[] fields = line.split("\t", -1); + + String url = fields[7]; + if (!url.contains("google.com/search") && !url.contains("google.de/search")) { + continue; + } + + String user = fields[9]; + String session = fields[4]; + String html = fields[10]; + + String title = html + ".html.gz"; + String docId = user + "/" + session + "/" + title; + String collectionId = "file:/storage/projects/CORE/azure/core-edutec-fileshare/html/"; + String docBaseUri = collectionId; + String docUri = docBaseUri + docId; + + Path filename = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/html/" + docId); + Path output = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/html_xmi_google_serps/" + docId + ".xmi.gz"); + if (Files.exists(output)) { + countExists++; + continue; + } + + JCas jCas = HTMLGoogleSERPLoader.load(filename, null); + + DocumentMetaData dmd = new DocumentMetaData(jCas); + dmd.setDocumentTitle(title); + dmd.setDocumentId(docId); + dmd.setDocumentUri(docUri); + dmd.setCollectionId(collectionId); + dmd.setDocumentBaseUri(docBaseUri); + dmd.addToIndexes(); + + Files.createDirectories(output.getParent()); + try(GZIPOutputStream outputStream = new GZIPOutputStream(Files.newOutputStream(output))) { + XMLSerializer xmlSerializer = new XMLSerializer(outputStream, true); + xmlSerializer.setOutputProperty(OutputKeys.VERSION, "1.1"); + xmlSerializer.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.toString()); + XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null); + xmiCasSerializer.serialize(jCas.getCas(), xmlSerializer.getContentHandler()); + } + + countNew++; } - - if (skipFirstLine) { - skipFirstLine = false; - continue; - } - - line = line.trim(); - String[] fields = line.split(",", -1); - - String url = fields[7]; - //if (!url.contains("google.com/search") && !url.contains("google.de/search")) { - if (!url.contains("google.de/search")) { - continue; - } - - String user = fields[9]; - String session = fields[4]; - String html = fields[10]; - - String title = html + ".html.gz"; - String docId = user + "/" + session + "/" + title; - String collectionId = "file:/storage/projects/CORE/azure/core-edutec-fileshare/html/"; - String docBaseUri = collectionId; - String docUri = docBaseUri + docId; - - Path filename = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/html/" + docId); - JCas jCas = HTMLGoogleSERPLoader.load(filename, null); - - DocumentMetaData dmd = new DocumentMetaData(jCas); - dmd.setDocumentTitle(title); - dmd.setDocumentId(docId); - dmd.setDocumentUri(docUri); - dmd.setCollectionId(collectionId); - dmd.setDocumentBaseUri(docBaseUri); - dmd.addToIndexes(); - - Path output = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/html_xmi_google_serps/" + docId + ".xmi.gz"); - Files.createDirectories(output.getParent()); - try(GZIPOutputStream outputStream = new GZIPOutputStream(Files.newOutputStream(output))) { - XMLSerializer xmlSerializer = new XMLSerializer(outputStream, true); - xmlSerializer.setOutputProperty(OutputKeys.VERSION, "1.1"); - xmlSerializer.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.toString()); - XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null); - xmiCasSerializer.serialize(jCas.getCas(), xmlSerializer.getContentHandler()); + catch (Exception e) { + e.printStackTrace(); } + } + System.out.println("Count " + counter); + System.out.println(" New: " + countNew); + System.out.println(" Exists: " + countExists); } - catch (Exception e) { - e.printStackTrace(); - } - } - } + } } @Test diff --git a/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestReadabilityReader.java b/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestReadabilityReader.java old mode 100644 new mode 100755 index c3883ba7..6ff18683 --- a/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestReadabilityReader.java +++ b/src/test/java/org/texttechnologylab/DockerUnifiedUIMAInterface/TestReadabilityReader.java @@ -25,7 +25,10 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; import java.util.zip.GZIPOutputStream; +import java.util.List; +import java.util.ArrayList; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; @@ -50,17 +53,84 @@ public void testSimple() throws ParserConfigurationException, IOException, UIMAE } } + @Test + public void testSRLC07() throws Exception { + Path sourceLocation = Paths.get("/storage/projects/CORE/projects2/B05_C07_Corpus/texts_xmi_1_spacy/"); + Path targetLocation = Paths.get("/storage/projects/CORE/projects2/B05_C07_Corpus/texts_xmi_2_srl_ht/"); + int scale = 10; + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( + new DUUIFileReader( + sourceLocation.toString(), + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" + ) + ); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) + .withWorkers(scale) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver(); + DUUISwarmDriver swarmDriver = new DUUISwarmDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + composer.addDriver(uimaDriver, swarmDriver, dockerDriver); + + DUUIPipelineComponent componentLang = new DUUISwarmDriver + .Component("docker.texttechnologylab.org/srl_cuda_1024:latest") + .withScale(scale) + .withConstraintHost("isengart") + .build(); + composer.add(componentLang); + + DUUIPipelineComponent componentSpacy = new DUUISwarmDriver + .Component("docker.texttechnologylab.org/heideltime_ext:0.3") + .withScale(scale) + .withConstraintHost("isengart") + .build(); + composer.add(componentSpacy); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(), + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP" + )).build()); + + composer.run(processor, "srl_ht"); + composer.shutdown(); + } + @Test public void testSpacy() throws Exception { Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi"); Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy"); - int scale = 10; + int scale = 50; DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( new DUUIFileReader( sourceLocation.toString(), - "xmi.gz" + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" ) ); @@ -84,7 +154,8 @@ public void testSpacy() throws Exception { DUUISegmentationStrategyByAnnotation strategy = new DUUISegmentationStrategyByAnnotation() .withSegmentationClass(Paragraph.class) .withMaxAnnotationsPerSegment(1) - .withMaxCharsPerSegment(1000000); + .withMaxCharsPerSegment(1000000) + .withPrintStatistics(false); DUUIPipelineComponent componentSpacy = new DUUISwarmDriver.Component("docker.texttechnologylab.org/duui-spacy:0.4.3") //DUUIPipelineComponent componentSpacy = new DUUISwarmDriver.Component("docker.texttechnologylab.org/duui-spacy-de_core_news_lg:0.4.1") @@ -107,17 +178,20 @@ public void testSpacy() throws Exception { } @Test - public void testReader2() throws ParserConfigurationException, IOException, UIMAException, SAXException { - Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/tasks/assessment_urls_texts_Nudging-Aufgabe.csv"); - try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) { - long counter = 0; - boolean skipFirstLine = true; - String line; - while ((line = reader.readLine()) != null) { + public void testReaderAlle() throws ParserConfigurationException, IOException, UIMAException, SAXException { + Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/texts_t0_2024_07_29.csv"); + System.out.println(listFile); + try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) { + long countNew = 0; + long countExists = 0; + long counter = 0; + boolean skipFirstLine = true; + String line; + while ((line = reader.readLine()) != null) { try { counter += 1; - if (counter % 50 == 0) { - System.out.println(counter); + if (counter % 1000 == 0) { + System.out.println("C07: " + counter); } if (skipFirstLine) { @@ -126,11 +200,14 @@ public void testReader2() throws ParserConfigurationException, IOException, UIMA } line = line.trim(); - String[] fields = line.split(",", -1); + String[] fields = line.split("\t", -1); - String user = fields[9]; - String session = fields[4]; - String html = fields[10]; + String user = fields[12]; + if (user.startsWith("\"")) { + user = user.substring(1, user.length()-1); + } + String session = fields[9]; + String html = fields[14]; String title = html + ".html.gz"; String docId = user + "/" + session + "/" + title; @@ -140,8 +217,10 @@ public void testReader2() throws ParserConfigurationException, IOException, UIMA Path filename = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts/" + docId); Path output = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi/" + docId + ".xmi.gz"); + System.out.println(output); if (Files.exists(output)) { System.out.println("exists: " + output.toString()); + countExists++; continue; } @@ -163,12 +242,242 @@ public void testReader2() throws ParserConfigurationException, IOException, UIMA XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null); xmiCasSerializer.serialize(jCas.getCas(), xmlSerializer.getContentHandler()); } + + countNew++; } catch (Exception e) { e.printStackTrace(); } - } - } + } + + System.out.println("Count " + counter); + System.out.println(" New: " + countNew); + System.out.println(" Exists: " + countExists); + } + } + @Test + public void testExportC07() throws ParserConfigurationException, IOException, UIMAException, SAXException { + Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/assessment_urls_texts_4_c07.csv"); + System.out.println(listFile); + try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) { + long countOk = 0; + long countError = 0; + long counter = 0; + boolean skipFirstLine = true; + String line; + while ((line = reader.readLine()) != null) { + try { + if (skipFirstLine) { + skipFirstLine = false; + continue; + } + + counter += 1; + if (counter % 1000 == 0) { + System.out.println("C07: " + counter); + } + + line = line.trim(); + String[] fields = line.split("\t", -1); + + String user = fields[11]; + String session = fields[6]; + String html = fields[12]; + + String title = html + ".html.gz.xmi.gz"; + String docId = user + "/" + session + "/" + title; + Path input = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy/" + docId); + //System.out.println(input); + + if (Files.exists(input)) { + countOk++; + + String docIdOut = user + "_" + session + "_" + title; + Path output = Paths.get("/storage/projects/CORE/projects2/B05_C07_Corpus/texts_xmi_1_spacy/" + docIdOut); + if (!Files.exists(output)) { + //System.out.println(output); + Files.copy(input, output, StandardCopyOption.COPY_ATTRIBUTES); + } + } + else { + countError++; + } + } + catch (Exception e) { + e.printStackTrace(); + } + } + + System.out.println("Count : " + counter); + System.out.println(" Ok: " + countOk); + System.out.println(" Error: " + countError); + } + } + + @Test + public void testReaderC07() throws ParserConfigurationException, IOException, UIMAException, SAXException { + Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/assessment_urls_texts_4_c07.csv"); + System.out.println(listFile); + try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) { + long countNew = 0; + long countExists = 0; + long counter = 0; + boolean skipFirstLine = true; + String line; + while ((line = reader.readLine()) != null) { + try { + counter += 1; + if (counter % 1000 == 0) { + System.out.println("C07: " + counter); + } + + if (skipFirstLine) { + skipFirstLine = false; + continue; + } + + line = line.trim(); + String[] fields = line.split("\t", -1); + + String user = fields[11]; + String session = fields[6]; + String html = fields[12]; + + String title = html + ".html.gz"; + String docId = user + "/" + session + "/" + title; + String collectionId = "file:/storage/projects/CORE/azure/core-edutec-fileshare/texts/"; + String docBaseUri = collectionId; + String docUri = docBaseUri + docId; + + Path filename = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts/" + docId); + Path output = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi/" + docId + ".xmi.gz"); + System.out.println(output); + if (Files.exists(output)) { + System.out.println("exists: " + output.toString()); + countExists++; + continue; + } + + JCas jCas = HTMLReadabilityLoader.load(filename, null); + + DocumentMetaData dmd = new DocumentMetaData(jCas); + dmd.setDocumentTitle(title); + dmd.setDocumentId(docId); + dmd.setDocumentUri(docUri); + dmd.setCollectionId(collectionId); + dmd.setDocumentBaseUri(docBaseUri); + dmd.addToIndexes(); + + Files.createDirectories(output.getParent()); + try(GZIPOutputStream outputStream = new GZIPOutputStream(Files.newOutputStream(output))) { + XMLSerializer xmlSerializer = new XMLSerializer(outputStream, true); + xmlSerializer.setOutputProperty(OutputKeys.VERSION, "1.1"); + xmlSerializer.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.toString()); + XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null); + xmiCasSerializer.serialize(jCas.getCas(), xmlSerializer.getContentHandler()); + } + + countNew++; + } + catch (Exception e) { + e.printStackTrace(); + } + } + + System.out.println("Count " + counter); + System.out.println(" New: " + countNew); + System.out.println(" Exists: " + countExists); + } + } + + @Test + public void testReader2() throws ParserConfigurationException, IOException, UIMAException, SAXException { + List tasks = new ArrayList(); + tasks.add("Start-Up-Aufgabe"); + /*tasks.add("Medizin-Kreislauf"); + tasks.add("Medizin-Mittelohr"); + tasks.add("Piloten-Streik-Aufgabe"); + tasks.add("Startup-Aufgabe"); + tasks.add("Windpark-Aufgabe"); + tasks.add("Nudging-Aufgabe"); + tasks.add("Medizin-Atmung"); + tasks.add("Hitzestift"); + tasks.add("Gruene-Sosse"); + tasks.add("Tetra-Pak");*/ + + for (String task : tasks) { + Path listFile = Paths.get("/storage/projects/CORE/erhebungen/t0/db/tasks/assessment_urls_texts_" + task + "_v2.csv"); + System.out.println(listFile); + try (BufferedReader reader = Files.newBufferedReader(listFile, StandardCharsets.UTF_8)) { + long countNew = 0; + long countExists = 0; + long counter = 0; + boolean skipFirstLine = true; + String line; + while ((line = reader.readLine()) != null) { + try { + counter += 1; + if (counter % 1000 == 0) { + System.out.println(task + ": " + counter); + } + + if (skipFirstLine) { + skipFirstLine = false; + continue; + } + + line = line.trim(); + String[] fields = line.split("\t", -1); + + String user = fields[9]; + String session = fields[4]; + String html = fields[10]; + + String title = html + ".html.gz"; + String docId = user + "/" + session + "/" + title; + String collectionId = "file:/storage/projects/CORE/azure/core-edutec-fileshare/texts/"; + String docBaseUri = collectionId; + String docUri = docBaseUri + docId; + + Path filename = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts/" + docId); + Path output = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi/" + docId + ".xmi.gz"); + if (Files.exists(output)) { + //System.out.println("exists: " + output.toString()); + countExists++; + continue; + } + + JCas jCas = HTMLReadabilityLoader.load(filename, null); + + DocumentMetaData dmd = new DocumentMetaData(jCas); + dmd.setDocumentTitle(title); + dmd.setDocumentId(docId); + dmd.setDocumentUri(docUri); + dmd.setCollectionId(collectionId); + dmd.setDocumentBaseUri(docBaseUri); + dmd.addToIndexes(); + + Files.createDirectories(output.getParent()); + try(GZIPOutputStream outputStream = new GZIPOutputStream(Files.newOutputStream(output))) { + XMLSerializer xmlSerializer = new XMLSerializer(outputStream, true); + xmlSerializer.setOutputProperty(OutputKeys.VERSION, "1.1"); + xmlSerializer.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.toString()); + XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null); + xmiCasSerializer.serialize(jCas.getCas(), xmlSerializer.getContentHandler()); + } + + countNew++; + } + catch (Exception e) { + e.printStackTrace(); + } + } + + System.out.println("Count " + counter); + System.out.println(" New: " + countNew); + System.out.println(" Exists: " + countExists); + } + } } @Test @@ -200,4 +509,587 @@ public void testReader() throws Exception { composer.run(reader, "readability_html"); } + + + @Test + public void testDDC2() throws Exception { + // DONE + Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy"); + Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/topic_ddc2_100"); + int scale = 20; + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( + new DUUIFileReader( + sourceLocation.toString(), + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" + ) + ); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) + .withWorkers(scale) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver(); + DUUISwarmDriver swarmDriver = new DUUISwarmDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + composer.addDriver(uimaDriver, swarmDriver, dockerDriver); + + + String ddcVariant = "ddc2_dim100"; + composer.add(new DUUISwarmDriver. + Component("docker.texttechnologylab.org/textimager-duui-ddc-fasttext:latest") + .withParameter("ddc_variant", ddcVariant) + .withParameter("selection", "text") + .withScale(scale) + .withConstraintHost("isengart") + .build() + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(), + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP" + )).build()); + + composer.run(processor, "spacy_plus"); + composer.shutdown(); + } + + @Test + public void testTopicCardiffnlp() throws Exception { + // (NOT) DONE + Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy"); + Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/topic_cardiffnlp"); + int scale = 2; + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( + new DUUIFileReader( + sourceLocation.toString(), + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" + ) + ); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) + .withWorkers(scale) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver(); + DUUISwarmDriver swarmDriver = new DUUISwarmDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + composer.addDriver(uimaDriver, swarmDriver, dockerDriver); + + + String model = "cardiffnlp/tweet-topic-latest-multi"; + composer.add(new DUUISwarmDriver. + Component("docker.texttechnologylab.org/duui-transformers-topic:latest") + .withParameter("model_name", model) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + .withScale(scale) + .withConstraintHost("isengart") + .build() + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(), + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP" + )).build()); + + composer.run(processor, "spacy_plus"); + composer.shutdown(); + } + + @Test + public void testGenreClassla() throws Exception { + // (NOT) DONE + Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy"); + Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/genre_classla"); + int scale = 1; + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( + new DUUIFileReader( + sourceLocation.toString(), + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" + ) + ); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) + .withWorkers(scale) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver(); + DUUISwarmDriver swarmDriver = new DUUISwarmDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + DUUIKubernetesDriver kubernetesDriver = new DUUIDockerDriver(); + composer.addDriver(uimaDriver, swarmDriver, dockerDriver, kubernetesDriver); + + + String model = ""; + model = "classla/xlm-roberta-base-multilingual-text-genre-classifier"; + //omposer.add(new DUUISwarmDriver. + composer.add(new DUUIKubernetesDriver. + Component("docker.texttechnologylab.org/duui-transformers-topic:latest") + .withParameter("model_name", model) + // .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + .withParameter("selection", "text") + .withScale(scale) + .withConstraintHost("isengart") + .withLabels("hostname=isengart") + .build() + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(), + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP" + )).build()); + + composer.run(processor, "spacy_plus"); + composer.shutdown(); + } + +/* + @Test + public void testGenreClasslaSents() throws Exception { + // (NOT) DONE + Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy"); + Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/genre_classla_sents"); + int scale = 1; + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( + new DUUIFileReader( + sourceLocation.toString(), + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" + ) + ); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) + .withWorkers(scale) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver(); + DUUISwarmDriver swarmDriver = new DUUISwarmDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + composer.addDriver(uimaDriver, swarmDriver, dockerDriver); + + + String model = ""; + model = "classla/xlm-roberta-base-multilingual-text-genre-classifier"; + composer.add(new DUUISwarmDriver. + Component("docker.texttechnologylab.org/duui-transformers-topic:latest") + .withParameter("model_name", model) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + // .withParameter("selection", "text") + .withScale(scale) + .withConstraintHost("isengart") + .build() + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(), + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP" + )).build()); + + composer.run(processor, "spacy_plus"); + composer.shutdown(); + } + + + @Test + public void testSentimentCardiffnlp() throws Exception { + // (NOT) DONE + Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy"); + Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/sentiment_cardiffnlp"); + int scale = 1; + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( + new DUUIFileReader( + sourceLocation.toString(), + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" + ) + ); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) + .withWorkers(scale) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver(); + DUUISwarmDriver swarmDriver = new DUUISwarmDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + composer.addDriver(uimaDriver, swarmDriver, dockerDriver); + + + String model = ""; + model = "cardiffnlp/twitter-xlm-roberta-base-sentiment"; + composer.add(new DUUISwarmDriver. + Component("docker.texttechnologylab.org/duui-transformers-sentiment:latest") + .withParameter("model_name", model) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + // .withParameter("selection", "text") + .withScale(scale) + .withConstraintHost("isengart") + .build() + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(), + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP" + )).build()); + + composer.run(processor, "spacy_plus"); + composer.shutdown(); + } + + + @Test + public void testSentimentNLPTown() throws Exception { + // (NOT) DONE + Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy"); + Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/sentiment_nlptown"); + int scale = 1; + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( + new DUUIFileReader( + sourceLocation.toString(), + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" + ) + ); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) + .withWorkers(scale) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver(); + DUUISwarmDriver swarmDriver = new DUUISwarmDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + composer.addDriver(uimaDriver, swarmDriver, dockerDriver); + + + String model = ""; + model = "nlptown/bert-base-multilingual-uncased-sentiment"; + composer.add(new DUUISwarmDriver. + Component("docker.texttechnologylab.org/duui-transformers-sentiment:latest") + .withParameter("model_name", model) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + // .withParameter("selection", "text") + .withScale(scale) + .withConstraintHost("isengart") + .build() + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(), + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP" + )).build()); + + composer.run(processor, "spacy_plus"); + composer.shutdown(); + } + + @Test + public void testSentimentVader() throws Exception { + // (NOT) DONE + Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy"); + Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/sentiment_vader"); + int scale = 1; + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( + new DUUIFileReader( + sourceLocation.toString(), + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" + ) + ); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) + .withWorkers(scale) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver(); + DUUISwarmDriver swarmDriver = new DUUISwarmDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + composer.addDriver(uimaDriver, swarmDriver, dockerDriver); + + + composer.add(new DUUISwarmDriver. + Component("docker.texttechnologylab.org/textimager-duui-vader-sentiment:latest") + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + // .withParameter("selection", "text") + .withScale(scale) + .withConstraintHost("isengart") + .build() + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(), + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP" + )).build()); + + composer.run(processor, "spacy_plus"); + composer.shutdown(); + } + + + @Test + public void testToxicCitizenlab() throws Exception { + // (NOT) DONE + Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy"); + Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/toxicity_citizenlab"); + int scale = 1; + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( + new DUUIFileReader( + sourceLocation.toString(), + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" + ) + ); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) + .withWorkers(scale) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver(); + DUUISwarmDriver swarmDriver = new DUUISwarmDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + composer.addDriver(uimaDriver, swarmDriver, dockerDriver); + + + String model = ""; + model = "citizenlab/distilbert-base-multilingual-cased-toxicity"; + composer.add(new DUUISwarmDriver. + Component("docker.texttechnologylab.org/duui-transformers-toxic:latest") + .withParameter("model_name", model) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + // .withParameter("selection", "text") + .withScale(scale) + .withConstraintHost("isengart") + .build() + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(), + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP" + )).build()); + + composer.run(processor, "spacy_plus"); + composer.shutdown(); + } + + + @Test + public void testToxicEIS() throws Exception { + // (NOT) DONE + Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy"); + Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/toxicity_eis"); + int scale = 1; + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( + new DUUIFileReader( + sourceLocation.toString(), + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" + ) + ); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) + .withWorkers(scale) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver(); + DUUISwarmDriver swarmDriver = new DUUISwarmDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + composer.addDriver(uimaDriver, swarmDriver, dockerDriver); + + + String model = ""; + model = "EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus"; + composer.add(new DUUISwarmDriver. + Component("docker.texttechnologylab.org/duui-transformers-toxic:latest") + .withParameter("model_name", model) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + // .withParameter("selection", "text") + .withScale(scale) + .withConstraintHost("isengart") + .build() + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(), + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP" + )).build()); + + composer.run(processor, "spacy_plus"); + composer.shutdown(); + } + + + @Test + public void testToxicEIS() throws Exception { + // (NOT) DONE + Path sourceLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy"); + Path targetLocation = Paths.get("/storage/projects/CORE/azure/core-edutec-fileshare/texts_xmi_spacy_plus/toxicity_FredZhang7"); + int scale = 1; + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor( + new DUUIFileReader( + sourceLocation.toString(), + "html.gz.xmi.gz", + 1, + -1, + false, + "", + false, + null, + -1, + targetLocation.toString(), + "html.gz.xmi.gz" + ) + ); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) + .withWorkers(scale) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver(); + DUUISwarmDriver swarmDriver = new DUUISwarmDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + composer.addDriver(uimaDriver, swarmDriver, dockerDriver); + + + String model = ""; + model = "FredZhang7/one-for-all-toxicity-v3"; + composer.add(new DUUISwarmDriver. + Component("docker.texttechnologylab.org/duui-transformers-toxic:latest") + .withParameter("model_name", model) + .withParameter("selection", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence") + // .withParameter("selection", "text") + .withScale(scale) + .withConstraintHost("isengart") + .build() + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, targetLocation.toString(), + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP" + )).build()); + + composer.run(processor, "spacy_plus"); + composer.shutdown(); + }*/ }