diff --git a/pom.xml b/pom.xml
index d7e7cb91..22ae8eb0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -607,6 +607,21 @@
pom
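+        <dependency>
+            <groupId>org.bytedeco</groupId>
+            <artifactId>javacv</artifactId>
+            <version>1.5.7</version>
+        </dependency>
+        <dependency>
+            <groupId>org.bytedeco</groupId>
+            <artifactId>javacv-platform</artifactId>
+            <version>1.5.7</version>
+        </dependency>
+        <dependency>
+            <groupId>org.bytedeco</groupId>
+            <artifactId>ffmpeg-platform</artifactId>
+            <version>6.1.1-1.5.10</version>
+        </dependency>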
org.jsoup
jsoup
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIComposer.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIComposer.java
index 6f9ad210..06a3c214 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIComposer.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIComposer.java
@@ -450,6 +450,7 @@ public void run() {
// TODO thread safety needed for here?
DUUISegmentationStrategy segmentationStrategy = i.getSegmentationStrategy();
if (segmentationStrategy instanceof DUUISegmentationStrategyNone) {
+
i.getDriver().run(i.getUUID(), _jc, perf, composer);
} else {
segmentationStrategy.initialize(_jc);
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIFallbackCommunicationLayer.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIFallbackCommunicationLayer.java
index cdd7c859..8f429aec 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIFallbackCommunicationLayer.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIFallbackCommunicationLayer.java
@@ -1,6 +1,7 @@
package org.texttechnologylab.DockerUnifiedUIMAInterface;
import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.uima.cas.CASException;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.jcas.JCas;
@@ -15,10 +16,10 @@
import java.util.Map;
public class DUUIFallbackCommunicationLayer implements IDUUICommunicationLayer {
- public void serialize(JCas jc, ByteArrayOutputStream out, Map parameters) throws CompressorException, IOException, SAXException {
+ public void serialize(JCas jc, ByteArrayOutputStream out, Map parameters, String sourceView) throws CompressorException, IOException, SAXException, CASException {
JSONObject obj = new JSONObject();
ByteArrayOutputStream arr = new ByteArrayOutputStream();
- XmiCasSerializer.serialize(jc.getCas(), null, arr);
+ XmiCasSerializer.serialize(jc.getView(sourceView).getCas(), null, arr);
StringWriter writer = new StringWriter();
TypeSystemUtil.typeSystem2TypeSystemDescription(jc.getTypeSystem()).toXML(writer);
@@ -34,7 +35,7 @@ public void serialize(JCas jc, ByteArrayOutputStream out, Map par
out.write(obj.toString().getBytes(StandardCharsets.UTF_8));
}
- public void deserialize(JCas jc, ByteArrayInputStream input) throws IOException, SAXException {
+ public void deserialize(JCas jc, ByteArrayInputStream input, String targetView) throws IOException, SAXException {
String body = new String(input.readAllBytes(), Charset.defaultCharset());
JSONObject response = new JSONObject(body);
if (response.has("cas") || response.has("error")) {
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/IDUUICommunicationLayer.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/IDUUICommunicationLayer.java
index 51d92157..7d0da09f 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/IDUUICommunicationLayer.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/IDUUICommunicationLayer.java
@@ -1,6 +1,7 @@
package org.texttechnologylab.DockerUnifiedUIMAInterface;
import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.uima.cas.CASException;
import org.apache.uima.jcas.JCas;
import org.xml.sax.SAXException;
@@ -12,6 +13,11 @@
* Interface for communication between the DUUI composer {@link DUUIComposer} and the components {@link org.texttechnologylab.DockerUnifiedUIMAInterface.driver.IDUUIDriverInterface}.
*/
public interface IDUUICommunicationLayer {
+
+ /**
+  * Serializes a JCas into the given output stream, reading from a named view.
+  *
+  * @param jc Input JCas.
+  * @param out Stream that receives the serialized request.
+  * @param parameters Parameters handed to the serialization (callers pass the component's parameters).
+  * @param sourceView Name of the view to serialize; the fallback layer serializes {@code jc.getView(sourceView)}.
+  * @throws CASException presumably when the requested view cannot be obtained - confirm against implementations.
+  */
+ public void serialize(JCas jc, ByteArrayOutputStream out, Map parameters, String sourceView) throws CompressorException, IOException, SAXException, CASException;
+
+ /**
+  * Deserializes a component response from the given input stream into a JCas.
+  *
+  * @param jc JCas that receives the deserialized result.
+  * @param input Stream holding the response.
+  * @param targetView Name of the view the result is meant for.
+  *                   NOTE(review): how implementations use this name is not visible here - confirm.
+  */
+ public void deserialize(JCas jc, ByteArrayInputStream input, String targetView) throws IOException, SAXException, CASException;
+
/**
* Serializes a JCas to a byte array output stream by using the LUA script provided by the component.
* @param jc Input JCas.
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/composer/DUUISegmentedWorker.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/composer/DUUISegmentedWorker.java
index 37dc0e02..da41cd0c 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/composer/DUUISegmentedWorker.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/composer/DUUISegmentedWorker.java
@@ -73,9 +73,11 @@ public void run() {
try {
DUUIPipelineDocumentPerformance perf = new DUUIPipelineDocumentPerformance(name, waitTimeEnd - waitTimeStart, jCas, trackErrorDocs);
+ pipelinePart.getDriver().run(pipelinePart.getUUID(), jCas, perf, null);
// TODO!!!! @Daniel
//pipelinePart.getDriver().run(pipelinePart.getUUID(), jCas, perf);
+
if (backend != null) {
backend.addMetricsForDocument(perf);
}
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIDockerDriver.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIDockerDriver.java
index 3d6aa147..ad307928 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIDockerDriver.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIDockerDriver.java
@@ -198,7 +198,7 @@ public static IDUUICommunicationLayer responsiveAfterTime(String url, JCas jc, i
ByteArrayOutputStream stream = new ByteArrayOutputStream();
try {
//TODO: Make this accept options to better check the instantiation!
- layer.serialize(jc, stream, null);
+ layer.serialize(jc, stream, null, "_InitialView");
} catch (Exception e) {
e.printStackTrace();
throw new Exception(format("The serialization step of the communication layer fails for implementing class %s", layer.getClass().getCanonicalName()));
@@ -213,7 +213,7 @@ public static IDUUICommunicationLayer responsiveAfterTime(String url, JCas jc, i
if (resp.statusCode() == 200) {
ByteArrayInputStream inputStream = new ByteArrayInputStream(resp.body());
try {
- layer.deserialize(jc, inputStream);
+ layer.deserialize(jc, inputStream, "_InitialView");
} catch (Exception e) {
System.err.printf("Caught exception printing response %s\n", new String(resp.body(), StandardCharsets.UTF_8));
throw e;
@@ -493,6 +493,8 @@ static class InstantiatedComponent implements IDUUIInstantiatedPipelineComponent
private String _reg_username;
private String _uniqueComponentKey;
private Map _parameters;
+ private String _sourceView;
+ private String _targetView;
private DUUIPipelineComponent _component;
@@ -514,6 +516,8 @@ public void addComponent(IDUUIUrlAccessible access) {
_component = comp;
_image_name = comp.getDockerImageName();
_parameters = comp.getParameters();
+ _targetView = comp.getTargetView();
+ _sourceView = comp.getSourceView();
if (_image_name == null) {
throw new InvalidParameterException("The image name was not set! This is mandatory for the DockerLocalDriver Class.");
}
@@ -585,6 +589,10 @@ public Map getParameters() {
return _parameters;
}
+ public String getSourceView() {return _sourceView; }
+
+ public String getTargetView() {return _targetView; }
+
public boolean isWebsocket() {
return _websocket;
}
@@ -602,6 +610,21 @@ public Component withParameter(String key, String value) {
return this;
}
+        /**
+         * Uses the given view as both source and target view of the wrapped pipeline component.
+         *
+         * @param viewName name of the CAS view
+         * @return this builder, for chaining
+         */
+        public Component withView(String viewName) {
+            _component.withView(viewName);
+            return this;
+        }
+
+        /**
+         * Sets the source view option of the wrapped pipeline component.
+         *
+         * @param viewName name of the CAS view to read from
+         * @return this builder, for chaining
+         */
+        public Component withSourceView(String viewName) {
+            _component.withSourceView(viewName);
+            return this;
+        }
+
+        /**
+         * Sets the target view option of the wrapped pipeline component.
+         *
+         * @param viewName name of the CAS view results are meant for
+         * @return this builder, for chaining
+         */
+        public Component withTargetView(String viewName) {
+            _component.withTargetView(viewName);
+            return this;
+        }
+
public Component(String target) throws URISyntaxException, IOException {
_component = new DUUIPipelineComponent();
_component.withDockerImageName(target);
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIKubernetesDriver.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIKubernetesDriver.java
index 2be1c825..c253d8de 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIKubernetesDriver.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIKubernetesDriver.java
@@ -439,6 +439,8 @@ static class InstantiatedComponent implements IDUUIInstantiatedPipelineComponent
private int _scale;
private boolean _withImageFetching;
private Map _parameters;
+ private String _sourceView;
+ private String _targetView;
private DUUIPipelineComponent _component;
private final boolean _websocket;
@@ -451,6 +453,8 @@ static class InstantiatedComponent implements IDUUIInstantiatedPipelineComponent
_component = comp;
_image_name = comp.getDockerImageName();
_parameters = comp.getParameters();
+ _targetView = comp.getTargetView();
+ _sourceView = comp.getSourceView();
if (_image_name == null) {
throw new InvalidParameterException("The image name was not set! This is mandatory for the DockerLocalDriver Class.");
}
@@ -523,6 +527,10 @@ public Map getParameters() {
return _parameters;
}
+ public String getSourceView() {return _sourceView; }
+
+ public String getTargetView() {return _targetView; }
+
@Override
public String getUniqueComponentKey() {
return _uniqueComponentKey;
@@ -633,6 +641,21 @@ public Component withParameter(String key, String value) {
return this;
}
+        /**
+         * Uses the given view as both source and target view of the wrapped pipeline component.
+         *
+         * @param viewName name of the CAS view
+         * @return this builder, for chaining
+         */
+        public Component withView(String viewName) {
+            _component.withView(viewName);
+            return this;
+        }
+
+        /**
+         * Sets the source view option of the wrapped pipeline component.
+         *
+         * @param viewName name of the CAS view to read from
+         * @return this builder, for chaining
+         */
+        public Component withSourceView(String viewName) {
+            _component.withSourceView(viewName);
+            return this;
+        }
+
+        /**
+         * Sets the target view option of the wrapped pipeline component.
+         *
+         * @param viewName name of the CAS view results are meant for
+         * @return this builder, for chaining
+         */
+        public Component withTargetView(String viewName) {
+            _component.withTargetView(viewName);
+            return this;
+        }
+
/**
* Builds the component.
*
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIPipelineComponent.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIPipelineComponent.java
index b0e87e1f..ad8bf111 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIPipelineComponent.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIPipelineComponent.java
@@ -65,6 +65,8 @@ public class DUUIPipelineComponent {
private static String driverName = "driver";
private static String descriptionName = "description";
+ private static String sourceView = "sourceView";
+ private static String targetView = "targetView";
private String getVersion() throws URISyntaxException, IOException {
ClassLoader classLoader = DUUIPipelineComponent.class.getClassLoader();
@@ -80,6 +82,7 @@ public DUUIPipelineComponent() throws URISyntaxException, IOException {
_options = new HashMap<>();
_finalizedEncoded = null;
_parameters = new HashMap<>();
+
String version = getVersion();
if(version == null) {
_options.put(versionInformation,"Unknown");
@@ -487,6 +490,22 @@ public DUUIPipelineComponent withParameter(String key, String value) {
return this;
}
+    /**
+     * Convenience setter that stores the same view name as source view and as target view.
+     *
+     * @param viewName name of the CAS view
+     * @return this component, for chaining
+     */
+    public DUUIPipelineComponent withView(String viewName) {
+        withSourceView(viewName);
+        withTargetView(viewName);
+        return this;
+    }
+
+    /**
+     * Stores the source view name in the component options.
+     *
+     * @param viewName name of the CAS view to read from
+     * @return this component, for chaining
+     */
+    public DUUIPipelineComponent withSourceView(String viewName) {
+        _options.put(sourceView, viewName);
+        return this;
+    }
+
+    /**
+     * Stores the target view name in the component options.
+     *
+     * @param viewName name of the CAS view results are meant for
+     * @return this component, for chaining
+     */
+    public DUUIPipelineComponent withTargetView(String viewName) {
+        _options.put(targetView, viewName);
+        return this;
+    }
+
public static DUUIPipelineComponent fromJson(String json) throws URISyntaxException, IOException {
JSONObject jobj = new JSONObject(json);
@@ -560,6 +579,22 @@ public final Map getParameters() {
return _parameters;
}
+    /**
+     * Looks up the configured source view.
+     *
+     * @return the stored source view name, or "_InitialView" when no (non-null) value is stored
+     */
+    public String getSourceView() {
+        final String configured = _options.get(sourceView);
+        return configured != null ? configured : "_InitialView";
+    }
+
+    /**
+     * Looks up the configured target view.
+     *
+     * @return the stored target view name, or "_InitialView" when no (non-null) value is stored
+     */
+    public String getTargetView() {
+        final String configured = _options.get(targetView);
+        return configured != null ? configured : "_InitialView";
+    }
+
public DUUIPipelineComponent clearParameters() {
_parameters.clear();
return this;
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIRemoteDriver.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIRemoteDriver.java
index b50442cf..4a8d9b97 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIRemoteDriver.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIRemoteDriver.java
@@ -81,6 +81,21 @@ public Component withParameter(String key, String value) {
return this;
}
+        /**
+         * Uses the given view as both source and target view of the wrapped pipeline component.
+         *
+         * @param viewName name of the CAS view
+         * @return this builder, for chaining
+         */
+        public Component withView(String viewName) {
+            component.withView(viewName);
+            return this;
+        }
+
+        /**
+         * Sets the source view option of the wrapped pipeline component.
+         *
+         * @param viewName name of the CAS view to read from
+         * @return this builder, for chaining
+         */
+        public Component withSourceView(String viewName) {
+            component.withSourceView(viewName);
+            return this;
+        }
+
+        /**
+         * Sets the target view option of the wrapped pipeline component.
+         *
+         * @param viewName name of the CAS view results are meant for
+         * @return this builder, for chaining
+         */
+        public Component withTargetView(String viewName) {
+            component.withTargetView(viewName);
+            return this;
+        }
+
public Component withWebsocket(boolean b) {
component.withWebsocket(b);
return this;
@@ -152,6 +167,8 @@ private static class InstantiatedComponent implements IDUUIInstantiatedPipelineC
private ConcurrentLinkedQueue _components;
private String _uniqueComponentKey;
private Map _parameters;
+ private String _sourceView;
+ private String _targetView;
private DUUIPipelineComponent _component;
private boolean _websocket;
private int _ws_elements;
@@ -179,6 +196,8 @@ public InstantiatedComponent(DUUIPipelineComponent comp) {
}
_parameters = comp.getParameters();
+ _targetView = comp.getTargetView();
+ _sourceView = comp.getSourceView();
_uniqueComponentKey = "";
@@ -208,6 +227,10 @@ public Map getParameters() {
return _parameters;
}
+ public String getSourceView() {return _sourceView; }
+
+ public String getTargetView() {return _targetView; }
+
public boolean isWebsocket() {
return _websocket;
}
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUISwarmDriver.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUISwarmDriver.java
index a60f9226..b0773551 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUISwarmDriver.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUISwarmDriver.java
@@ -264,6 +264,8 @@ private static class InstantiatedComponent implements IDUUIInstantiatedPipelineC
private final String _reg_password;
private final String _reg_username;
private final Map _parameters;
+ private String _sourceView;
+ private String _targetView;
private DUUIPipelineComponent _component;
@@ -275,6 +277,8 @@ private static class InstantiatedComponent implements IDUUIInstantiatedPipelineC
}
_parameters = comp.getParameters();
+ _targetView = comp.getTargetView();
+ _sourceView = comp.getSourceView();
_scale = comp.getScale(1);
_constraints.addAll(comp.getConstraints());
_components = new ConcurrentLinkedQueue<>();
@@ -374,6 +378,9 @@ public Map getParameters() {
return _parameters;
}
+ public String getSourceView() {return _sourceView; }
+
+ public String getTargetView() {return _targetView; }
public Triplet getComponent() {
long mutexStart = System.nanoTime();
@@ -412,6 +419,21 @@ public Component withParameter(String key, String value) {
return this;
}
+        /**
+         * Uses the given view as both source and target view of the wrapped pipeline component.
+         *
+         * @param viewName name of the CAS view
+         * @return this builder, for chaining
+         */
+        public Component withView(String viewName) {
+            component.withView(viewName);
+            return this;
+        }
+
+        /**
+         * Sets the source view option of the wrapped pipeline component.
+         *
+         * @param viewName name of the CAS view to read from
+         * @return this builder, for chaining
+         */
+        public Component withSourceView(String viewName) {
+            component.withSourceView(viewName);
+            return this;
+        }
+
+        /**
+         * Sets the target view option of the wrapped pipeline component.
+         *
+         * @param viewName name of the CAS view results are meant for
+         * @return this builder, for chaining
+         */
+        public Component withTargetView(String viewName) {
+            component.withTargetView(viewName);
+            return this;
+        }
+
public Component withScale(int scale) {
component.withScale(scale);
return this;
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/IDUUIInstantiatedPipelineComponent.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/IDUUIInstantiatedPipelineComponent.java
index d71513ad..54febe69 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/IDUUIInstantiatedPipelineComponent.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/IDUUIInstantiatedPipelineComponent.java
@@ -41,6 +41,8 @@ public interface IDUUIInstantiatedPipelineComponent {
public void addComponent(IDUUIUrlAccessible item);
public Map getParameters();
+ /** Name of the view handed to the communication layer's serialize call as source view. */
+ public String getSourceView();
+ /** Name of the view handed to the communication layer's deserialize call as target view. */
+ public String getTargetView();
public String getUniqueComponentKey();
public static TypeSystemDescription getTypesystem(String uuid, IDUUIInstantiatedPipelineComponent comp) throws ResourceInitializationException {
@@ -110,7 +112,7 @@ public static void process(JCas jc, IDUUIInstantiatedPipelineComponent comp, DUU
}
}
- layer.serialize(viewJc,out,comp.getParameters());
+ layer.serialize(viewJc,out,comp.getParameters(), comp.getSourceView());
// lua serialize call()
byte[] ok = out.toByteArray();
@@ -147,7 +149,7 @@ public static void process(JCas jc, IDUUIInstantiatedPipelineComponent comp, DUU
long deserializeStart = annotatorEnd;
try {
- layer.deserialize(viewJc, st);
+ layer.deserialize(viewJc, st, comp.getTargetView());
}
catch(Exception e) {
System.err.printf("Caught exception printing response %s\n",new String(resp.body(), StandardCharsets.UTF_8));
@@ -228,7 +230,7 @@ public static void process_handler(JCas jc,
}
}
// lua serialize call()
- layer.serialize(viewJc,out,comp.getParameters());
+ layer.serialize(viewJc,out,comp.getParameters(), comp.getSourceView());
// ok is the message.
byte[] ok = out.toByteArray();
@@ -265,7 +267,7 @@ public static void process_handler(JCas jc,
* Merging results before deserializing.
*/
result = layer.merge(results);
- layer.deserialize(finalViewJc, result);
+ layer.deserialize(finalViewJc, result, comp.getTargetView());
}
catch(Exception e) {
e.printStackTrace();
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUIMultimodalCollectionReader.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUIMultimodalCollectionReader.java
new file mode 100644
index 00000000..77981c06
--- /dev/null
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUIMultimodalCollectionReader.java
@@ -0,0 +1,494 @@
+package org.texttechnologylab.DockerUnifiedUIMAInterface.io.reader;
+
+import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.uima.cas.impl.XmiCasDeserializer;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.javaync.io.AsyncFiles;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.io.AsyncCollectionReader;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.io.DUUICollectionReader;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.monitoring.AdvancedProgressMeter;
+import org.texttechnologylab.utilities.helper.StringUtils;
+import org.xml.sax.SAXException;
+
+import java.io.*;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Random;
+import java.util.Scanner;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+
+public class DUUIMultimodalCollectionReader implements DUUICollectionReader {
+
+    // Folder that was scanned; only assigned when the file list is built from a directory.
+    private String _path;
+    // Paths that still have to be read.
+    private ConcurrentLinkedQueue<String> _filePaths;
+    // Copy of the complete file list taken at the end of construction; used by reset().
+    private ConcurrentLinkedQueue<String> _filePathsBackup;
+    // Files whose bytes were already fetched by getAsyncNextByteArray().
+    private ConcurrentLinkedQueue<ByteReadFuture> _loadedFiles;
+
+    // Name of the CAS view each document is loaded into.
+    private String _viewName;
+
+    // Number of files found at construction time.
+    private int _initialSize;
+    // Number of documents handed out so far.
+    private AtomicInteger _docNumber;
+    // Set to 500 MB by the constructor. NOTE(review): nothing in this class reads it - confirm it is still needed.
+    private long _maxMemory;
+    // Estimated size of the preloaded files (compressed files are counted with factor 10).
+    private AtomicLong _currentMemorySize;
+
+    // Whether a DocumentMetaData annotation is created when the CAS has none.
+    private boolean _addMetadata = true;
+
+    // NOTE(review): never referenced in this class.
+    private String _targetPath = null;
+
+    // Document language to set on every CAS; skipped when null or empty.
+    private String _language = null;
+
+    private AdvancedProgressMeter progress = null;
+
+    // Progress is printed for every debugCount-th document.
+    private int debugCount = 25;
+
+    private String targetLocation = null;
+
+    /**
+     * Creates a reader over all files below {@code folder} whose name ends with {@code ending},
+     * loading each document into the view "_InitialView" and using the defaults of the full constructor.
+     *
+     * @param folder directory to scan recursively
+     * @param ending file name suffix to match
+     */
+    public DUUIMultimodalCollectionReader(String folder, String ending) {
+        this(
+                folder, ending, "_InitialView",
+                25,
+                getRandomFromMode(null, -1),
+                getSortFromMode(null),
+                "", true, null, 0, "", null
+        );
+    }
+
+    /**
+     * Creates a reader over all files below {@code folder} whose name ends with {@code ending},
+     * loading each document into the given view and using the defaults of the full constructor.
+     *
+     * @param folder   directory to scan recursively
+     * @param ending   file name suffix to match
+     * @param viewName name of the CAS view that receives the document content
+     */
+    public DUUIMultimodalCollectionReader(String folder, String ending, String viewName) {
+        this(
+                folder, ending, viewName,
+                25,
+                getRandomFromMode(null, -1),
+                getSortFromMode(null),
+                "", true, null, 0, "", null
+        );
+    }
+
+    /**
+     * Creates a reader and builds the list of files to process.
+     * <p>
+     * When {@code savePath} names an existing file, the list is read from it (one path per line);
+     * otherwise {@code folder} is scanned recursively for files ending with {@code ending} and, if
+     * {@code savePath} is non-empty, the resulting list is written there.
+     *
+     * @param folder           directory to scan; must be a directory unless a saved list is used
+     * @param ending           file name suffix to match
+     * @param viewName         name of the CAS view that receives the document content
+     * @param debugCount       progress is printed for every {@code debugCount}-th document
+     * @param iRandom          when positive and {@code bSort} is false, number of files to pick at random
+     * @param bSort            whether to sort the files by size, largest first
+     * @param savePath         file holding / receiving the file list; empty to disable
+     * @param bAddMetadata     whether to add DocumentMetaData when a CAS has none
+     * @param language         document language to set, or null/empty to leave it untouched
+     * @param skipSmallerFiles when positive, files smaller than this many bytes are dropped
+     * @param targetLocation   stored, but the filtering that would use it is currently disabled
+     * @param targetEnding     currently unused (belongs to the disabled target filtering)
+     * @throws RuntimeException     if no saved list is used and {@code folder} is not a directory
+     * @throws UncheckedIOException if the saved list exists but cannot be read
+     */
+    public DUUIMultimodalCollectionReader(String folder, String ending, String viewName, int debugCount, int iRandom, boolean bSort, String savePath, boolean bAddMetadata, String language, int skipSmallerFiles, String targetLocation, String targetEnding) {
+        this.targetLocation = targetLocation;
+        _addMetadata = bAddMetadata;
+        _language = language;
+        _filePaths = new ConcurrentLinkedQueue<>();
+        _loadedFiles = new ConcurrentLinkedQueue<>();
+        _filePathsBackup = new ConcurrentLinkedQueue<>();
+        _viewName = viewName;
+
+        File savedList = new File(savePath);
+
+        if (savePath.length() > 0 && savedList.exists()) {
+            String sContent;
+            try {
+                sContent = StringUtils.getContent(savedList);
+            } catch (IOException e) {
+                // Continuing without content would only fail later with a NullPointerException.
+                throw new UncheckedIOException("Could not read the saved file list " + savedList, e);
+            }
+
+            for (String s : sContent.split("\n")) {
+                // An empty line is not a path; skipping it avoids a failing read later on.
+                if (!s.isEmpty()) {
+                    _filePaths.add(s);
+                }
+            }
+        } else {
+            File fl = new File(folder);
+            if (!fl.isDirectory()) {
+                throw new RuntimeException("The folder is not a directory!");
+            }
+
+            _path = folder;
+            addFilesToConcurrentList(fl, ending, _filePaths);
+        }
+
+        // Applied once for both sources of the file list.
+        if (skipSmallerFiles > 0) {
+            _filePaths = skipBySize(_filePaths, skipSmallerFiles);
+        }
+
+        if (bSort) {
+            _filePaths = sortBySize(_filePaths);
+        }
+
+        if (bSort && iRandom > 0) {
+            // NOTE(review): the selection announced here (takeFirstOrLast) is disabled, so the
+            // sorted list is kept in full.
+            System.out.println("Sorting and Random Selection is active, using the " + (iRandom > 0 ? "largest " : "smallest ") + Math.abs(iRandom) + " documents.");
+        } else if (iRandom > 0) {
+            _filePaths = random(_filePaths, iRandom);
+        }
+
+        if (savePath.length() > 0 && !savedList.exists()) {
+            try {
+                StringUtils.writeContent(String.join("\n", _filePaths), savedList);
+            } catch (IOException e) {
+                // Best effort: the reader still works without the persisted list.
+                e.printStackTrace();
+            }
+        }
+
+        // NOTE: removing files that already exist in the target location (removeIfInTarget) is
+        // disabled; it would have to happen here, after the list was saved.
+
+        _filePathsBackup.addAll(_filePaths);
+
+        this.debugCount = debugCount;
+
+        System.out.printf("Found %d files matching the pattern! \t Using Random: %d\n", _filePaths.size(), iRandom);
+        _initialSize = _filePaths.size();
+        _docNumber = new AtomicInteger(0);
+        _currentMemorySize = new AtomicLong(0);
+        // 500 MB
+        _maxMemory = 500 * 1024 * 1024;
+
+        progress = new AdvancedProgressMeter(_initialSize);
+    }
+
+    /**
+     * Encodes the sample mode in the sign of the sample size.
+     *
+     * @param sampleMode sampling mode, may be null
+     * @param sampleSize requested number of documents
+     * @return the negated size for SMALLEST, the size unchanged otherwise
+     */
+    private static int getRandomFromMode(AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE sampleMode, int sampleSize) {
+        boolean smallest = sampleMode == AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE.SMALLEST;
+        return smallest ? -sampleSize : sampleSize;
+    }
+
+    /**
+     * Tells whether the file list has to be sorted for the given sample mode.
+     *
+     * @param mode sampling mode, may be null
+     * @return false only for RANDOM; true for every other mode, including null
+     */
+    private static boolean getSortFromMode(AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE mode) {
+        return mode != AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE.RANDOM;
+    }
+
+ public static void addFilesToConcurrentList(File folder, String ending, ConcurrentLinkedQueue paths) {
+ File[] listOfFiles = folder.listFiles();
+
+ for (int i = 0; i < listOfFiles.length; i++) {
+ if (listOfFiles[i].isFile()) {
+ if (listOfFiles[i].getName().endsWith(ending)) {
+ paths.add(listOfFiles[i].getPath().toString());
+ }
+ } else if (listOfFiles[i].isDirectory()) {
+ addFilesToConcurrentList(listOfFiles[i], ending, paths);
+ }
+ }
+
+ }
+
+    /**
+     * Returns a new queue with the given paths ordered by file size, largest first.
+     *
+     * @param paths paths to files
+     * @return a new queue holding the same paths in descending size order
+     */
+    public static ConcurrentLinkedQueue<String> sortBySize(ConcurrentLinkedQueue<String> paths) {
+        return paths.stream()
+                // Arguments are swapped to obtain descending order.
+                .sorted((first, second) -> Long.compare(new File(second).length(), new File(first).length()))
+                .collect(Collectors.toCollection(ConcurrentLinkedQueue::new));
+    }
+
+    /**
+     * Skips files smaller than skipSmallerFiles and reports the file count before and after on stdout.
+     *
+     * @param paths            paths to files
+     * @param skipSmallerFiles skip files smaller than this value in bytes
+     * @return a new queue holding only the paths of files that are at least that large
+     */
+    public static ConcurrentLinkedQueue<String> skipBySize(ConcurrentLinkedQueue<String> paths, int skipSmallerFiles) {
+        System.out.println("Skip files smaller than " + skipSmallerFiles + " bytes");
+        System.out.println(" Number of files before skipping: " + paths.size());
+
+        ConcurrentLinkedQueue<String> rQueue = paths
+                .stream()
+                .filter(s -> new File(s).length() >= skipSmallerFiles)
+                .collect(Collectors.toCollection(ConcurrentLinkedQueue::new));
+
+        System.out.println(" Number of files after skipping: " + rQueue.size());
+
+        return rQueue;
+    }
+
+ public static ConcurrentLinkedQueue random(ConcurrentLinkedQueue paths, int iRandom) {
+
+ ConcurrentLinkedQueue rQueue = new ConcurrentLinkedQueue();
+
+ Random nRandom = new Random(iRandom);
+
+ ArrayList sList = new ArrayList<>();
+ sList.addAll(paths);
+
+ Collections.shuffle(sList, nRandom);
+
+ if (iRandom > sList.size()) {
+ rQueue.addAll(sList.subList(0, sList.size()));
+ } else {
+ rQueue.addAll(sList.subList(0, iRandom));
+ }
+
+
+ return rQueue;
+
+ }
+
+
+    /**
+     * Formats the size of the file at the given path for display.
+     *
+     * @param sPath path to the file
+     * @return the file length as rendered by {@code FileUtils.byteCountToDisplaySize}
+     */
+    public static String getSize(String sPath) {
+        final long lengthInBytes = new File(sPath).length();
+        return FileUtils.byteCountToDisplaySize(lengthInBytes);
+    }
+
+    /**
+     * @return the progress meter of this reader
+     */
+    @Override
+    public AdvancedProgressMeter getProgress() {
+        return progress;
+    }
+
+ @Override
+ public void getNextCas(JCas empty) {
+ ByteReadFuture future = _loadedFiles.poll();
+
+ byte[] bFile = null;
+ String result = null;
+ if (future == null) {
+ result = _filePaths.poll();
+ if (result == null) return;
+ } else {
+ result = future.getPath();
+ bFile = future.getBytes();
+ long factor = 1;
+ if (result.endsWith(".gz") || result.endsWith(".xz")) {
+ factor = 10;
+ }
+ _currentMemorySize.getAndAdd(-factor * (long) bFile.length);
+ }
+ int val = _docNumber.addAndGet(1);
+
+ progress.setDone(val);
+ progress.setLeft(_initialSize - val);
+
+ if (_initialSize - progress.getCount() > debugCount) {
+ if (val % debugCount == 0 || val == 0) {
+ System.out.printf("%s: \t %s \t %s\n", progress, getSize(result), result);
+ }
+ } else {
+ System.out.printf("%s: \t %s \t %s\n", progress, getSize(result), result);
+ }
+
+ if (bFile == null) {
+ try {
+ bFile = Files.readAllBytes(Path.of(result));
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ try {
+ /*
+ if (result.endsWith(".xz")) {
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ decodedFile = new CompressorStreamFactory().createCompressorInputStream(CompressorStreamFactory.XZ, new ByteArrayInputStream(file));
+ } else if (result.endsWith(".gz")) {
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ decodedFile = new CompressorStreamFactory().createCompressorInputStream(CompressorStreamFactory.GZIP, new ByteArrayInputStream(file));
+ } else {
+ decodedFile = new ByteArrayInputStream(file);
+ }
+ */
+
+ empty.reset();
+
+ JCas mView;
+ try {
+ mView = empty.getView(_viewName);
+
+ }catch (Exception e){
+ mView = empty.createView(_viewName);
+ }
+
+ var parts = result.split("\\.");
+ String fileExtension = parts[parts.length - 1];
+
+ File fFile = new File(result);
+ String mimeType = Files.probeContentType(fFile.toPath());
+
+
+ if(mimeType == null){
+ if(fileExtension.equals("xmi")){
+ mimeType = "application/xmi";
+ }
+ }
+
+ System.out.println(mimeType);
+
+ String sofaString = "";
+
+ switch(mimeType.split("/")[0]){
+ case "image":
+ case "video":
+ case "audio":
+ sofaString = Base64.encodeBase64String(FileUtils.readFileToByteArray(fFile));
+ mView.setSofaDataString(sofaString, mimeType);
+ break;
+ case "text":
+ sofaString = readFile(fFile);
+ mView.setSofaDataString(sofaString, mimeType);
+
+ break;
+ case "application":
+ if(fileExtension.equals("xmi")) {
+ InputStream decodedFile = new ByteArrayInputStream(Files.readAllBytes(fFile.toPath()));
+ XmiCasDeserializer.deserialize(decodedFile, mView.getCas(), true);
+ break;
+ }
+ else if(mimeType.split("/")[1].equals("x-gzip")){
+ CompressorInputStream decodedFile = new CompressorStreamFactory(true).createCompressorInputStream(CompressorStreamFactory.GZIP, new ByteArrayInputStream(Files.readAllBytes(fFile.toPath())));
+ XmiCasDeserializer.deserialize(decodedFile, mView.getCas(), true);
+ break;
+ }else if(mimeType.split("/")[1].equals("x-xz")){
+ CompressorInputStream decodedFile = new CompressorStreamFactory(true).createCompressorInputStream(CompressorStreamFactory.XZ, new ByteArrayInputStream(Files.readAllBytes(fFile.toPath())));
+ XmiCasDeserializer.deserialize(decodedFile, mView.getCas(), true);
+ break;
+ }
+
+ sofaString = Base64.encodeBase64String(FileUtils.readFileToByteArray(fFile));
+ mView.setSofaDataString(sofaString, mimeType);
+ break;
+ default:
+ try{
+ sofaString = readFile(fFile);
+ }catch(Exception e){
+ sofaString = Base64.encodeBase64String(FileUtils.readFileToByteArray(fFile));
+ }
+ mView.setSofaDataString(sofaString, mimeType);
+ break;
+ }
+ ;
+
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+
+
+ if (_addMetadata) {
+ if (JCasUtil.select(empty, DocumentMetaData.class).size() == 0) {
+ DocumentMetaData dmd = DocumentMetaData.create(empty);
+ File pFile = new File(result);
+ dmd.setDocumentId(pFile.getName());
+ dmd.setDocumentTitle(pFile.getName());
+ dmd.setDocumentUri(pFile.getAbsolutePath());
+ dmd.addToIndexes();
+ }
+ }
+
+ if (_language != null && !_language.isEmpty()) {
+ empty.setDocumentLanguage(_language);
+ }
+
+ }
+
+    /**
+     * Restarts the reader from the complete file list.
+     * <p>
+     * The pending list is rebuilt as a copy of the backup so that consuming it leaves the backup
+     * intact and the reader can be reset again. Files that were preloaded before the reset are
+     * dropped together with their memory estimate, because they are part of the restored list.
+     */
+    public void reset() {
+        _filePaths = new ConcurrentLinkedQueue<>(_filePathsBackup);
+        _loadedFiles.clear();
+        _currentMemorySize.set(0);
+        _docNumber.set(0);
+        progress = new AdvancedProgressMeter(_initialSize);
+    }
+
+    /**
+     * Tells whether another document can be delivered.
+     * <p>
+     * Preloaded files count as well: they have already left the pending list, and
+     * {@code getNextCas} serves them first.
+     *
+     * @return true while a pending path or a preloaded file is available
+     */
+    @Override
+    public boolean hasNext() {
+        return !_filePaths.isEmpty() || !_loadedFiles.isEmpty();
+    }
+
+    /**
+     * @return the number of paths currently in the pending list
+     */
+    @Override
+    public long getSize() {
+        final long pending = _filePaths.size();
+        return pending;
+    }
+
+    /**
+     * Starts reading the next pending file asynchronously.
+     * <p>
+     * Once the bytes are available they are queued for {@code getNextCas} and their estimated
+     * unpacked size is added to the memory counter.
+     *
+     * @return a future completing with 0 after the file was queued, or an already completed
+     *         future holding 1 when no pending file is left
+     */
+    public CompletableFuture<Integer> getAsyncNextByteArray() throws IOException, CompressorException, SAXException {
+        String result = _filePaths.poll();
+        if (result == null) {
+            return CompletableFuture.completedFuture(1);
+        }
+
+        return AsyncFiles
+                .readAllBytes(Paths.get(result), 1024 * 1024 * 5)
+                .thenApply(bytes -> {
+                    _loadedFiles.add(new ByteReadFuture(result, bytes));
+
+                    // Calculate estimated unpacked size by using a compression ratio of 0.1
+                    long factor = (result.endsWith(".gz") || result.endsWith(".xz")) ? 10 : 1;
+                    _currentMemorySize.getAndAdd(factor * (long) bytes.length);
+                    return 0;
+                });
+    }
+
+    /**
+     * @return the number of documents handed out so far
+     */
+    @Override
+    public long getDone() {
+        final long delivered = _docNumber.get();
+        return delivered;
+    }
+
+    /**
+     * Formats a byte count for display using binary steps of 1024.
+     * <p>
+     * Values below 1024 are printed as plain bytes (for example "512 B"); larger values are
+     * scaled to the largest fitting unit with one decimal (for example "1.5 kB", "1.0 MB").
+     *
+     * @param lSize size in bytes
+     * @return the formatted size
+     */
+    public String formatSize(long lSize) {
+        if (lSize < 1024) {
+            return lSize + " B";
+        }
+
+        final String units = "kMGTPE";
+        double scaled = lSize;
+        int unit = -1;
+        // A long never exceeds the exa range, so the index stays within the unit string.
+        while (scaled >= 1024 && unit < units.length() - 1) {
+            scaled /= 1024;
+            unit++;
+        }
+        return String.format("%.1f %cB", scaled, units.charAt(unit));
+    }
+
+ /**
+  * Modes for choosing a sample of the documents.
+  * NOTE(review): nothing in this class refers to this enum; the private helpers use
+  * AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE instead - confirm this copy is needed.
+  */
+ public enum DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE {
+ RANDOM,
+ SMALLEST,
+ LARGEST
+ }
+
+    /**
+     * Reads a text file and returns its lines joined with "\n" (no trailing line break).
+     * <p>
+     * The file is decoded as UTF-8 and the scanner is closed when done. Empty lines, including
+     * leading ones, are kept.
+     *
+     * @param file file to read
+     * @return the file content with normalised line breaks
+     * @throws FileNotFoundException if the file cannot be opened
+     */
+    private String readFile(File file) throws FileNotFoundException {
+        StringBuilder text = new StringBuilder();
+        try (Scanner myReader = new Scanner(file, "UTF-8")) {
+            boolean firstLine = true;
+            while (myReader.hasNextLine()) {
+                // A flag instead of an emptiness check, so that a leading empty line is not lost.
+                if (!firstLine) {
+                    text.append("\n");
+                }
+                text.append(myReader.nextLine());
+                firstLine = false;
+            }
+        }
+        return text.toString();
+    }
+
+    /**
+     * Pairs the path of a preloaded file with its bytes.
+     * <p>
+     * Declared static so that queued instances do not keep a hidden reference to the reader.
+     */
+    static class ByteReadFuture {
+        private final String _path;
+        private final byte[] _bytes;
+
+        /**
+         * @param path  path the bytes were read from
+         * @param bytes content of the file; stored as is, not copied
+         */
+        public ByteReadFuture(String path, byte[] bytes) {
+            _path = path;
+            _bytes = bytes;
+        }
+
+        /** @return path the bytes were read from */
+        public String getPath() {
+            return _path;
+        }
+
+        /** @return content of the file (the stored array itself) */
+        public byte[] getBytes() {
+            return _bytes;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUISegmentationReader.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUISegmentationReader.java
index b661deff..167181cc 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUISegmentationReader.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUISegmentationReader.java
@@ -364,7 +364,7 @@ public boolean finishedLoading() {
@Override
public AdvancedProgressMeter getProgress() {
- return this.progress;
+ return progress;
}
@Override
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUIYouTubeReader.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUIYouTubeReader.java
new file mode 100644
index 00000000..3c1dbd3b
--- /dev/null
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUIYouTubeReader.java
@@ -0,0 +1,607 @@
+package org.texttechnologylab.DockerUnifiedUIMAInterface.io.reader;
+
+import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
+import org.apache.commons.io.FileUtils;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSList;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.io.AsyncCollectionReader;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.io.DUUICollectionReader;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.monitoring.AdvancedProgressMeter;
+import org.texttechnologylab.annotation.socialmedia.metadata.YouTube;
+import org.texttechnologylab.annotation.socialmedia.metadata.youtube.Playlist;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.time.ZonedDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.*;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+
+public class DUUIYouTubeReader implements DUUICollectionReader {
+
+ // Not assigned or read anywhere in this class.
+ private String _path;
+ // Videos that still have to be handed out by getNextCas.
+ private ConcurrentLinkedQueue<YouTubeVideo> _youtubeVideos;
+ // Copy of the initial video selection, used by reset().
+ private ConcurrentLinkedQueue<YouTubeVideo> _youtubeVideosBackup;
+
+ // Name of the CAS view that receives the video URL.
+ private String _viewName;
+
+ // Number of videos selected by the constructor.
+ private int _initialSize;
+ // Number of videos handed out so far.
+ private AtomicInteger _docNumber;
+ private long _maxMemory;
+ private AtomicLong _currentMemorySize;
+
+ // Whether video metadata is fetched from the API and written to the CAS.
+ private boolean _addMetadata = true;
+
+ // Document language to set on every CAS; null or empty leaves it unset.
+ private String _language = null;
+
+ private AdvancedProgressMeter progress = null;
+
+ // Controls how often getNextCas prints the progress.
+ private int debugCount = 25;
+
+ // Maps a video id to the ids of the playlists it was collected from.
+ private Map<String, List<String>> _videosPlaylists;
+ // YouTube Data API key appended to every request.
+ private String _apiKey;
+
+ /**
+ * Creates a reader that writes into the view "_InitialView", with metadata enabled, no sampling
+ * (getRandomFromMode(null, -1) yields -1) and no document language.
+ * @param youtubeLink link to a video, playlist or channel
+ * @param apiKey YouTube Data API key
+ */
+ public DUUIYouTubeReader(String youtubeLink, String apiKey) throws IOException, InterruptedException {
+ this(youtubeLink, apiKey, "_InitialView", 25, getRandomFromMode(null, -1), true, null);
+ }
+
+ /**
+ * Same as the two-argument constructor, but writes into the given view.
+ * @param youtubeLink link to a video, playlist or channel
+ * @param apiKey YouTube Data API key
+ * @param viewName name of the CAS view that receives the video URL
+ */
+ public DUUIYouTubeReader(String youtubeLink, String apiKey, String viewName) throws IOException, InterruptedException {
+ this(youtubeLink, apiKey, viewName, 25, getRandomFromMode(null, -1), true, null);
+ }
+
+ /**
+  * Creates a reader for a single video, a playlist or a channel and resolves all video ids up front.
+  * <p>
+  * Recognised links: any link with a {@code list=} query parameter (playlist), {@code watch?v=} links and
+  * {@code youtu.be/} short links (single video), and {@code /@handle} or {@code /channel/<id>} links
+  * (channel). A link matching none of these yields a reader without videos.
+  *
+  * @param youtubeLink  link to a video, playlist or channel
+  * @param apiKey       YouTube Data API key used for every request
+  * @param viewName     name of the CAS view that receives the video URL
+  * @param debugCount   controls how often progress is printed while reading
+  * @param iRandom      if greater than 0, only a random sample of this many videos is kept
+  * @param bAddMetadata whether video metadata is fetched and later written to the CAS
+  * @param language     document language to set, or {@code null}/empty to leave it unset
+  * @throws IOException          if a request to the API fails
+  * @throws InterruptedException if the thread is interrupted while waiting for the API
+  */
+ public DUUIYouTubeReader(String youtubeLink, String apiKey, String viewName, int debugCount, int iRandom, boolean bAddMetadata, String language) throws IOException, InterruptedException {
+     _addMetadata = bAddMetadata;
+     _language = language;
+     _youtubeVideos = new ConcurrentLinkedQueue<>();
+     _youtubeVideosBackup = new ConcurrentLinkedQueue<>();
+     _videosPlaylists = new HashMap<>();
+     _apiKey = apiKey;
+     _viewName = viewName;
+
+     if (youtubeLink.contains("&list=") || youtubeLink.contains("?list=")) { // Is playlist
+         String playlistId = "";
+         for (String parameter : youtubeLink.split("[?&]")) {
+             if (parameter.startsWith("list=")) {
+                 playlistId = parameter.substring(5);
+                 break;
+             }
+         }
+         collectPlaylistVideos(playlistId);
+     } else if (youtubeLink.contains("watch?v")) { // Is single video
+         collectSingleVideo(youtubeLink.split("watch\\?v=")[1].split("&")[0]);
+     } else if (youtubeLink.contains("youtu.be/")) { // Is single video with shortened url
+         // Short links carry extra parameters after '?', e.g. youtu.be/<id>?t=10
+         collectSingleVideo(youtubeLink.split("youtu.be/")[1].split("[?&]")[0]);
+     } else { // Is channel
+         String channelId = null;
+
+         if (youtubeLink.contains("/@")) {
+             channelId = getChannelIdByHandle(youtubeLink.split("@")[1].split("/")[0]);
+         } else if (youtubeLink.contains("/channel/")) {
+             channelId = youtubeLink.split("/channel/")[1].split("/")[0];
+         }
+
+         if (channelId != null) {
+             collectChannelVideos(channelId);
+         }
+     }
+
+     if (iRandom > 0) {
+         _youtubeVideos = random(_youtubeVideos, iRandom);
+     }
+
+     _youtubeVideosBackup.addAll(_youtubeVideos);
+
+     this.debugCount = debugCount;
+
+     System.out.printf("Found %d files matching the pattern! \t Using Random: %d\n", _youtubeVideos.size(), iRandom);
+     _initialSize = _youtubeVideos.size();
+     _docNumber = new AtomicInteger(0);
+     _currentMemorySize = new AtomicLong(0);
+     // 500 MB
+     _maxMemory = 500 * 1024 * 1024;
+
+     progress = new AdvancedProgressMeter(_initialSize);
+ }
+
+ /** Queues one video and, if enabled, fetches its metadata. */
+ private void collectSingleVideo(String videoId) throws IOException, InterruptedException {
+     YouTubeVideo video = new YouTubeVideo(videoId);
+     _youtubeVideos.add(video);
+
+     if (_addMetadata) {
+         generateMetadata(video);
+     }
+ }
+
+ /** Queues every video of a playlist, following the API's page tokens until the last page. */
+ private void collectPlaylistVideos(String playlistId) throws IOException, InterruptedException {
+     String pageToken = "";
+
+     do {
+         List<YouTubeVideo> pagedVideos = new LinkedList<>();
+
+         JSONObject jsonObject = getPlaylistVideos(playlistId, pageToken);
+         JSONArray jsonItems = jsonObject.getJSONArray("items");
+
+         for (int i = 0; i < jsonItems.length(); i++) {
+             String videoId = jsonItems.getJSONObject(i).getJSONObject("contentDetails").getString("videoId");
+             pagedVideos.add(new YouTubeVideo(videoId));
+
+             _videosPlaylists.put(videoId, Arrays.asList(playlistId));
+         }
+
+         if (_addMetadata) {
+             generateBulkMetadata(pagedVideos);
+         }
+
+         pageToken = jsonObject.has("nextPageToken") ? jsonObject.getString("nextPageToken") : "";
+
+         _youtubeVideos.addAll(pagedVideos);
+     } while (!pageToken.equals(""));
+ }
+
+ /** Queues every video of a channel, following the API's page tokens until the last page. */
+ private void collectChannelVideos(String channelId) throws IOException, InterruptedException {
+     String pageToken = "";
+
+     do {
+         List<YouTubeVideo> pagedVideos = new LinkedList<>();
+
+         // The current token must be sent along, otherwise the first page is fetched over and over.
+         JSONObject jsonObject = getChannelVideosByChannelId(channelId, pageToken);
+         JSONArray jsonItems = jsonObject.getJSONArray("items");
+
+         for (int i = 0; i < jsonItems.length(); i++) {
+             JSONObject idRequestObject = jsonItems.getJSONObject(i).getJSONObject("id");
+
+             if (!idRequestObject.has("videoId")) continue; // Found own channel instead of video
+
+             String videoId = idRequestObject.getString("videoId");
+             pagedVideos.add(new YouTubeVideo(videoId));
+             System.out.println("Added video: " + i);
+         }
+
+         if (_addMetadata) {
+             generateBulkMetadata(pagedVideos);
+         }
+
+         pageToken = jsonObject.has("nextPageToken") ? jsonObject.getString("nextPageToken") : "";
+
+         _youtubeVideos.addAll(pagedVideos);
+     } while (!pageToken.equals(""));
+ }
+
+ /**
+  * Encodes the sample mode in the sign of the sample size.
+  *
+  * @param sampleMode sampling mode, may be {@code null}
+  * @param sampleSize requested sample size
+  * @return the negated size for {@code SMALLEST}, otherwise the size unchanged
+  */
+ private static int getRandomFromMode(AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE sampleMode, int sampleSize) {
+     boolean smallestRequested = sampleMode == AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE.SMALLEST;
+     return smallestRequested ? -sampleSize : sampleSize;
+ }
+
+ /**
+  * Tells whether the given sample mode asks for sorting.
+  *
+  * @param mode sampling mode, may be {@code null}
+  * @return {@code false} for {@code RANDOM}, {@code true} for every other value including {@code null}
+  */
+ private static boolean getSortFromMode(AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE mode) {
+     return mode != AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE.RANDOM;
+ }
+
+ /**
+  * Recursively collects the paths of all files below {@code folder} whose name ends with {@code ending}.
+  *
+  * @param folder directory to scan; anything that cannot be listed is skipped silently
+  * @param ending required file name suffix, for example {@code ".xmi"}
+  * @param paths  queue the matching paths are appended to
+  */
+ public static void addFilesToConcurrentList(File folder, String ending, ConcurrentLinkedQueue<String> paths) {
+     File[] listOfFiles = folder.listFiles();
+
+     // listFiles() returns null if the path is not a directory or cannot be read.
+     if (listOfFiles == null) {
+         return;
+     }
+
+     for (File entry : listOfFiles) {
+         if (entry.isFile()) {
+             if (entry.getName().endsWith(ending)) {
+                 paths.add(entry.getPath());
+             }
+         } else if (entry.isDirectory()) {
+             addFilesToConcurrentList(entry, ending, paths);
+         }
+     }
+ }
+
+ /**
+  * Returns the given file paths ordered by file size, largest first.
+  * <p>
+  * Paths with equal size keep their relative order. Files that do not exist count as size 0.
+  *
+  * @param paths paths to files
+  * @return a new queue with the paths sorted by descending file length
+  */
+ public static ConcurrentLinkedQueue<String> sortBySize(ConcurrentLinkedQueue<String> paths) {
+     List<String> ordered = new ArrayList<>(paths);
+     ordered.sort(Comparator.comparingLong((String path) -> new File(path).length()).reversed());
+     return new ConcurrentLinkedQueue<>(ordered);
+ }
+
+ /**
+  * Drops every file that is smaller than {@code skipSmallerFiles} bytes and reports the
+  * number of files before and after filtering on standard output.
+  *
+  * @param paths            paths to files
+  * @param skipSmallerFiles minimum file size in bytes; smaller files are skipped
+  * @return a new queue with the remaining paths in their original order
+  */
+ public static ConcurrentLinkedQueue<String> skipBySize(ConcurrentLinkedQueue<String> paths, int skipSmallerFiles) {
+     System.out.println("Skip files smaller than " + skipSmallerFiles + " bytes");
+     System.out.println(" Number of files before skipping: " + paths.size());
+
+     ConcurrentLinkedQueue<String> rQueue = paths
+             .stream()
+             .filter(s -> new File(s).length() >= skipSmallerFiles)
+             .collect(Collectors.toCollection(ConcurrentLinkedQueue::new));
+
+     System.out.println(" Number of files after skipping: " + rQueue.size());
+
+     return rQueue;
+ }
+
+ /**
+  * Draws a sample of at most {@code iRandom} videos.
+  * <p>
+  * The shuffle is seeded with {@code iRandom} itself, so the same input and sample size
+  * always produce the same sample.
+  *
+  * @param videos  videos to sample from; not modified
+  * @param iRandom sample size, also used as the random seed
+  * @return a new queue holding the sampled videos
+  */
+ public static ConcurrentLinkedQueue<YouTubeVideo> random(ConcurrentLinkedQueue<YouTubeVideo> videos, int iRandom) {
+     List<YouTubeVideo> shuffled = new ArrayList<>(videos);
+     Collections.shuffle(shuffled, new Random(iRandom));
+
+     // Never ask for more elements than are available.
+     int sampleSize = Math.min(iRandom, shuffled.size());
+     return new ConcurrentLinkedQueue<>(shuffled.subList(0, sampleSize));
+ }
+
+ /**
+ * Returns the progress meter of this reader; reset() replaces it with a fresh instance.
+ */
+ @Override
+ public AdvancedProgressMeter getProgress() {
+ return this.progress;
+ }
+
+ /**
+  * Takes the next video from the queue and writes it into the given CAS.
+  * <p>
+  * The CAS is reset, the video URL becomes the sofa of the configured view (MIME type
+  * {@code text/x-uri}) and, if metadata is enabled, the YouTube metadata and a
+  * {@link DocumentMetaData} annotation are added. Failures while filling the CAS are
+  * printed and do not abort the call.
+  *
+  * @param empty the CAS to fill; its previous content is discarded
+  * @throws NoSuchElementException if no video is left
+  */
+ @Override
+ public void getNextCas(JCas empty) {
+
+     YouTubeVideo result = _youtubeVideos.poll();
+
+     // Fail before touching the counters; another consumer may have taken the last video.
+     if (result == null) {
+         throw new NoSuchElementException("No YouTube video left to read");
+     }
+
+     int val = _docNumber.addAndGet(1);
+
+     progress.setDone(val);
+     progress.setLeft(_initialSize - val);
+
+     // Print every debugCount-th document, and every document once the end is near.
+     boolean nearEnd = _initialSize - progress.getCount() <= debugCount;
+     if (nearEnd || val % debugCount == 0) {
+         System.out.printf("%s \t %s\n", progress, result.getVideoUrl());
+     }
+
+     try {
+         empty.reset();
+
+         JCas ytView;
+         try {
+             ytView = empty.getView(_viewName);
+         } catch (Exception e) {
+             // The view does not exist yet.
+             ytView = empty.createView(_viewName);
+         }
+
+         ytView.setSofaDataString(result.getVideoUrl(), "text/x-uri");
+
+         if (_addMetadata)
+             setVideoMetadata(result, ytView);
+
+     } catch (Exception e) {
+         if (e instanceof InterruptedException) {
+             // Keep the interrupt visible to the caller's thread handling.
+             Thread.currentThread().interrupt();
+         }
+         e.printStackTrace();
+     }
+
+     if (_addMetadata) {
+         if (JCasUtil.select(empty, DocumentMetaData.class).size() == 0) {
+             DocumentMetaData dmd = DocumentMetaData.create(empty);
+             dmd.setDocumentId(result._id);
+             dmd.setDocumentTitle(result._title);
+             //dmd.setDocumentUri(result.getVideoUrl());
+             dmd.addToIndexes();
+         }
+     }
+
+     if (_language != null && !_language.isEmpty()) {
+         empty.setDocumentLanguage(_language);
+     }
+
+ }
+
+ /**
+  * Rewinds the reader so that all initially selected videos are handed out again.
+  */
+ public void reset() {
+     // Work on a copy: polling must never drain the backup, or a later reset would yield nothing.
+     _youtubeVideos = new ConcurrentLinkedQueue<>(_youtubeVideosBackup);
+     _docNumber.set(0);
+     progress = new AdvancedProgressMeter(_initialSize);
+ }
+
+ /**
+  * @return {@code true} while at least one video is still queued
+  */
+ @Override
+ public boolean hasNext() {
+     return !_youtubeVideos.isEmpty();
+ }
+
+ /**
+ * Returns the number of videos currently in the queue. getNextCas removes one video per call,
+ * so this value shrinks while reading.
+ * NOTE(review): confirm whether the interface expects the total size instead of the remaining size.
+ */
+ @Override
+ public long getSize() {
+ return _youtubeVideos.size();
+ }
+
+ /**
+ * Returns the number of videos handed out by getNextCas since construction or the last reset().
+ */
+ @Override
+ public long getDone() {
+ return _docNumber.get();
+ }
+
+ /**
+  * Formats a byte count as a short human-readable string using 1024-based unit steps.
+  * <p>
+  * Values up to 1024 are printed as whole bytes (for example {@code "500 B"}). Larger values are
+  * scaled to the matching unit and printed with one decimal place (for example {@code "1.5 kB"}).
+  *
+  * @param lSize size in bytes
+  * @return the formatted size; the decimal separator follows the default locale
+  */
+ public String formatSize(long lSize) {
+     // Small values must not be divided by 1024 while still being labelled as bytes.
+     if (lSize <= 1024) {
+         return String.format("%d B", lSize);
+     }
+
+     // At least the kB step applies here; every further factor of 1024 moves one unit up.
+     int u = 1;
+     for (; lSize > 1024 * 1024; lSize >>= 10) {
+         u++;
+     }
+     return String.format("%.1f %cB", lSize / 1024f, " kMGTPE".charAt(u));
+ }
+
+ /**
+ * Modes for picking a sample out of a collection.
+ * The helper methods of this class (getRandomFromMode, getSortFromMode) take the enum of the
+ * same name declared in AsyncCollectionReader, not this one.
+ */
+ public enum DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE {
+ RANDOM,
+ SMALLEST,
+ LARGEST
+ }
+
+ /**
+  * Requests one page (up to 50 entries) of a playlist's items from the YouTube Data API.
+  *
+  * @param playlistId id of the playlist
+  * @param pageToken  token of the page to fetch; empty for the first page
+  * @return the parsed JSON response
+  * @throws IOException          if the request fails
+  * @throws InterruptedException if the thread is interrupted while waiting for the response
+  */
+ private JSONObject getPlaylistVideos(String playlistId, String pageToken) throws IOException, InterruptedException {
+     URI endpoint = URI.create("https://youtube.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=" + playlistId + "&key=" + _apiKey + "&maxResults=50&pageToken=" + pageToken);
+     HttpRequest playlistRequest = HttpRequest.newBuilder(endpoint).build();
+
+     HttpResponse<String> reply = HttpClient.newHttpClient().send(playlistRequest, HttpResponse.BodyHandlers.ofString());
+
+     return new JSONObject(reply.body());
+ }
+
+ /**
+  * Resolves a channel handle to a channel id by searching the YouTube Data API for channels
+  * and taking the first hit.
+  *
+  * @param channelHandle handle without the leading {@code @}
+  * @return the channel id of the first search result, or {@code null} if nothing was found
+  * @throws IOException          if the request fails
+  * @throws InterruptedException if the thread is interrupted while waiting for the response
+  */
+ private String getChannelIdByHandle(String channelHandle) throws IOException, InterruptedException {
+     // Handles may contain characters that are not allowed verbatim in a query string.
+     String encodedHandle = java.net.URLEncoder.encode(channelHandle, java.nio.charset.StandardCharsets.UTF_8);
+
+     String url = "https://youtube.googleapis.com/youtube/v3/search?part=snippet&maxResults=1&q=" + encodedHandle + "&type=channel&key=" + _apiKey;
+     HttpClient client = HttpClient.newHttpClient();
+     HttpRequest request = HttpRequest.newBuilder()
+             .uri(URI.create(url))
+             .build();
+
+     HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
+
+     JSONArray resultArray = new JSONObject(response.body()).getJSONArray("items");
+
+     if (resultArray.length() == 0) return null;
+
+     return resultArray.getJSONObject(0).getJSONObject("id").getString("channelId");
+ }
+
+ /**
+  * Requests one page (up to 50 entries, ordered by date) of a channel's search results
+  * from the YouTube Data API.
+  *
+  * @param channelId id of the channel
+  * @param pageToken token of the page to fetch; empty for the first page
+  * @return the parsed JSON response
+  * @throws IOException          if the request fails
+  * @throws InterruptedException if the thread is interrupted while waiting for the response
+  */
+ private JSONObject getChannelVideosByChannelId(String channelId, String pageToken) throws IOException, InterruptedException {
+     URI endpoint = URI.create("https://www.googleapis.com/youtube/v3/search?key=" + _apiKey + "&channelId=" + channelId + "&part=id&order=date&maxResults=50&pageToken=" + pageToken);
+     HttpRequest searchRequest = HttpRequest.newBuilder(endpoint).build();
+
+     HttpResponse<String> reply = HttpClient.newHttpClient().send(searchRequest, HttpResponse.BodyHandlers.ofString());
+
+     return new JSONObject(reply.body());
+ }
+
+ /**
+ * Writes the metadata of a video into the given view as a YouTube annotation. For videos that
+ * were collected from a playlist, one Playlist annotation per playlist is created as well.
+ * Playlist details are requested from the API on every call.
+ * @param video the video whose metadata is written
+ * @param jCas the view that receives the annotations
+ * @throws IOException if a playlist request fails
+ * @throws InterruptedException if the thread is interrupted while waiting for a response
+ */
+ private void setVideoMetadata(YouTubeVideo video, JCas jCas) throws IOException, InterruptedException{
+
+ YouTube youTube = new YouTube(jCas);
+
+ // Only videos collected through a playlist link have an entry in _videosPlaylists.
+ if(_videosPlaylists.containsKey(video.getVideoId())){
+ List playlistIds = _videosPlaylists.get(video.getVideoId());
+ Playlist[] playlists = new Playlist[_videosPlaylists.get(video.getVideoId()).size()];
+
+ for (int i = 0; i < playlistIds.size(); i++){
+ Playlist playlist = new Playlist(jCas);
+
+ String playlistUrl = "https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=" + playlistIds.get(i) + "&key=" + _apiKey;
+
+ HttpClient client = HttpClient.newHttpClient();
+ HttpRequest request = HttpRequest.newBuilder()
+ .uri(URI.create(playlistUrl))
+ .build();
+
+ HttpResponse response = client.send(request, HttpResponse.BodyHandlers.ofString());
+
+
+ JSONObject playlistJsonObject = new JSONObject(response.body().toString());
+ // Only the first returned item is used; an empty "items" array makes getJSONObject(0) throw.
+ JSONObject playlistItem = playlistJsonObject.getJSONArray("items").getJSONObject(0);
+ JSONObject playlistSnippet = playlistItem.getJSONObject("snippet");
+
+ playlist.setName(playlistSnippet.getString("title"));
+ playlist.setCreateDate(youtubeDateToInt(playlistSnippet.getString("publishedAt")));
+ playlist.setUrl("https://www.youtube.com/watch?v=" + video.getVideoId() + "&list=" + playlistIds.get(i));
+
+ playlists[i] = playlist;
+ }
+
+ // NOTE(review): the list is added to the indexes but not attached to the YouTube annotation in this method - confirm this is intended.
+ FSList list = FSList.create(jCas, playlists);
+ list.addToIndexes();
+ }
+
+ youTube.setUrl(video.getVideoUrl());
+ youTube.setChannelName(video._channelName);
+ youTube.setChannelURL(video._channelUrl);
+ youTube.setLength(video._duration);
+ youTube.setViews(video._views);
+ youTube.setLikes(video._likes);
+ youTube.setDislikes(0); // Does not support dislikes
+ youTube.setCreateDate(video._createDate);
+
+ // Dates are stored as integers in ddMMyyyy form (see currentDateToInt).
+ youTube.setDownloadDate(currentDateToInt());
+ youTube.addToIndexes();
+ }
+
+ /**
+  * Fetches the metadata of a single video by delegating to the bulk variant.
+  *
+  * @param video the video to fill with metadata
+  * @throws IOException          if the request fails
+  * @throws InterruptedException if the thread is interrupted while waiting for the response
+  */
+ private void generateMetadata(YouTubeVideo video) throws IOException, InterruptedException {
+     generateBulkMetadata(new LinkedList<>(Collections.singletonList(video)));
+ }
+
+ /**
+  * Fetches snippet, statistics and content details for all given videos with one API request
+  * and stores them on the matching {@link YouTubeVideo} objects.
+  * <p>
+  * Response items are matched to videos by id, not by position. Videos the API does not
+  * return are left without metadata.
+  *
+  * @param videos the videos to fill with metadata; an empty list is a no-op
+  * @throws IOException          if the request fails
+  * @throws InterruptedException if the thread is interrupted while waiting for the response
+  */
+ private void generateBulkMetadata(List<YouTubeVideo> videos) throws IOException, InterruptedException {
+     if (videos.isEmpty()) return;
+
+     String ids = videos.stream()
+             .map(YouTubeVideo::getVideoId)
+             .collect(Collectors.joining(","));
+
+     String url = "https://youtube.googleapis.com/youtube/v3/videos?part=snippet%2Cstatistics%2CcontentDetails&id=" + ids + "&key=" + _apiKey;
+     HttpClient client = HttpClient.newHttpClient();
+     HttpRequest request = HttpRequest.newBuilder()
+             .uri(URI.create(url))
+             .build();
+
+     HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
+
+     JSONArray items = new JSONObject(response.body()).getJSONArray("items");
+
+     // Index the response by video id; the API may return fewer items than were requested.
+     Map<String, JSONObject> itemsById = new HashMap<>();
+     for (int i = 0; i < items.length(); i++) {
+         JSONObject item = items.getJSONObject(i);
+         itemsById.put(item.getString("id"), item);
+     }
+
+     for (YouTubeVideo video : videos) {
+         JSONObject item = itemsById.get(video.getVideoId());
+         if (item == null) continue; // No metadata returned for this video
+
+         video.setMetadata(
+                 item.getJSONObject("snippet"),
+                 item.getJSONObject("statistics"),
+                 item.getJSONObject("contentDetails"));
+     }
+ }
+
+
+ /**
+  * Converts a timestamp of the form {@code <a>-<b>-<c>T<time>} into the integer formed by the
+  * digits {@code c}, {@code b}, {@code a} in that order; {@code "2023-01-15T10:00:00Z"}
+  * becomes {@code 15012023}.
+  *
+  * @param youtubeDate timestamp as delivered by the API
+  * @return the date as an integer in day-month-year digit order
+  */
+ private int youtubeDateToInt(String youtubeDate) {
+     String datePart = youtubeDate.split("T")[0]; // Separate date and time
+     String[] parts = datePart.split("-");
+
+     // Single-character first and second components are left-padded with a zero.
+     String first = parts[0].length() == 1 ? "0" + parts[0] : parts[0];
+     String second = parts[1].length() == 1 ? "0" + parts[1] : parts[1];
+
+     return Integer.parseInt(parts[2] + second + first);
+ }
+
+ /**
+  * @return today's date in the system default time zone as an integer in {@code ddMMyyyy} digit order
+  */
+ private int currentDateToInt() {
+     String today = DateTimeFormatter.ofPattern("ddMMyyyy").format(ZonedDateTime.now());
+     return Integer.parseInt(today);
+ }
+
+ /**
+  * Reads a text file and returns its lines joined with {@code "\n"}.
+  * <p>
+  * The file is decoded with the platform default charset, as {@link Scanner#Scanner(File)} does.
+  *
+  * @param file the file to read
+  * @return the file content without a trailing line break; empty if the file has no lines
+  * @throws FileNotFoundException if the file cannot be opened
+  */
+ private String readFile(File file) throws FileNotFoundException {
+     StringBuilder content = new StringBuilder();
+
+     // try-with-resources releases the underlying file handle even if reading fails.
+     try (Scanner reader = new Scanner(file)) {
+         // An explicit flag is used because the content itself may start with an empty line.
+         boolean first = true;
+         while (reader.hasNextLine()) {
+             if (!first) {
+                 content.append('\n');
+             }
+             content.append(reader.nextLine());
+             first = false;
+         }
+     }
+
+     return content.toString();
+ }
+
+ /**
+  * A YouTube video identified by its id, together with the metadata fetched from the API.
+  * All metadata fields keep their default values until {@link #setMetadata} is called.
+  */
+ class YouTubeVideo {
+     private String _id;
+     private String _channelName;
+     private String _channelUrl;
+     private String _title;
+     // Length of the video in seconds.
+     private int _duration;
+     private int _views;
+     private int _likes;
+     // Publication date as an integer in ddMMyyyy digit order.
+     private int _createDate;
+
+     public YouTubeVideo(String id) {
+         _id = id;
+     }
+
+     public String getVideoId() {
+         return _id;
+     }
+
+     public String getVideoUrl() {
+         return "https://www.youtube.com/watch?v=" + _id;
+     }
+
+     /**
+      * Takes over the metadata from the three parts of a {@code videos} API item.
+      *
+      * @param snippet        the item's {@code snippet} object (title, channel, publication date)
+      * @param statistics     the item's {@code statistics} object; missing counters count as 0
+      * @param contentDetails the item's {@code contentDetails} object (ISO-8601 duration)
+      */
+     public void setMetadata(JSONObject snippet, JSONObject statistics, JSONObject contentDetails) {
+         _title = snippet.getString("title");
+         _channelName = snippet.getString("channelTitle");
+         _channelUrl = "https://www.youtube.com/channel/" + snippet.getString("channelId");
+
+         // The API reports durations in ISO-8601 form such as PT1H2M3S, PT5M or P1DT2H;
+         // Duration.parse handles every combination, including omitted components.
+         long seconds = java.time.Duration.parse(contentDetails.getString("duration")).getSeconds();
+         _duration = clampToInt(seconds);
+
+         // Counters can be hidden by the uploader and can exceed the int range.
+         _views = clampToInt(statistics.optLong("viewCount", 0L));
+         _likes = clampToInt(statistics.optLong("likeCount", 0L));
+
+         _createDate = youtubeDateToInt(snippet.getString("publishedAt"));
+     }
+
+     /** Limits a non-negative counter to the int range instead of overflowing. */
+     private int clampToInt(long value) {
+         return (int) Math.max(0L, Math.min(Integer.MAX_VALUE, value));
+     }
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/writer/AudioSegmentWriter.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/writer/AudioSegmentWriter.java
new file mode 100644
index 00000000..5674f33d
--- /dev/null
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/writer/AudioSegmentWriter.java
@@ -0,0 +1,69 @@
+package org.texttechnologylab.DockerUnifiedUIMAInterface.io.writer;
+
+import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
+import org.apache.commons.io.FileUtils;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.dkpro.core.api.io.JCasFileWriter_ImplBase;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.tools.MultimodalUtil;
+import org.texttechnologylab.annotation.type.AudioToken;
+
+import java.io.File;
+import java.io.IOException;
+
+ /**
+ * Writer that cuts the audio covered by the AudioToken annotations of a CAS into separate
+ * "wav" files and moves them into the configured target location.
+ */
+ public class AudioSegmentWriter extends JCasFileWriter_ImplBase {
+
+ // Name of the view that holds the audio tokens.
+ public static final String PARAM_AUDIO_TOKEN_VIEW = "audioTokenView";
+ @ConfigurationParameter(name = PARAM_AUDIO_TOKEN_VIEW, defaultValue = "_InitialView")
+ private String audioTokenView;
+
+ // Name of the view that holds the complete audio file.
+ public static final String PARAM_AUDIO_CONTENT_VIEW = "audioView";
+ @ConfigurationParameter(name = PARAM_AUDIO_CONTENT_VIEW, defaultValue = "_InitialView")
+ private String audioView;
+
+ /**
+ * Extracts all audio segments of the document and moves each one to
+ * "<target location>/<document id>_<file name>". If the CAS has no DocumentMetaData or no
+ * document id, the prefix "File_" is used instead. Failures are printed and not propagated.
+ * @param jCas the document to process
+ */
+ @Override
+ public void process(JCas jCas) {
+
+
+ try {
+ DocumentMetaData meta = null;
+ if (JCasUtil.select(jCas, DocumentMetaData.class).size() > 0) {
+ meta = DocumentMetaData.get(jCas);
+ }
+
+ // The lambda below can only capture an effectively final variable.
+ DocumentMetaData finalMeta = meta;
+
+ JCas audioFileView = jCas.getView(audioView);
+
+ MultimodalUtil.getAllCoveredAudio(jCas.getView(audioTokenView), audioFileView, AudioToken.class, "wav").forEach(file -> {
+
+ String moveTo = getTargetLocation();
+
+ // Make sure the target location ends with a path separator before appending the file name.
+ if(!moveTo.endsWith("/") && !moveTo.endsWith("\\")){
+ moveTo = moveTo + "/";
+ }
+
+ String documentName;
+
+ if(finalMeta != null && finalMeta.getDocumentId() != null){
+ documentName = finalMeta.getDocumentId() + "_";
+ }else{
+ documentName = "File_";
+ }
+
+ try {
+ FileUtils.moveFile(new File(file.getAbsolutePath()), new File(moveTo + documentName + file.getName()));
+ } catch (IOException e) {
+ // A segment that cannot be moved is reported and skipped.
+ e.printStackTrace();
+ }
+ }
+
+ );
+ } catch (CASException e) {
+ e.printStackTrace();
+ }
+ }
+ }
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/lua/DUUILuaCommunicationLayer.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/lua/DUUILuaCommunicationLayer.java
index fcb99e1f..fb1777de 100644
--- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/lua/DUUILuaCommunicationLayer.java
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/lua/DUUILuaCommunicationLayer.java
@@ -1,6 +1,7 @@
package org.texttechnologylab.DockerUnifiedUIMAInterface.lua;
import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.uima.cas.CASException;
import org.apache.uima.jcas.JCas;
import org.luaj.vm2.LuaTable;
import org.luaj.vm2.lib.jse.CoerceJavaToLua;
@@ -32,18 +33,37 @@ public DUUILuaCommunicationLayer(String script, String origin, DUUILuaContext gl
_globalContext = globalContext;
}
- public void serialize(JCas jc, ByteArrayOutputStream out, Map parameters) throws CompressorException, IOException, SAXException {
+ public void serialize(JCas jc, ByteArrayOutputStream out, Map parameters) throws CompressorException, IOException, SAXException, CASException {
+ serialize(jc, out, parameters, "_InitialView");
+ }
+
+ public void serialize(JCas jc, ByteArrayOutputStream out, Map parameters, String sourceView) throws CompressorException, IOException, SAXException, CASException {
LuaTable params = new LuaTable();
if (parameters != null) {
for (String key : parameters.keySet()) {
params.set(key, parameters.get(key));
}
}
- _file.call("serialize",CoerceJavaToLua.coerce(jc),CoerceJavaToLua.coerce(out), params);
+
+ _file.call("serialize",CoerceJavaToLua.coerce(jc.getView(sourceView)),CoerceJavaToLua.coerce(out), params);
+ }
+
+ public void deserialize(JCas jc, ByteArrayInputStream input) throws IOException, SAXException, CASException {
+ deserialize(jc, input, "_InitialView");
}
- public void deserialize(JCas jc, ByteArrayInputStream input) throws IOException, SAXException {
- _file.call("deserialize",CoerceJavaToLua.coerce(jc),CoerceJavaToLua.coerce(input));
+
+ public void deserialize(JCas jc, ByteArrayInputStream input, String targetView) throws IOException, SAXException, CASException {
+
+ JCas tJc;
+
+ try{
+ tJc = jc.getView(targetView);
+ }catch (Exception e){
+ tJc = jc.createView(targetView);
+ }
+
+ _file.call("deserialize",CoerceJavaToLua.coerce(tJc),CoerceJavaToLua.coerce(input));
}
diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/tools/MultimodalUtil.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/tools/MultimodalUtil.java
new file mode 100644
index 00000000..bc34867e
--- /dev/null
+++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/tools/MultimodalUtil.java
@@ -0,0 +1,560 @@
+package org.texttechnologylab.DockerUnifiedUIMAInterface.tools;
+
+import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
+import org.apache.commons.codec.binary.Base64;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.texttechnologylab.annotation.AnnotationComment;
+import org.texttechnologylab.annotation.type.AudioToken;
+import org.texttechnologylab.annotation.type.Coordinate;
+import org.texttechnologylab.annotation.type.SubImage;
+
+import javax.imageio.ImageIO;
+import java.awt.*;
+import java.awt.geom.Path2D;
+import java.awt.image.BufferedImage;
+import java.awt.image.RenderedImage;
+import java.io.*;
+import java.nio.file.Files;
+import java.util.*;
+import java.util.List;
+
+public class MultimodalUtil {
+
+ /**
+ * Converts the audio covered by each annotation of the given type into its own "wav" snippet.
+ * The view holding the complete audio file is auto-detected.
+ * @param audioTokenView The view in which the audio tokens are stored
+ * @param annotationClass The annotation the covered elements are derived from
+ * @return List of files, each file containing the audio content.
+ * @throws CASException
+ */
+ public static List getAllCoveredAudio(JCas audioTokenView, Class annotationClass) throws CASException {
+ return getAllCoveredAudio(audioTokenView, null, annotationClass, "wav");
+ }
+
+ /**
+ * Converts the audio covered by each annotation of the given type into its own snippet.
+ * The view holding the complete audio file is auto-detected.
+ * @param audioTokenView The view in which the audio tokens are stored
+ * @param annotationClass The annotation the covered elements are derived from
+ * @param targetFormat File format for the output file. (like "wav" or "mp3")
+ * @return List of files, each file containing the audio content.
+ * @throws CASException
+ */
+ public static List getAllCoveredAudio(JCas audioTokenView, Class annotationClass, String targetFormat) throws CASException {
+ return getAllCoveredAudio(audioTokenView, null, annotationClass, targetFormat);
+ }
+
+ /**
+ * Converts the audio covered by each annotation of the given type into its own snippet.
+ * For every annotation the time span from the earliest start to the latest end of its
+ * overlapping audio tokens is cut out. Annotations without overlapping tokens are skipped.
+ * @param audioTokenView The view in which the audio tokens are stored
+ * @param audioFileView The view containing the entire audio file in its sofa string (If null, tries to auto-detect)
+ * @param annotationClass The annotation the covered elements are derived from
+ * @param targetFormat File format for the output file. (like "wav" or "mp3")
+ * @return List of files, each file containing the audio content.
+ * @throws CASException
+ */
+ public static List getAllCoveredAudio(JCas audioTokenView, JCas audioFileView, Class annotationClass, String targetFormat) throws CASException {
+
+ List files = new ArrayList<>();
+ // One argument fragment ("-ss <start> -t <length> <output>") per snippet.
+ List commands = new LinkedList<>();
+
+ JCasUtil.select(audioTokenView, annotationClass).forEach(annotation -> {
+
+ // Integer.MAX_VALUE marks "no token seen yet".
+ float startTime = Integer.MAX_VALUE;
+ float endTime = 0;
+
+ List tokens = JCasUtil.selectOverlapping(AudioToken.class, annotation).stream().toList();
+
+ for(AudioToken token : tokens){
+ // Prints the time range of every token to standard output.
+ System.out.println(token.getTimeStart() + " " + token.getTimeEnd());
+ if(token.getTimeStart() < startTime)
+ startTime = token.getTimeStart();
+
+ if(token.getTimeEnd() > endTime)
+ endTime = token.getTimeEnd();
+ }
+
+ // No overlapping audio token: nothing to cut for this annotation.
+ if(startTime == Integer.MAX_VALUE) {
+ return;
+ }
+
+ commands.add(String.format("-ss %s -t %s %s",
+ startTime,
+ endTime - startTime,
+ getOutputName(audioTokenView, annotation, targetFormat)));
+
+
+ File file = new File(getOutputName(audioTokenView, annotation, targetFormat));
+ file.deleteOnExit();
+ files.add(file);
+ });
+
+ // All snippets are produced from the collected fragments in one step.
+ MultimodalUtil.getEveryAudioSegment(audioTokenView, audioFileView, commands);
+
+ return files;
+ }
+
+ /**
+ * Converts an AudioToken into its own "wav" audio snippet.
+ * The view holding the complete audio file is auto-detected.
+ * @param audioTokenView The view in which the audio tokens are stored
+ * @param audioToken The audio token whose time range is cut out
+ * @return A file containing the audio segment
+ * @throws CASException
+ */
+ public static File getCoveredAudio(JCas audioTokenView, AudioToken audioToken) throws CASException {
+ return getCoveredAudio(audioTokenView, null, audioToken, "wav");
+ }
+
+ /**
+ * Converts an AudioToken into its own audio snippet.
+ * The view holding the complete audio file is auto-detected.
+ * @param audioTokenView The view in which the audio tokens are stored
+ * @param audioToken The audio token whose time range is cut out
+ * @param targetFormat File format for the output file. (like "wav" or "mp3")
+ * @return A file containing the audio segment
+ * @throws CASException
+ */
+ public static File getCoveredAudio(JCas audioTokenView, AudioToken audioToken, String targetFormat) throws CASException {
+ return getCoveredAudio(audioTokenView, null, audioToken, targetFormat);
+ }
+
+ /**
+  * Converts an AudioToken into its own audio snippet.
+  * <p>
+  * The Base64-encoded sofa string of the audio view is written to a temporary file named
+  * {@code temp_<view name>} (unless that file already exists), the token's time range is cut
+  * out of it, and the temporary input file is deleted again.
+  *
+  * @param audioTokenView The view in which the audio tokens are stored
+  * @param audioFileView  The view containing the entire audio file in its sofa string (If null, tries to auto-detect)
+  * @param audioToken     The audio token whose time range is cut out
+  * @param targetFormat   File format for the output file. (like "wav" or "mp3")
+  * @return A file containing the audio segment; it is marked for deletion on JVM exit
+  * @throws CASException
+  */
+ public static File getCoveredAudio(JCas audioTokenView, JCas audioFileView, AudioToken audioToken, String targetFormat) throws CASException {
+
+     if (audioFileView == null)
+         audioFileView = findAudioView(audioTokenView);
+
+     String inputFileName = "temp_" + audioFileView.getViewName();
+     String outputFileName = getOutputName(audioTokenView, audioToken, targetFormat);
+
+     if (!new File(inputFileName).exists()) {
+         // Convert encoded string to file; the stream is closed even if decoding or writing fails.
+         try (OutputStream stream = new FileOutputStream(inputFileName)) {
+             stream.write(Base64.decodeBase64(audioFileView.getSofaDataString()));
+         } catch (IOException e) {
+             e.printStackTrace();
+         }
+     }
+
+     File inputFile = new File(inputFileName);
+     inputFile.deleteOnExit();
+
+     executeFFMpeg(inputFile.getAbsolutePath(), outputFileName, audioToken.getTimeStart(), audioToken.getTimeEnd());
+
+     inputFile.delete();
+
+     File outputFile = new File(outputFileName);
+     outputFile.deleteOnExit();
+
+     return outputFile;
+ }
+
+ /**
+  * Writes the complete audio to a temporary file named {@code temp_<view name>} (unless that
+  * file already exists), runs all given cut commands on it and deletes the temporary file again.
+  *
+  * @param audioTokenCas The view in which the audio tokens are stored; used to auto-detect the audio view
+  * @param audioFileView The view containing the entire audio file in its sofa string (If null, tries to auto-detect)
+  * @param commands      One argument fragment per snippet, as built by the caller
+  * @throws CASException
+  */
+ private static void getEveryAudioSegment(JCas audioTokenCas, JCas audioFileView, List<String> commands) throws CASException {
+
+     if (audioFileView == null)
+         audioFileView = findAudioView(audioTokenCas);
+
+     String inputFileName = "temp_" + audioFileView.getViewName();
+
+     if (!new File(inputFileName).exists()) {
+         // Convert encoded string to file; the stream is closed even if decoding or writing fails.
+         try (OutputStream stream = new FileOutputStream(inputFileName)) {
+             stream.write(Base64.decodeBase64(audioFileView.getSofaDataString()));
+         } catch (IOException e) {
+             e.printStackTrace();
+         }
+     }
+
+     File inputFile = new File(inputFileName);
+     inputFile.deleteOnExit();
+
+     executeFFMpeg(inputFile.getAbsolutePath(), commands);
+     inputFile.delete();
+ }
+
+ /**
+  * Tries to find the view which contains the entire audio file in its sofa string.
+  * <p>
+  * A view qualifies if its sofa MIME type starts with {@code audio/}. Views without a MIME
+  * type are skipped.
+  *
+  * @param fromJCas JCas to search the views in
+  * @return the first view with an audio MIME type, or {@code fromJCas} itself if there is none
+  * @throws CASException
+  */
+ public static JCas findAudioView(JCas fromJCas) throws CASException {
+     Iterator<JCas> iter = fromJCas.getViewIterator();
+
+     while (iter.hasNext()) {
+         JCas view = iter.next();
+         String mimeType = view.getSofaMimeType();
+
+         // The MIME type is null for views whose sofa was set without one.
+         if (mimeType != null && mimeType.startsWith("audio/")) {
+             return view;
+         }
+     }
+
+     return fromJCas;
+ }
+
+
+ /**
+  * Converts the video covered by each annotation of the given type into its own "mp4" snippet.
+  * The view holding the complete video file is auto-detected.
+  *
+  * @param audioTokenView  The view in which the audio tokens are stored
+  * @param annotationClass The annotation the covered elements are derived from
+  * @return List of files, each file containing the video content.
+  * @throws CASException
+  */
+ public static List<File> getAllCoveredVideo(JCas audioTokenView, Class<? extends Annotation> annotationClass) throws CASException {
+     // null lets the callee auto-detect the video view.
+     return getAllCoveredVideo(audioTokenView, null, annotationClass);
+ }
+
+ /**
+  * Converts the video covered by each annotation of the given type into its own "mp4" snippet.
+  *
+  * @param audioTokenView  The view in which the audio tokens are stored
+  * @param videoFileView   The view containing the entire video file in its sofa string (If null, tries to auto-detect)
+  * @param annotationClass The annotation the covered elements are derived from
+  * @return List of files, each file containing the video content.
+  * @throws CASException
+  */
+ public static List<File> getAllCoveredVideo(JCas audioTokenView, JCas videoFileView, Class<? extends Annotation> annotationClass) throws CASException {
+     // Arguments are forwarded in declaration order: token view first, video view second.
+     return getAllCoveredVideo(audioTokenView, videoFileView, annotationClass, "mp4");
+ }
+
+    /**
+     * Converts every annotation of the given type into its own video snippet.
+     * <p>
+     * For each annotation the time span of all overlapping {@link AudioToken}s is determined; annotations
+     * without such a token are skipped. All snippets are cut with a single ffmpeg invocation and are
+     * registered for deletion on JVM exit.
+     *
+     * @param audioTokenView The view in which the audio tokens are stored
+     * @param videoFileView The view containing the entire video file in its sofa string (If null, tries to auto-detect)
+     * @param annotationClass The annotation the covered elements are derived from
+     * @param targetFormat File format for the output file. (like "mp4" or "webm")
+     * @return List of files, each file containing the video content.
+     * @throws CASException
+     */
+    public static List<File> getAllCoveredVideo(JCas audioTokenView, JCas videoFileView, Class<? extends Annotation> annotationClass, String targetFormat) throws CASException {
+
+        if (videoFileView == null) {
+            videoFileView = findVideoView(audioTokenView);
+        }
+
+        List<File> files = new ArrayList<>();
+        List<String> commands = new LinkedList<>();
+
+        // Sentinel marking "no overlapping token seen yet"
+        final float noStart = Float.MAX_VALUE;
+
+        for (Annotation annotation : JCasUtil.select(audioTokenView, annotationClass)) {
+
+            float startTime = noStart;
+            float endTime = 0;
+
+            for (AudioToken token : JCasUtil.selectOverlapping(AudioToken.class, annotation)) {
+                if (token.getTimeStart() < startTime) {
+                    startTime = token.getTimeStart();
+                }
+
+                if (token.getTimeEnd() > endTime) {
+                    endTime = token.getTimeEnd();
+                }
+            }
+
+            if (startTime == noStart) {
+                continue;
+            }
+
+            // Resolve the name once so the ffmpeg argument and the returned file cannot diverge
+            String outputName = getOutputName(audioTokenView, annotation, targetFormat);
+
+            // -ss: seeking (skipping forward x seconds), -t: duration
+            commands.add(String.format("-ss %s -t %s %s", startTime, endTime - startTime, outputName));
+
+            File file = new File(outputName);
+            file.deleteOnExit();
+            files.add(file);
+        }
+
+        // Nothing to cut: do not decode the whole video and do not call ffmpeg without any output
+        if (commands.isEmpty()) {
+            return files;
+        }
+
+        MultimodalUtil.getEveryVideoSegment(videoFileView, commands);
+
+        return files;
+    }
+
+
+    /**
+     * Cuts the time span of a single audio token out of the video and stores it as an "mp4" file.
+     *
+     * @param videoFileView The view containing the entire video file in its sofa string
+     * @param audioToken The audio token whose start and end time delimit the snippet
+     * @return The file the video snippet is written to
+     */
+    public static File getCoveredVideo(JCas videoFileView, AudioToken audioToken){
+        final String defaultFormat = "mp4";
+        return getCoveredVideo(videoFileView, audioToken, defaultFormat);
+    }
+
+    /**
+     * Cuts the time span of a single audio token out of the video.
+     * <p>
+     * The Base64 encoded sofa string of the view is decoded into the file {@code temp_<viewName>} in the
+     * working directory unless that file already exists; ffmpeg then writes the snippet. Both files are
+     * registered for deletion on JVM exit.
+     *
+     * @param videoFileView The view containing the entire video file in its sofa string (must not be null)
+     * @param audioToken The audio token whose start and end time delimit the snippet
+     * @param targetFormat File format for the output file. (like "mp4" or "webm")
+     * @return The file the snippet is written to; it may not exist if decoding or ffmpeg failed
+     */
+    public static File getCoveredVideo(JCas videoFileView, AudioToken audioToken, String targetFormat){
+
+        String outputFileName = getOutputName(videoFileView, audioToken, targetFormat);
+        File outputFile = new File(outputFileName);
+        outputFile.deleteOnExit();
+
+        File inputFile = new File("temp_" + videoFileView.getViewName());
+        inputFile.deleteOnExit();
+
+        if (!inputFile.exists()) {
+            // Convert encoded string to file; try-with-resources closes the stream on every path
+            try (OutputStream stream = new FileOutputStream(inputFile)) {
+                stream.write(Base64.decodeBase64(videoFileView.getSofaDataString()));
+            } catch (IOException e) {
+                e.printStackTrace();
+                // Do not hand a missing or truncated file to ffmpeg and do not leave it behind,
+                // otherwise the exists() check above would reuse the broken file on the next call
+                inputFile.delete();
+                return outputFile;
+            }
+        }
+
+        executeFFMpeg(inputFile.getAbsolutePath(), outputFileName, audioToken.getTimeStart(), audioToken.getTimeEnd());
+
+        return outputFile;
+    }
+
+    /**
+     * Cuts all requested segments out of the video stored in a view with a single ffmpeg invocation.
+     * <p>
+     * The Base64 encoded sofa string of the view is decoded into the file {@code temp_<viewName>} in the
+     * working directory unless that file already exists. The decoded file is kept until JVM exit.
+     *
+     * @param videoViewCas view holding the Base64 encoded video file
+     * @param commands     ffmpeg output arguments, one space separated entry per segment
+     */
+    private static void getEveryVideoSegment(JCas videoViewCas, List<String> commands){
+
+        File inputFile = new File("temp_" + videoViewCas.getViewName());
+        inputFile.deleteOnExit();
+
+        if (!inputFile.exists()) {
+            // Convert encoded string to file; try-with-resources closes the stream on every path
+            try (OutputStream stream = new FileOutputStream(inputFile)) {
+                stream.write(Base64.decodeBase64(videoViewCas.getSofaDataString()));
+            } catch (IOException e) {
+                e.printStackTrace();
+                // Do not hand a missing or truncated file to ffmpeg and do not leave it behind,
+                // otherwise the exists() check above would reuse the broken file on the next call
+                inputFile.delete();
+                return;
+            }
+        }
+
+        executeFFMpeg(inputFile.getAbsolutePath(), commands);
+    }
+
+    /**
+     * Tries to find the view which contains the entire video file in its sofa string.
+     *
+     * @param fromJCas JCas to search the views in
+     * @return the first view whose sofa MIME type starts with {@code video/}, or {@code fromJCas}
+     *         itself if no view matches
+     * @throws CASException if the view iterator cannot be obtained
+     */
+    public static JCas findVideoView(JCas fromJCas) throws CASException {
+        Iterator<JCas> iter = fromJCas.getViewIterator();
+
+        while (iter.hasNext()) {
+            JCas view = iter.next();
+            String mimeType = view.getSofaMimeType();
+
+            // A view without sofa data reports no MIME type; skip it instead of failing with a NullPointerException
+            if (mimeType != null && mimeType.startsWith("video/")) {
+                return view;
+            }
+        }
+
+        return fromJCas;
+    }
+
+    /**
+     * Cuts one segment out of a media file by running the {@code ffmpeg} command and waiting for it to finish.
+     * The output of the process is forwarded to the output streams of this JVM.
+     *
+     * @param absoluteInputPath path of the media file to read
+     * @param output            name of the file ffmpeg writes the segment to
+     * @param startTime         start of the segment in seconds
+     * @param endTime           end of the segment in seconds
+     */
+    private static void executeFFMpeg(String absoluteInputPath, String output, float startTime, float endTime){
+        // -ss: seeking (skipping forward x seconds)
+        // -t: duration
+        // The option is "-ss" without a trailing comma; "-ss," is not an option ffmpeg recognises
+        ProcessBuilder pb = new ProcessBuilder(
+                "ffmpeg",
+                "-ss", Float.toString(startTime),
+                "-t", Float.toString(endTime - startTime),
+                "-i", absoluteInputPath,
+                output);
+
+        pb.redirectOutput(ProcessBuilder.Redirect.INHERIT);
+        pb.redirectError(ProcessBuilder.Redirect.INHERIT);
+
+        try {
+            Process p = pb.start();
+            p.waitFor();
+        } catch (IOException e) {
+            e.printStackTrace();
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag so that callers can still observe the interruption
+            Thread.currentThread().interrupt();
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Runs the {@code ffmpeg} command once for one input file and any number of outputs and waits for
+     * it to finish. The output of the process is forwarded to the output streams of this JVM.
+     *
+     * @param absoluteInputPath path of the media file to read
+     * @param commands          output arguments (e.g. {@code "-ss <start> -t <duration> <file>"}); every
+     *                          entry is split on single spaces and appended to the command line
+     */
+    private static void executeFFMpeg(String absoluteInputPath, List<String> commands){
+        List<String> arguments = new ArrayList<>();
+        arguments.add("ffmpeg");
+        arguments.add("-i");
+        arguments.add(absoluteInputPath);
+
+        // -ss: seeking (skipping forward x seconds)
+        // -t: duration
+        for (String outputCommand : commands) {
+            // NOTE(review): splitting on spaces breaks output names that contain a space
+            // (e.g. a document id with blanks) - confirm that callers never produce such names
+            arguments.addAll(Arrays.asList(outputCommand.split(" ")));
+        }
+
+        ProcessBuilder pb = new ProcessBuilder(arguments);
+        pb.redirectOutput(ProcessBuilder.Redirect.INHERIT);
+        pb.redirectError(ProcessBuilder.Redirect.INHERIT);
+
+        try {
+            Process p = pb.start();
+            p.waitFor();
+        } catch (IOException e) {
+            e.printStackTrace();
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag so that callers can still observe the interruption
+            Thread.currentThread().interrupt();
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Writes every {@link SubImage} of the CAS to its own image file without overriding the file extension.
+     *
+     * @param jCas CAS containing the {@link SubImage} annotations
+     * @return the result of {@link #getSubImages(JCas, String)} called with a {@code null} extension
+     */
+    public static List getSubImages(JCas jCas){
+        final String noOverride = null;
+        return getSubImages(jCas, noOverride);
+    }
+
+    /**
+     * Cuts every {@link SubImage} of the CAS out of its parent image and writes it to its own file.
+     * <p>
+     * The polygon described by the sub image coordinates is used as clip, so pixels of the bounding box
+     * that lie outside of the polygon are not painted. Output files are registered for deletion on JVM
+     * exit. Sub images whose parent cannot be decoded, whose polygon is empty or whose format cannot be
+     * written are skipped.
+     *
+     * @param jCas              CAS containing the {@link SubImage} annotations
+     * @param overrideExtension image format to write (with or without leading dot), or {@code null} to
+     *                          derive it from the probed content type of the parent image ("jpg" as fallback)
+     * @return the files that were written, one per successfully processed sub image
+     */
+    public static List<File> getSubImages(JCas jCas, String overrideExtension) {
+
+        List<File> subImages = new ArrayList<>();
+
+        for (SubImage subImage : JCasUtil.select(jCas, SubImage.class)) {
+            byte[] imageBytes = Base64.decodeBase64(subImage.getParent().getSrc());
+            try {
+                BufferedImage bImage = ImageIO.read(new ByteArrayInputStream(imageBytes));
+                if (bImage == null) {
+                    // ImageIO found no reader for the data, so there is nothing to cut from
+                    continue;
+                }
+
+                Polygon polygon = new Polygon();
+
+                for (int i = 0; i < subImage.getCoordinates().size(); i++) {
+                    Coordinate coordinate = subImage.getCoordinates().get(i);
+                    polygon.addPoint(coordinate.getX(), coordinate.getY());
+                }
+
+                Rectangle bounds = polygon.getBounds();
+                if (bounds.width <= 0 || bounds.height <= 0) {
+                    // BufferedImage rejects empty dimensions with an IllegalArgumentException
+                    continue;
+                }
+
+                BufferedImage bSubImage = new BufferedImage(bounds.width, bounds.height, BufferedImage.TYPE_INT_RGB);
+
+                Graphics2D graphics = bSubImage.createGraphics();
+                try {
+                    // Move the polygon into the coordinate system of the cut-out before clipping
+                    polygon.translate(-bounds.x, -bounds.y);
+
+                    graphics.setClip(polygon);
+                    graphics.drawImage(bImage, -bounds.x, -bounds.y, null);
+                } finally {
+                    // Release the native resources held by the graphics context
+                    graphics.dispose();
+                }
+
+                // Write the raw bytes to a file so that the content type can be probed
+                File tempInputFile = new File("tempInputImage");
+                tempInputFile.deleteOnExit();
+                try (OutputStream stream = new FileOutputStream(tempInputFile)) {
+                    stream.write(imageBytes);
+                }
+
+                String mimeType = Files.probeContentType(tempInputFile.toPath());
+                String subType;
+
+                if (overrideExtension == null) {
+                    if (mimeType != null && !mimeType.isEmpty()) {
+                        subType = mimeType.split("/")[1];
+                    } else {
+                        subType = "jpg";
+                    }
+                } else if (overrideExtension.startsWith(".") && overrideExtension.length() > 1) {
+                    subType = overrideExtension.substring(1);
+                } else {
+                    subType = overrideExtension;
+                }
+
+                File outputFile = new File(getOutputName(jCas, subImage, subType));
+                outputFile.deleteOnExit();
+
+                // ImageIO.write returns false when no writer exists for the format; only report written files
+                if (ImageIO.write(bSubImage, subType, outputFile)) {
+                    subImages.add(outputFile);
+                }
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        }
+
+        return subImages;
+    }
+
+    /**
+     * Builds the output file name for an audio token:
+     * {@code [<documentId>_]<tokenId>_<timeStart>-<timeEnd>.<format>}.
+     * The document id prefix is only present when the CAS carries {@link DocumentMetaData}.
+     * The resulting name is also printed to standard output.
+     *
+     * @param jCas       CAS that is checked for document meta data
+     * @param audioToken token providing the id and the time span
+     * @param format     file extension, with or without leading dot
+     * @return the file name
+     */
+    private static String getOutputName(JCas jCas, AudioToken audioToken, String format){
+        String extension = format.startsWith(".") ? format.substring(1) : format;
+
+        String prefix = JCasUtil.select(jCas, DocumentMetaData.class).size() > 0
+                ? DocumentMetaData.get(jCas).getDocumentId() + "_"
+                : "";
+
+        String name = prefix + audioToken._id() + "_" + audioToken.getTimeStart() + "-" + audioToken.getTimeEnd() + "." + extension;
+
+        System.out.println("OUTPUT FILE: " + name);
+        return name;
+    }
+
+    /**
+     * Builds the output file name for an annotation:
+     * {@code [<documentId>_]<annotationId>_<begin>-<end>.<format>}.
+     * The document id prefix is only present when the CAS carries {@link DocumentMetaData}.
+     *
+     * @param jCas       CAS that is checked for document meta data
+     * @param annotation annotation providing the id and the offsets
+     * @param format     file extension, with or without leading dot
+     * @return the file name
+     */
+    private static String getOutputName(JCas jCas, Annotation annotation, String format){
+        String extension = format.startsWith(".") ? format.substring(1) : format;
+
+        String prefix = JCasUtil.select(jCas, DocumentMetaData.class).size() > 0
+                ? DocumentMetaData.get(jCas).getDocumentId() + "_"
+                : "";
+
+        return prefix + annotation._id() + "_" + annotation.getBegin() + "-" + annotation.getEnd() + "." + extension;
+    }
+
+    /**
+     * Builds the output file name for a sub image:
+     * {@code [<documentId>_]<parentId>_<subImageId>.<format>}.
+     * The document id prefix is only present when the CAS carries {@link DocumentMetaData}.
+     * Unlike the other overloads, the format is used as given (a leading dot is not removed).
+     *
+     * @param jCas     CAS that is checked for document meta data
+     * @param subImage sub image providing its own id and the id of its parent image
+     * @param format   file extension without leading dot
+     * @return the file name
+     */
+    private static String getOutputName(JCas jCas, SubImage subImage, String format){
+        String prefix = JCasUtil.select(jCas, DocumentMetaData.class).size() > 0
+                ? DocumentMetaData.get(jCas).getDocumentId() + "_"
+                : "";
+
+        return prefix + subImage.getParent()._id() + "_" + subImage._id() + "." + format;
+    }
+
+}
diff --git a/src/test/java/TestDUUI.java b/src/test/java/TestDUUI.java
index bef9a138..e8076dcb 100644
--- a/src/test/java/TestDUUI.java
+++ b/src/test/java/TestDUUI.java
@@ -1,11 +1,11 @@
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.io.FileUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.SerialFormat;
import org.apache.uima.cas.impl.XmiCasSerializer;
-import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
@@ -30,22 +30,30 @@
import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIPipelineDescription;
import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.*;
import org.texttechnologylab.DockerUnifiedUIMAInterface.io.AsyncCollectionReader;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.io.DUUIAsynchronousProcessor;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.io.DUUICollectionReader;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.io.reader.DUUIMultimodalCollectionReader;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.io.reader.DUUIYouTubeReader;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.io.writer.AudioSegmentWriter;
import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaCommunicationLayer;
import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext;
import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaSandbox;
import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.LuaConsts;
import org.texttechnologylab.DockerUnifiedUIMAInterface.pipeline_storage.DUUIMockStorageBackend;
import org.texttechnologylab.DockerUnifiedUIMAInterface.pipeline_storage.sqlite.DUUISqliteStorageBackend;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.tools.MultimodalUtil;
+import org.texttechnologylab.annotation.type.*;
import org.xml.sax.SAXException;
import javax.script.*;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.IOException;
+import java.io.*;
import java.net.URISyntaxException;
+import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
+import java.util.HashSet;
+import java.util.Set;
import java.util.stream.Collectors;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
@@ -821,4 +829,370 @@ public void kubernetesTest() throws Exception {
composer.run(pCorpusReader, "sentence");
}
+ /**
+ * Manual integration test: loads a local video into a CAS, runs the "annotheia" and spaCy Docker
+ * components plus an XmiWriter on it and finally cuts the video into one snippet per Sentence.
+ * Depends on the local file D:/DUUIVideos/read/TBBT.mp4 (returns early if it is missing), on the
+ * classpath resource hf_key.txt and on the output directory C:/test.
+ */
+ @Test
+ public void differentViewsTest() throws Exception{
+
+ // Read the Hugging Face key from the test resources
+ // NOTE(review): getResource returns null when hf_key.txt is absent, which makes resource.toURI() throw
+ ClassLoader classLoader = getClass().getClassLoader();
+ URL resource = classLoader.getResource("hf_key.txt");
+
+ File keyFile = new File(resource.toURI());;
+
+ String content = "";
+ try {
+ // Appends the List#toString() form of the lines, i.e. "[line1, line2, ...]"
+ content += Files.readAllLines(keyFile.toPath(), StandardCharsets.UTF_8);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ // Strip the surrounding brackets of the list representation
+ // (the key is only used by the components that are commented out below)
+ String hfKey = content.substring(1, content.length() - 1);
+
+ JCas aCas = JCasFactory.createJCas();
+
+ // Store the Base64 encoded video as sofa string of the initial view
+ File videoFile = new File("D:/DUUIVideos/read/TBBT.mp4");
+ if (videoFile.exists()) {
+ String encoded = org.apache.commons.codec.binary.Base64.encodeBase64String(org.apache.commons.io.FileUtils.readFileToByteArray(videoFile));
+ String mimeType = Files.probeContentType(videoFile.toPath());
+ System.out.println(mimeType);
+ aCas.setSofaDataString(encoded, mimeType);
+ //aCas.setSofaDataString("https://www.youtube.com/watch?v=dEMzzgbm6Ow", mimeType);
+ }else{
+ System.out.println(videoFile.getAbsolutePath() + " not found");
+ return;
+ }
+
+ //aCas.setSofaDataString("https://www.youtube.com/watch?v=8qZYsYq_Ctw", "text/x-uri");
+
+ int iWorkers = 1;
+
+ DUUILuaContext ctx = new DUUILuaContext().withJsonLibrary();
+
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true) // skip the verification of all components
+ .withLuaContext(ctx) // set the Lua context defined above
+ .withWorkers(iWorkers); // give the composer a number of worker threads
+
+ DUUIUIMADriver uima_driver = new DUUIUIMADriver();
+ DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+
+ // Add the individual drivers to the composer
+ composer.addDriver(uima_driver, remoteDriver, dockerDriver);
+
+ aCas.setDocumentLanguage("de");
+
+ /*
+ composer.add(new DUUIDockerDriver.Component("duui-yt-dlp:latest") // YT Downloader
+ .withScale(iWorkers)
+ .withTargetView("video_view")
+ .withParameter("withTranscription", "true")
+ .build());
+
+ composer.add(new DUUIDockerDriver.Component("duui-video-to-audio:latest") // Video to audio
+ .withScale(iWorkers)
+ .withSourceView("video_view")
+ .withTargetView("audio_view")
+ .build());
+
+ composer.add(new DUUIDockerDriver.Component("duui-whisper:latest") // Audio to text
+ .withScale(iWorkers)
+ .withSourceView("audio_view")
+ .withTargetView("text_view")
+ .build());
+
+ composer.add(new DUUIDockerDriver.Component("duui-pyannote:latest") // Audio to speaker
+ .withScale(iWorkers)
+ .withSourceView("audio_view")
+ .withParameter("token", hfKey)
+ .withTargetView("text_view")
+ .build()); */
+
+ composer.add(new DUUIDockerDriver.Component("duui-annotheia:latest") // Annotheia
+ .withScale(iWorkers)
+ .withTargetView("text_view")
+ .withName("annotheia")
+ //.withRunningAfterDestroy(true)
+ //.withParameter("device", "cuda")
+ .build());
+
+ composer.add(new DUUIDockerDriver.Component("duui-spacy:latest") // Spacy
+ .withScale(iWorkers)
+ .withView("text_view")
+ .withParameter("use_existing_sentences", "false")
+ .withParameter("use_existing_tokens", "false")
+ .build());
+
+ /*composer.add(new DUUIRemoteDriver.Component("http://localhost:9717") // Audio to speaker
+ .withScale(iWorkers)
+ .withSourceView("audio_view")
+ .withTargetView("text_view")
+ .withParameter("token", hfKey)
+ .withParameter("device", "cuda")
+ .build());
+ */
+ // Serialize the processed CAS as GZIP compressed XMI
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, "C:/test/temp",
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"))
+ .build());
+
+ composer.run(aCas);
+
+ // Cut one mp4 snippet per Sentence of "text_view" out of the video in the initial view
+ // and move the resulting files to C:/test
+ MultimodalUtil.getAllCoveredVideo(aCas.getView("text_view"), aCas, Sentence.class, "mp4").forEach(file -> {
+ try {
+ FileUtils.moveFile(new File(file.getAbsolutePath()), new File("C:/test/" + file.getName()));
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ );
+ }
+
+ /**
+ * Manual integration test: reads the mp4 files below D:/DUUIVideos/read with a
+ * DUUIMultimodalCollectionReader and processes them with a remote component on
+ * http://localhost:9717 followed by an XmiWriter. The YouTube reader variant is commented out.
+ * Depends on the classpath resource hf_key.txt.
+ */
+ @Test
+ public void youtubeReaderTest() throws Exception{
+
+ // Read the Hugging Face key from the test resources
+ // NOTE(review): getResource returns null when hf_key.txt is absent, which makes resource.toURI() throw
+ ClassLoader classLoader = getClass().getClassLoader();
+ URL resource = classLoader.getResource("hf_key.txt");
+
+ File file = new File(resource.toURI());;
+
+ String content = "";
+ try {
+ // Appends the List#toString() form of the lines, i.e. "[line1, line2, ...]"
+ content += Files.readAllLines(file.toPath(), StandardCharsets.UTF_8);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ // Strip the surrounding brackets of the list representation
+ // (the key is only referenced by a parameter that is commented out below)
+ String hfKey = content.substring(1, content.length() - 1);
+
+ //CasIOUtils.save(aCas.getCas(), new FileOutputStream(new File("/tmp/audiotest.xmi")), SerialFormat.XMI_1_1);
+ int iWorkers = 1;
+
+ DUUILuaContext ctx = new DUUILuaContext().withJsonLibrary();
+
+ // Instantiate the composer with some parameters
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true) // skip the verification of all components
+ .withLuaContext(ctx) // set the Lua context defined above
+ .withWorkers(iWorkers); // give the composer a number of worker threads
+
+ DUUIUIMADriver uima_driver = new DUUIUIMADriver();
+ DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver();
+
+
+ // API key redacted: credentials must not be committed, not even in commented-out code
+ //DUUIYouTubeReader ytReader = new DUUIYouTubeReader("https://www.youtube.com/@Jules1/videos", "<YOUTUBE_API_KEY>");
+ //DUUIYouTubeReader ytReader = new DUUIYouTubeReader("https://www.youtube.com/watch?v=SV6NJ6PcGBs&list=PLh19WWr20745LHdlDAg2P_JT7I2Wx6axP", "<YOUTUBE_API_KEY>");
+ DUUIMultimodalCollectionReader multiReader = new DUUIMultimodalCollectionReader("D:/DUUIVideos/read", "mp4");
+
+ Set readers = new HashSet<>();
+
+ //readers.add(ytReader);
+ readers.add(multiReader);
+
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(readers);
+
+ // Add the individual drivers to the composer
+ composer.addDriver(uima_driver, remoteDriver);
+
+ /*composer.add(new DUUIRemoteDriver.Component("http://localhost:9713") // Youtube downloader
+ .withScale(iWorkers)
+ .withTargetView("video_view")
+ .withParameter("withTranscription", "false")
+ .build());
+
+ composer.add(new DUUIRemoteDriver.Component("http://localhost:9714") // Video to audio
+ .withScale(iWorkers)
+ .withSourceView("video_view")
+ .withTargetView("audio_view")
+ .build());
+
+ composer.add(new DUUIRemoteDriver.Component("http://localhost:9715") // Audio to text
+ .withScale(iWorkers)
+ .withSourceView("audio_view")
+ .withTargetView("text_view")
+ .withParameter("device", "cuda")
+ .build());*/
+
+ composer.add(new DUUIRemoteDriver.Component("http://localhost:9717") // Annotheia
+ .withScale(iWorkers)
+ .withTargetView("text_view")
+ //.withParameter("token", hfKey)
+ .withParameter("device", "cuda")
+ .build());
+
+ /*composer.add(new DUUIRemoteDriver.Component("http://localhost:9720") // Spacy
+ .withScale(iWorkers)
+ .withSourceView("text_view")
+ .withTargetView("text_view")
+ .withParameter("use_existing_sentences", "false")
+ .withParameter("use_existing_tokens", "false")
+ .build());*/
+
+ // Serialize the processed CAS as GZIP compressed XMI
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, "C:/test/temp",
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"))
+ .build());
+
+ /*composer.add(new DUUIUIMADriver.Component(createEngineDescription(AudioSegmentWriter.class,
+ AudioSegmentWriter.PARAM_TARGET_LOCATION, "C:/test",
+ AudioSegmentWriter.PARAM_AUDIO_CONTENT_VIEW, "audio_view",
+ AudioSegmentWriter.PARAM_AUDIO_TOKEN_VIEW, "text_view"))
+ .build()); */
+
+
+ // Process everything the readers deliver under the run name "test"
+ composer.run(processor, "test");
+
+
+ //composer.run(aCas);
+ }
+
+ /**
+ * Manual integration test: reads the "gz" files below D:/DUUIVideos/read with a
+ * DUUIMultimodalCollectionReader and passes them to an AudioSegmentWriter that reads the audio
+ * tokens from "text_view" and writes to C:/test. All other components are commented out.
+ */
+ @Test
+ public void multimodalFileReaderTest() throws Exception{
+
+
+ int iWorkers = 1;
+
+ DUUILuaContext ctx = new DUUILuaContext().withJsonLibrary();
+
+ // Instantiate the composer with some parameters
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true) // skip the verification of all components
+ .withLuaContext(ctx) // set the Lua context defined above
+ .withWorkers(iWorkers); // give the composer a number of worker threads
+
+ DUUIUIMADriver uima_driver = new DUUIUIMADriver();
+ DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+
+
+ DUUIMultimodalCollectionReader multiReader = new DUUIMultimodalCollectionReader("D:/DUUIVideos/read", "gz");
+
+ Set readers = new HashSet<>();
+
+ readers.add(multiReader);
+
+ DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(readers);
+
+ // Add the individual drivers to the composer
+ composer.addDriver(uima_driver, remoteDriver, dockerDriver);
+ /*
+ composer.add(new DUUIRemoteDriver.Component("http://localhost:9713") // Youtube downloader
+ .withScale(iWorkers)
+ .withTargetView("video_view")
+ .withParameter("withTranscription", "false")
+ .build());
+
+ composer.add(new DUUIRemoteDriver.Component("http://localhost:9714") // Video to audio
+ .withScale(iWorkers)
+ .withSourceView("video_view")
+ .withTargetView("audio_view")
+ .build());
+
+ composer.add(new DUUIRemoteDriver.Component("http://localhost:9715") // Audio to text
+ .withScale(iWorkers)
+ .withSourceView("audio_view")
+ .withTargetView("text_view")
+ .withParameter("device", "cuda")
+ .build());
+
+ composer.add(new DUUIRemoteDriver.Component("http://localhost:9717") // Audio to speaker
+ .withScale(iWorkers)
+ .withSourceView("audio_view")
+ .withTargetView("text_view")
+ .withParameter("token", hfKey)
+ .withParameter("device", "cuda")
+ .build());*/
+
+ /* composer.add(new DUUIDockerDriver.Component("duui-spacy:latest") // Spacy
+ .withScale(iWorkers)
+ .withView("text_view")
+ .withRunningAfterDestroy(true)
+ .withParameter("use_existing_sentences", "false")
+ .withParameter("use_existing_tokens", "false")
+ .build());
+
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, "C:/test/temp",
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"))
+ .build());*/
+
+ // Only active component: AudioSegmentWriter configured with "text_view" as audio token view
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(AudioSegmentWriter.class,
+ AudioSegmentWriter.PARAM_TARGET_LOCATION, "C:/test",
+ //AudioSegmentWriter.PARAM_AUDIO_CONTENT_VIEW, "text_view",
+ AudioSegmentWriter.PARAM_AUDIO_TOKEN_VIEW, "text_view"))
+ .build());
+
+
+ // Process everything the reader delivers under the run name "test"
+ composer.run(processor, "test");
+
+ //composer.run(aCas);
+ }
+
+
+
+ /**
+ * Manual integration test: loads a local image into a CAS, runs the "duui-yolo" Docker component
+ * and an XmiWriter on it and finally writes every detected SubImage to its own file in C:/test.
+ * Depends on the local file D:/DUUIVideos/read/India_Street.jpg (returns early if it is missing).
+ */
+ @Test
+ public void multimodalImageCutterTest() throws Exception{
+
+ JCas aCas = JCasFactory.createJCas();
+
+ // Store the Base64 encoded image as sofa string of the initial view
+ // (the variable is called videoFile but points to a jpg image)
+ File videoFile = new File("D:/DUUIVideos/read/India_Street.jpg");
+ if (videoFile.exists()) {
+ String encoded = org.apache.commons.codec.binary.Base64.encodeBase64String(org.apache.commons.io.FileUtils.readFileToByteArray(videoFile));
+ String mimeType = Files.probeContentType(videoFile.toPath());
+ System.out.println(mimeType);
+ aCas.setSofaDataString(encoded, mimeType);
+ }else{
+ System.out.println(videoFile.getAbsolutePath() + " not found");
+ return;
+ }
+
+ int iWorkers = 1;
+
+ DUUILuaContext ctx = new DUUILuaContext().withJsonLibrary();
+
+ DUUIComposer composer = new DUUIComposer()
+ .withSkipVerification(true) // skip the verification of all components
+ .withLuaContext(ctx) // set the Lua context defined above
+ .withWorkers(iWorkers); // give the composer a number of worker threads
+
+ DUUIUIMADriver uima_driver = new DUUIUIMADriver();
+ DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver();
+ DUUIDockerDriver dockerDriver = new DUUIDockerDriver();
+
+ // Add the individual drivers to the composer
+ composer.addDriver(uima_driver, remoteDriver, dockerDriver);
+
+ composer.add(new DUUIDockerDriver.Component("duui-yolo:latest") // Image detection
+ .withScale(iWorkers)
+ .withRunningAfterDestroy(true)
+ .build());
+
+ // Serialize the processed CAS as GZIP compressed XMI
+ composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class,
+ XmiWriter.PARAM_TARGET_LOCATION, "C:/test/temp",
+ XmiWriter.PARAM_PRETTY_PRINT, true,
+ XmiWriter.PARAM_OVERWRITE, true,
+ XmiWriter.PARAM_VERSION, "1.1",
+ XmiWriter.PARAM_COMPRESSION, "GZIP"))
+ .build());
+
+ composer.run(aCas);
+
+ // Cut out every SubImage annotation and move the resulting files to C:/test
+ MultimodalUtil.getSubImages(aCas).forEach(file -> {
+ try {
+ FileUtils.moveFile(new File(file.getAbsolutePath()), new File("C:/test/" + file.getName()));
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ });
+
+ //composer.run(aCas);
+ }
+
}