diff --git a/pom.xml b/pom.xml index d7e7cb91..22ae8eb0 100644 --- a/pom.xml +++ b/pom.xml @@ -607,6 +607,21 @@ pom + + org.bytedeco + javacv + 1.5.7 + + + org.bytedeco + javacv-platform + 1.5.7 + + + org.bytedeco + ffmpeg-platform + 6.1.1-1.5.10 + org.jsoup jsoup diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIComposer.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIComposer.java index 6f9ad210..06a3c214 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIComposer.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIComposer.java @@ -450,6 +450,7 @@ public void run() { // TODO thread safety needed for here? DUUISegmentationStrategy segmentationStrategy = i.getSegmentationStrategy(); if (segmentationStrategy instanceof DUUISegmentationStrategyNone) { + i.getDriver().run(i.getUUID(), _jc, perf, composer); } else { segmentationStrategy.initialize(_jc); diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIFallbackCommunicationLayer.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIFallbackCommunicationLayer.java index cdd7c859..8f429aec 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIFallbackCommunicationLayer.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/DUUIFallbackCommunicationLayer.java @@ -1,6 +1,7 @@ package org.texttechnologylab.DockerUnifiedUIMAInterface; import org.apache.commons.compress.compressors.CompressorException; +import org.apache.uima.cas.CASException; import org.apache.uima.cas.impl.XmiCasDeserializer; import org.apache.uima.cas.impl.XmiCasSerializer; import org.apache.uima.jcas.JCas; @@ -15,10 +16,10 @@ import java.util.Map; public class DUUIFallbackCommunicationLayer implements IDUUICommunicationLayer { - public void serialize(JCas jc, ByteArrayOutputStream out, Map parameters) throws CompressorException, IOException, 
SAXException { + public void serialize(JCas jc, ByteArrayOutputStream out, Map parameters, String sourceView) throws CompressorException, IOException, SAXException, CASException { JSONObject obj = new JSONObject(); ByteArrayOutputStream arr = new ByteArrayOutputStream(); - XmiCasSerializer.serialize(jc.getCas(), null, arr); + XmiCasSerializer.serialize(jc.getView(sourceView).getCas(), null, arr); StringWriter writer = new StringWriter(); TypeSystemUtil.typeSystem2TypeSystemDescription(jc.getTypeSystem()).toXML(writer); @@ -34,7 +35,7 @@ public void serialize(JCas jc, ByteArrayOutputStream out, Map par out.write(obj.toString().getBytes(StandardCharsets.UTF_8)); } - public void deserialize(JCas jc, ByteArrayInputStream input) throws IOException, SAXException { + public void deserialize(JCas jc, ByteArrayInputStream input, String targetView) throws IOException, SAXException { String body = new String(input.readAllBytes(), Charset.defaultCharset()); JSONObject response = new JSONObject(body); if (response.has("cas") || response.has("error")) { diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/IDUUICommunicationLayer.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/IDUUICommunicationLayer.java index 51d92157..7d0da09f 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/IDUUICommunicationLayer.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/IDUUICommunicationLayer.java @@ -1,6 +1,7 @@ package org.texttechnologylab.DockerUnifiedUIMAInterface; import org.apache.commons.compress.compressors.CompressorException; +import org.apache.uima.cas.CASException; import org.apache.uima.jcas.JCas; import org.xml.sax.SAXException; @@ -12,6 +13,11 @@ * Interface for communication between the DUUI composer {@link DUUIComposer} and the components {@link org.texttechnologylab.DockerUnifiedUIMAInterface.driver.IDUUIDriverInterface}. 
*/ public interface IDUUICommunicationLayer { + + public void serialize(JCas jc, ByteArrayOutputStream out, Map parameters, String sourceView) throws CompressorException, IOException, SAXException, CASException; + + public void deserialize(JCas jc, ByteArrayInputStream input, String targetView) throws IOException, SAXException, CASException; + /** * Serializes a JCas to a byte array output stream by using the LUA script provided by the component. * @param jc Input JCas. diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/composer/DUUISegmentedWorker.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/composer/DUUISegmentedWorker.java index 37dc0e02..da41cd0c 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/composer/DUUISegmentedWorker.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/composer/DUUISegmentedWorker.java @@ -73,9 +73,11 @@ public void run() { try { DUUIPipelineDocumentPerformance perf = new DUUIPipelineDocumentPerformance(name, waitTimeEnd - waitTimeStart, jCas, trackErrorDocs); + pipelinePart.getDriver().run(pipelinePart.getUUID(), jCas, perf, null); // TODO!!!! @Daniel //pipelinePart.getDriver().run(pipelinePart.getUUID(), jCas, perf); + if (backend != null) { backend.addMetricsForDocument(perf); } diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIDockerDriver.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIDockerDriver.java index 3d6aa147..ad307928 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIDockerDriver.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIDockerDriver.java @@ -198,7 +198,7 @@ public static IDUUICommunicationLayer responsiveAfterTime(String url, JCas jc, i ByteArrayOutputStream stream = new ByteArrayOutputStream(); try { //TODO: Make this accept options to better check the instantiation! 
- layer.serialize(jc, stream, null); + layer.serialize(jc, stream, null, "_InitialView"); } catch (Exception e) { e.printStackTrace(); throw new Exception(format("The serialization step of the communication layer fails for implementing class %s", layer.getClass().getCanonicalName())); @@ -213,7 +213,7 @@ public static IDUUICommunicationLayer responsiveAfterTime(String url, JCas jc, i if (resp.statusCode() == 200) { ByteArrayInputStream inputStream = new ByteArrayInputStream(resp.body()); try { - layer.deserialize(jc, inputStream); + layer.deserialize(jc, inputStream, "_InitialView"); } catch (Exception e) { System.err.printf("Caught exception printing response %s\n", new String(resp.body(), StandardCharsets.UTF_8)); throw e; @@ -493,6 +493,8 @@ static class InstantiatedComponent implements IDUUIInstantiatedPipelineComponent private String _reg_username; private String _uniqueComponentKey; private Map _parameters; + private String _sourceView; + private String _targetView; private DUUIPipelineComponent _component; @@ -514,6 +516,8 @@ public void addComponent(IDUUIUrlAccessible access) { _component = comp; _image_name = comp.getDockerImageName(); _parameters = comp.getParameters(); + _targetView = comp.getTargetView(); + _sourceView = comp.getSourceView(); if (_image_name == null) { throw new InvalidParameterException("The image name was not set! 
This is mandatory for the DockerLocalDriver Class."); } @@ -585,6 +589,10 @@ public Map getParameters() { return _parameters; } + public String getSourceView() {return _sourceView; } + + public String getTargetView() {return _targetView; } + public boolean isWebsocket() { return _websocket; } @@ -602,6 +610,21 @@ public Component withParameter(String key, String value) { return this; } + public Component withView(String viewName) { + _component.withView(viewName); + return this; + } + + public Component withSourceView(String viewName) { + _component.withSourceView(viewName); + return this; + } + + public Component withTargetView(String viewName) { + _component.withTargetView(viewName); + return this; + } + public Component(String target) throws URISyntaxException, IOException { _component = new DUUIPipelineComponent(); _component.withDockerImageName(target); diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIKubernetesDriver.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIKubernetesDriver.java index 2be1c825..c253d8de 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIKubernetesDriver.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIKubernetesDriver.java @@ -439,6 +439,8 @@ static class InstantiatedComponent implements IDUUIInstantiatedPipelineComponent private int _scale; private boolean _withImageFetching; private Map _parameters; + private String _sourceView; + private String _targetView; private DUUIPipelineComponent _component; private final boolean _websocket; @@ -451,6 +453,8 @@ static class InstantiatedComponent implements IDUUIInstantiatedPipelineComponent _component = comp; _image_name = comp.getDockerImageName(); _parameters = comp.getParameters(); + _targetView = comp.getTargetView(); + _sourceView = comp.getSourceView(); if (_image_name == null) { throw new InvalidParameterException("The image name was not set! 
This is mandatory for the DockerLocalDriver Class."); } @@ -523,6 +527,10 @@ public Map getParameters() { return _parameters; } + public String getSourceView() {return _sourceView; } + + public String getTargetView() {return _targetView; } + @Override public String getUniqueComponentKey() { return _uniqueComponentKey; @@ -633,6 +641,21 @@ public Component withParameter(String key, String value) { return this; } + public Component withView(String viewName) { + _component.withView(viewName); + return this; + } + + public Component withSourceView(String viewName) { + _component.withSourceView(viewName); + return this; + } + + public Component withTargetView(String viewName) { + _component.withTargetView(viewName); + return this; + } + /** * Builds the component. * diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIPipelineComponent.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIPipelineComponent.java index b0e87e1f..ad8bf111 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIPipelineComponent.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIPipelineComponent.java @@ -65,6 +65,8 @@ public class DUUIPipelineComponent { private static String driverName = "driver"; private static String descriptionName = "description"; + private static String sourceView = "sourceView"; + private static String targetView = "targetView"; private String getVersion() throws URISyntaxException, IOException { ClassLoader classLoader = DUUIPipelineComponent.class.getClassLoader(); @@ -80,6 +82,7 @@ public DUUIPipelineComponent() throws URISyntaxException, IOException { _options = new HashMap<>(); _finalizedEncoded = null; _parameters = new HashMap<>(); + String version = getVersion(); if(version == null) { _options.put(versionInformation,"Unknown"); @@ -487,6 +490,22 @@ public DUUIPipelineComponent withParameter(String key, String value) { return this; } + 
public DUUIPipelineComponent withView(String viewName){ + withSourceView(viewName); + withTargetView(viewName); + return this; + } + + public DUUIPipelineComponent withSourceView(String viewName) { + _options.put(sourceView, viewName); + return this; + } + + public DUUIPipelineComponent withTargetView(String viewName) { + _options.put(targetView, viewName); + return this; + } + public static DUUIPipelineComponent fromJson(String json) throws URISyntaxException, IOException { JSONObject jobj = new JSONObject(json); @@ -560,6 +579,22 @@ public final Map getParameters() { return _parameters; } + public String getSourceView() { + String result = _options.get(sourceView); + if(result == null) { + return "_InitialView"; + } + return result; + } + + public String getTargetView() { + String result = _options.get(targetView); + if(result == null) { + return "_InitialView"; + } + return result; + } + public DUUIPipelineComponent clearParameters() { _parameters.clear(); return this; diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIRemoteDriver.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIRemoteDriver.java index b50442cf..4a8d9b97 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIRemoteDriver.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUIRemoteDriver.java @@ -81,6 +81,21 @@ public Component withParameter(String key, String value) { return this; } + public Component withView(String viewName) { + component.withView(viewName); + return this; + } + + public Component withSourceView(String viewName) { + component.withSourceView(viewName); + return this; + } + + public Component withTargetView(String viewName) { + component.withTargetView(viewName); + return this; + } + public Component withWebsocket(boolean b) { component.withWebsocket(b); return this; @@ -152,6 +167,8 @@ private static class InstantiatedComponent implements 
IDUUIInstantiatedPipelineC private ConcurrentLinkedQueue _components; private String _uniqueComponentKey; private Map _parameters; + private String _sourceView; + private String _targetView; private DUUIPipelineComponent _component; private boolean _websocket; private int _ws_elements; @@ -179,6 +196,8 @@ public InstantiatedComponent(DUUIPipelineComponent comp) { } _parameters = comp.getParameters(); + _targetView = comp.getTargetView(); + _sourceView = comp.getSourceView(); _uniqueComponentKey = ""; @@ -208,6 +227,10 @@ public Map getParameters() { return _parameters; } + public String getSourceView() {return _sourceView; } + + public String getTargetView() {return _targetView; } + public boolean isWebsocket() { return _websocket; } diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUISwarmDriver.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUISwarmDriver.java index a60f9226..b0773551 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUISwarmDriver.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/DUUISwarmDriver.java @@ -264,6 +264,8 @@ private static class InstantiatedComponent implements IDUUIInstantiatedPipelineC private final String _reg_password; private final String _reg_username; private final Map _parameters; + private String _sourceView; + private String _targetView; private DUUIPipelineComponent _component; @@ -275,6 +277,8 @@ private static class InstantiatedComponent implements IDUUIInstantiatedPipelineC } _parameters = comp.getParameters(); + _targetView = comp.getTargetView(); + _sourceView = comp.getSourceView(); _scale = comp.getScale(1); _constraints.addAll(comp.getConstraints()); _components = new ConcurrentLinkedQueue<>(); @@ -374,6 +378,9 @@ public Map getParameters() { return _parameters; } + public String getSourceView() {return _sourceView; } + + public String getTargetView() {return _targetView; } public 
Triplet getComponent() { long mutexStart = System.nanoTime(); @@ -412,6 +419,21 @@ public Component withParameter(String key, String value) { return this; } + public Component withView(String viewName) { + component.withView(viewName); + return this; + } + + public Component withSourceView(String viewName) { + component.withSourceView(viewName); + return this; + } + + public Component withTargetView(String viewName) { + component.withTargetView(viewName); + return this; + } + public Component withScale(int scale) { component.withScale(scale); return this; diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/IDUUIInstantiatedPipelineComponent.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/IDUUIInstantiatedPipelineComponent.java index d71513ad..54febe69 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/IDUUIInstantiatedPipelineComponent.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/driver/IDUUIInstantiatedPipelineComponent.java @@ -41,6 +41,8 @@ public interface IDUUIInstantiatedPipelineComponent { public void addComponent(IDUUIUrlAccessible item); public Map getParameters(); + public String getSourceView(); + public String getTargetView(); public String getUniqueComponentKey(); public static TypeSystemDescription getTypesystem(String uuid, IDUUIInstantiatedPipelineComponent comp) throws ResourceInitializationException { @@ -110,7 +112,7 @@ public static void process(JCas jc, IDUUIInstantiatedPipelineComponent comp, DUU } } - layer.serialize(viewJc,out,comp.getParameters()); + layer.serialize(viewJc,out,comp.getParameters(), comp.getSourceView()); // lua serialize call() byte[] ok = out.toByteArray(); @@ -147,7 +149,7 @@ public static void process(JCas jc, IDUUIInstantiatedPipelineComponent comp, DUU long deserializeStart = annotatorEnd; try { - layer.deserialize(viewJc, st); + layer.deserialize(viewJc, st, comp.getTargetView()); } catch(Exception 
e) { System.err.printf("Caught exception printing response %s\n",new String(resp.body(), StandardCharsets.UTF_8)); @@ -228,7 +230,7 @@ public static void process_handler(JCas jc, } } // lua serialize call() - layer.serialize(viewJc,out,comp.getParameters()); + layer.serialize(viewJc,out,comp.getParameters(), comp.getSourceView()); // ok is the message. byte[] ok = out.toByteArray(); @@ -265,7 +267,7 @@ public static void process_handler(JCas jc, * Merging results before deserializing. */ result = layer.merge(results); - layer.deserialize(finalViewJc, result); + layer.deserialize(finalViewJc, result, comp.getTargetView()); } catch(Exception e) { e.printStackTrace(); diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUIMultimodalCollectionReader.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUIMultimodalCollectionReader.java new file mode 100644 index 00000000..77981c06 --- /dev/null +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUIMultimodalCollectionReader.java @@ -0,0 +1,494 @@ +package org.texttechnologylab.DockerUnifiedUIMAInterface.io.reader; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.uima.cas.impl.XmiCasDeserializer; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.javaync.io.AsyncFiles; +import org.texttechnologylab.DockerUnifiedUIMAInterface.io.AsyncCollectionReader; +import org.texttechnologylab.DockerUnifiedUIMAInterface.io.DUUICollectionReader; +import 
org.texttechnologylab.DockerUnifiedUIMAInterface.monitoring.AdvancedProgressMeter; +import org.texttechnologylab.utilities.helper.StringUtils; +import org.xml.sax.SAXException; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Random; +import java.util.Scanner; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; + +public class DUUIMultimodalCollectionReader implements DUUICollectionReader { + + private String _path; + private ConcurrentLinkedQueue _filePaths; + private ConcurrentLinkedQueue _filePathsBackup; + private ConcurrentLinkedQueue _loadedFiles; + + private String _viewName; + + private int _initialSize; + private AtomicInteger _docNumber; + private long _maxMemory; + private AtomicLong _currentMemorySize; + + private boolean _addMetadata = true; + + private String _targetPath = null; + + private String _language = null; + + private AdvancedProgressMeter progress = null; + + private int debugCount = 25; + + private String targetLocation = null; + + public DUUIMultimodalCollectionReader(String folder, String ending) { + this(folder, ending, "_InitialView", 25, getRandomFromMode(null, -1), getSortFromMode(null), "", true, null, 0, "", null); + } + + public DUUIMultimodalCollectionReader(String folder, String ending, String viewName) { + this(folder, ending, viewName, 25, getRandomFromMode(null, -1), getSortFromMode(null), "", true, null, 0, "", null); + } + + public DUUIMultimodalCollectionReader(String folder, String ending, String viewName, int debugCount, int iRandom, boolean bSort, String savePath, boolean bAddMetadata, String language, int skipSmallerFiles, String targetLocation, String targetEnding) { + this.targetLocation = 
targetLocation; + _addMetadata = bAddMetadata; + _language = language; + _filePaths = new ConcurrentLinkedQueue<>(); + _loadedFiles = new ConcurrentLinkedQueue<>(); + _filePathsBackup = new ConcurrentLinkedQueue<>(); + _viewName = viewName; + + if (new File(savePath).exists() && savePath.length() > 0) { + File sPath = new File(savePath); + + String sContent = null; + try { + sContent = StringUtils.getContent(sPath); + } catch (IOException e) { + e.printStackTrace(); + } + String[] sSplit = sContent.split("\n"); + + for (String s : sSplit) { + _filePaths.add(s); + } + + } else { + File fl = new File(folder); + if (!fl.isDirectory()) { + throw new RuntimeException("The folder is not a directory!"); + } + + + _path = folder; + addFilesToConcurrentList(fl, ending, _filePaths); + + if (skipSmallerFiles > 0) { + _filePaths = skipBySize(_filePaths, skipSmallerFiles); + } + } + + + if (skipSmallerFiles > 0) { + _filePaths = skipBySize(_filePaths, skipSmallerFiles); + } + + if (bSort) { + _filePaths = sortBySize(_filePaths); + } + + if (bSort && iRandom > 0) { + System.out.println("Sorting and Random Selection is active, using the " + (iRandom > 0 ? 
"largest " : "smallest ") + Math.abs(iRandom) + " documents."); +// _filePaths = takeFirstOrLast(_filePaths, iRandom); + } else if (iRandom > 0) { + _filePaths = random(_filePaths, iRandom); + } + + if (savePath.length() > 0) { + File nFile = new File(savePath); + + if (!nFile.exists()) { + StringBuilder sb = new StringBuilder(); + _filePaths.forEach(f -> { + if (sb.length() > 0) { + sb.append("\n"); + } + sb.append(f); + }); + try { + StringUtils.writeContent(sb.toString(), nFile); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + // remove files that are already in the target location + // NOTE we do this after saving the file list, as we do not want to change anything but only avoid processing files multiple times + if (this.targetLocation != null) { + // _filePaths = removeIfInTarget(_filePaths, this.targetLocation, targetEnding, this._path, ending); + } + + _filePathsBackup.addAll(_filePaths); + + this.debugCount = debugCount; + + System.out.printf("Found %d files matching the pattern! 
\t Using Random: %d\n", _filePaths.size(), iRandom); + _initialSize = _filePaths.size(); + _docNumber = new AtomicInteger(0); + _currentMemorySize = new AtomicLong(0); + // 500 MB + _maxMemory = 500 * 1024 * 1024; + + progress = new AdvancedProgressMeter(_initialSize); + } + + private static int getRandomFromMode(AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE sampleMode, int sampleSize) { + if (sampleMode == AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE.SMALLEST) { + return sampleSize * -1; + } + return sampleSize; + } + + private static boolean getSortFromMode(AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE mode) { + if (mode == AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE.RANDOM) { + return false; + } + return true; + } + + public static void addFilesToConcurrentList(File folder, String ending, ConcurrentLinkedQueue paths) { + File[] listOfFiles = folder.listFiles(); + + for (int i = 0; i < listOfFiles.length; i++) { + if (listOfFiles[i].isFile()) { + if (listOfFiles[i].getName().endsWith(ending)) { + paths.add(listOfFiles[i].getPath().toString()); + } + } else if (listOfFiles[i].isDirectory()) { + addFilesToConcurrentList(listOfFiles[i], ending, paths); + } + } + + } + + public static ConcurrentLinkedQueue sortBySize(ConcurrentLinkedQueue paths) { + + ConcurrentLinkedQueue rQueue = new ConcurrentLinkedQueue(); + + rQueue.addAll(paths.stream().sorted((s1, s2) -> { + Long firstLength = new File(s1).length(); + Long secondLength = new File(s2).length(); + + return firstLength.compareTo(secondLength) * -1; + }).collect(Collectors.toList())); + + return rQueue; + + } + + /** + * Skips files smaller than skipSmallerFiles + * + * @param paths paths to files + * @param skipSmallerFiles skip files smaller than this value in bytes + * @return filtered paths to files + */ + public static ConcurrentLinkedQueue skipBySize(ConcurrentLinkedQueue paths, int skipSmallerFiles) { + ConcurrentLinkedQueue rQueue = new 
ConcurrentLinkedQueue<>(); + + System.out.println("Skip files smaller than " + skipSmallerFiles + " bytes"); + System.out.println(" Number of files before skipping: " + paths.size()); + + rQueue.addAll(paths + .stream() + .filter(s -> new File(s).length() >= skipSmallerFiles) + .collect(Collectors.toList()) + ); + + System.out.println(" Number of files after skipping: " + rQueue.size()); + + return rQueue; + } + + public static ConcurrentLinkedQueue random(ConcurrentLinkedQueue paths, int iRandom) { + + ConcurrentLinkedQueue rQueue = new ConcurrentLinkedQueue(); + + Random nRandom = new Random(iRandom); + + ArrayList sList = new ArrayList<>(); + sList.addAll(paths); + + Collections.shuffle(sList, nRandom); + + if (iRandom > sList.size()) { + rQueue.addAll(sList.subList(0, sList.size())); + } else { + rQueue.addAll(sList.subList(0, iRandom)); + } + + + return rQueue; + + } + + + public static String getSize(String sPath) { + return FileUtils.byteCountToDisplaySize(new File(sPath).length()); + } + + @Override + public AdvancedProgressMeter getProgress() { + return this.progress; + } + + @Override + public void getNextCas(JCas empty) { + ByteReadFuture future = _loadedFiles.poll(); + + byte[] bFile = null; + String result = null; + if (future == null) { + result = _filePaths.poll(); + if (result == null) return; + } else { + result = future.getPath(); + bFile = future.getBytes(); + long factor = 1; + if (result.endsWith(".gz") || result.endsWith(".xz")) { + factor = 10; + } + _currentMemorySize.getAndAdd(-factor * (long) bFile.length); + } + int val = _docNumber.addAndGet(1); + + progress.setDone(val); + progress.setLeft(_initialSize - val); + + if (_initialSize - progress.getCount() > debugCount) { + if (val % debugCount == 0 || val == 0) { + System.out.printf("%s: \t %s \t %s\n", progress, getSize(result), result); + } + } else { + System.out.printf("%s: \t %s \t %s\n", progress, getSize(result), result); + } + + if (bFile == null) { + try { + bFile = 
Files.readAllBytes(Path.of(result)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + try { + /* + if (result.endsWith(".xz")) { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + decodedFile = new CompressorStreamFactory().createCompressorInputStream(CompressorStreamFactory.XZ, new ByteArrayInputStream(file)); + } else if (result.endsWith(".gz")) { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + decodedFile = new CompressorStreamFactory().createCompressorInputStream(CompressorStreamFactory.GZIP, new ByteArrayInputStream(file)); + } else { + decodedFile = new ByteArrayInputStream(file); + } + */ + + empty.reset(); + + JCas mView; + try { + mView = empty.getView(_viewName); + + }catch (Exception e){ + mView = empty.createView(_viewName); + } + + var parts = result.split("\\."); + String fileExtension = parts[parts.length - 1]; + + File fFile = new File(result); + String mimeType = Files.probeContentType(fFile.toPath()); + + + if(mimeType == null){ + if(fileExtension.equals("xmi")){ + mimeType = "application/xmi"; + } + } + + System.out.println(mimeType); + + String sofaString = ""; + + switch(mimeType.split("/")[0]){ + case "image": + case "video": + case "audio": + sofaString = Base64.encodeBase64String(FileUtils.readFileToByteArray(fFile)); + mView.setSofaDataString(sofaString, mimeType); + break; + case "text": + sofaString = readFile(fFile); + mView.setSofaDataString(sofaString, mimeType); + + break; + case "application": + if(fileExtension.equals("xmi")) { + InputStream decodedFile = new ByteArrayInputStream(Files.readAllBytes(fFile.toPath())); + XmiCasDeserializer.deserialize(decodedFile, mView.getCas(), true); + break; + } + else if(mimeType.split("/")[1].equals("x-gzip")){ + CompressorInputStream decodedFile = new CompressorStreamFactory(true).createCompressorInputStream(CompressorStreamFactory.GZIP, new ByteArrayInputStream(Files.readAllBytes(fFile.toPath()))); + XmiCasDeserializer.deserialize(decodedFile, 
mView.getCas(), true); + break; + }else if(mimeType.split("/")[1].equals("x-xz")){ + CompressorInputStream decodedFile = new CompressorStreamFactory(true).createCompressorInputStream(CompressorStreamFactory.XZ, new ByteArrayInputStream(Files.readAllBytes(fFile.toPath()))); + XmiCasDeserializer.deserialize(decodedFile, mView.getCas(), true); + break; + } + + sofaString = Base64.encodeBase64String(FileUtils.readFileToByteArray(fFile)); + mView.setSofaDataString(sofaString, mimeType); + break; + default: + try{ + sofaString = readFile(fFile); + }catch(Exception e){ + sofaString = Base64.encodeBase64String(FileUtils.readFileToByteArray(fFile)); + } + mView.setSofaDataString(sofaString, mimeType); + break; + } + ; + + } catch (Exception e) { + e.printStackTrace(); + } + + + if (_addMetadata) { + if (JCasUtil.select(empty, DocumentMetaData.class).size() == 0) { + DocumentMetaData dmd = DocumentMetaData.create(empty); + File pFile = new File(result); + dmd.setDocumentId(pFile.getName()); + dmd.setDocumentTitle(pFile.getName()); + dmd.setDocumentUri(pFile.getAbsolutePath()); + dmd.addToIndexes(); + } + } + + if (_language != null && !_language.isEmpty()) { + empty.setDocumentLanguage(_language); + } + + } + + public void reset() { + _filePaths = _filePathsBackup; + _docNumber.set(0); + progress = new AdvancedProgressMeter(_initialSize); + } + + @Override + public boolean hasNext() { + return _filePaths.size() > 0; + } + + @Override + public long getSize() { + return _filePaths.size(); + } + + public CompletableFuture getAsyncNextByteArray() throws IOException, CompressorException, SAXException { + String result = _filePaths.poll(); + if (result == null) return CompletableFuture.completedFuture(1); + CompletableFuture val = AsyncFiles + .readAllBytes(Paths.get(result), 1024 * 1024 * 5) + .thenApply(bytes -> { + _loadedFiles.add(new ByteReadFuture(result, bytes)); + + //Calculate estimated unpacked size by using a compression ratio of 0.1 + long factor = 1; + if 
(result.endsWith(".gz") || result.endsWith(".xz")) { + factor = 10; + } + _currentMemorySize.getAndAdd(factor * (long) bytes.length); + return 0; + }); + return val; + } + + @Override + public long getDone() { + return _docNumber.get(); + } + + public String formatSize(long lSize) { + + int u = 0; + for (; lSize > 1024 * 1024; lSize >>= 10) { + u++; + } + if (lSize > 1024) + u++; + return String.format("%.1f %cB", lSize / 1024f, " kMGTPE".charAt(u)); + + } + + public enum DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE { + RANDOM, + SMALLEST, + LARGEST + } + + private String readFile(File file) throws FileNotFoundException { + String result = ""; + Scanner myReader = new Scanner(file); + while (myReader.hasNextLine()) { + if(result == ""){ + result = myReader.nextLine(); + }else{ + result += "\n" + myReader.nextLine(); + } + } + + return result; + } + + class ByteReadFuture { + private String _path; + private byte[] _bytes; + + public ByteReadFuture(String path, byte[] bytes) { + _path = path; + _bytes = bytes; + } + + public String getPath() { + return _path; + } + + public byte[] getBytes() { + return _bytes; + } + } +} \ No newline at end of file diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUISegmentationReader.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUISegmentationReader.java index b661deff..167181cc 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUISegmentationReader.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUISegmentationReader.java @@ -364,7 +364,7 @@ public boolean finishedLoading() { @Override public AdvancedProgressMeter getProgress() { - return this.progress; + return progress; } @Override diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUIYouTubeReader.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUIYouTubeReader.java new 
file mode 100644 index 00000000..3c1dbd3b --- /dev/null +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/reader/DUUIYouTubeReader.java @@ -0,0 +1,607 @@ +package org.texttechnologylab.DockerUnifiedUIMAInterface.io.reader; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import org.apache.commons.io.FileUtils; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSList; +import org.json.JSONArray; +import org.json.JSONObject; +import org.texttechnologylab.DockerUnifiedUIMAInterface.io.AsyncCollectionReader; +import org.texttechnologylab.DockerUnifiedUIMAInterface.io.DUUICollectionReader; +import org.texttechnologylab.DockerUnifiedUIMAInterface.monitoring.AdvancedProgressMeter; +import org.texttechnologylab.annotation.socialmedia.metadata.YouTube; +import org.texttechnologylab.annotation.socialmedia.metadata.youtube.Playlist; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.util.*; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; + +public class DUUIYouTubeReader implements DUUICollectionReader { + + private String _path; + private ConcurrentLinkedQueue _youtubeVideos; + private ConcurrentLinkedQueue _youtubeVideosBackup; + + private String _viewName; + + private int _initialSize; + private AtomicInteger _docNumber; + private long _maxMemory; + private AtomicLong _currentMemorySize; + + private boolean _addMetadata = true; + + private String _language = null; + + private AdvancedProgressMeter progress = null; + + private int debugCount = 25; + + private Map> 
_videosPlaylists; + private String _apiKey; + + public DUUIYouTubeReader(String youtubeLink, String apiKey) throws IOException, InterruptedException { + this(youtubeLink, apiKey, "_InitialView", 25, getRandomFromMode(null, -1), true, null); + } + + public DUUIYouTubeReader(String youtubeLink, String apiKey, String viewName) throws IOException, InterruptedException { + this(youtubeLink, apiKey, viewName, 25, getRandomFromMode(null, -1), true, null); + } + + public DUUIYouTubeReader(String youtubeLink, String apiKey, String viewName, int debugCount, int iRandom, boolean bAddMetadata, String language) throws IOException, InterruptedException { + _addMetadata = bAddMetadata; + _language = language; + _youtubeVideos = new ConcurrentLinkedQueue<>(); + _youtubeVideosBackup = new ConcurrentLinkedQueue<>(); + _videosPlaylists = new HashMap<>(); + _apiKey = apiKey; + _viewName = viewName; + + if(youtubeLink.contains("&list=")) { // Is playlist + + String[] parameters = youtubeLink.split("&"); + + String playlistId = ""; + for (String parameter : parameters) { + if (parameter.startsWith("list=")) { + playlistId = parameter.substring(5); + break; + } + } + + try { + String pageToken = ""; + + do{ + List pagedVideos = new LinkedList<>(); + + JSONObject jsonObject = getPlaylistVideos(playlistId, pageToken); + + JSONArray jsonItems = jsonObject.getJSONArray("items"); + + for (int i = 0; i < jsonItems.length(); i++) { + String videoId = jsonItems.getJSONObject(i).getJSONObject("contentDetails").getString("videoId"); + pagedVideos.add(new YouTubeVideo(videoId)); + + _videosPlaylists.put(videoId, Arrays.asList(playlistId)); + } + + if(_addMetadata){ + generateBulkMetadata(pagedVideos); + } + + if(jsonObject.has("nextPageToken")) + pageToken = jsonObject.getString("nextPageToken"); + else + pageToken = ""; + + _youtubeVideos.addAll(pagedVideos); + }while(!pageToken.equals("")); + + } catch (Exception e) { + throw e; + } + }else if(youtubeLink.contains("watch?v")) { // Is single video 
+ youtubeLink = youtubeLink.split("watch\\?v=")[1].split("&")[0]; + + YouTubeVideo video = new YouTubeVideo(youtubeLink); + _youtubeVideos.add(video); + + if(_addMetadata){ + generateMetadata(video); + } + }else if(youtubeLink.contains("youtu.be/")){ // Is single video with shortened url + youtubeLink = youtubeLink.split("youtu.be/")[1].split("&")[0]; + + YouTubeVideo video = new YouTubeVideo(youtubeLink); + _youtubeVideos.add(video); + + if(_addMetadata){ + generateMetadata(video); + } + }else{ // Is Channel + + String pageToken = ""; + String channelId = null; + + if(youtubeLink.contains("/@")){ + channelId = getChannelIdByHandle(youtubeLink.split("@")[1].split("/")[0]); + } + else if(youtubeLink.contains("/channel/")){ + channelId = youtubeLink.split("/channel/")[1].split("/")[0]; + } + + if(channelId != null){ + + do{ + List pagedVideos = new LinkedList<>(); + + JSONObject jsonObject = getChannelVideosByChannelId(channelId, ""); + + JSONArray jsonItems = jsonObject.getJSONArray("items"); + + for (int i = 0; i < jsonItems.length(); i++) { + JSONObject idRequestObject = jsonItems.getJSONObject(i).getJSONObject("id"); + + if(!idRequestObject.has("videoId")) continue; // Found own channel instead of video + + String videoId = idRequestObject.getString("videoId"); + pagedVideos.add(new YouTubeVideo(videoId)); + System.out.println("Added video: " + i); + } + + if(_addMetadata){ + generateBulkMetadata(pagedVideos); + } + + if(jsonObject.has("nextPageToken")) + pageToken = jsonObject.getString("nextPageToken"); + else + pageToken = ""; + + _youtubeVideos.addAll(pagedVideos); + + }while(!pageToken.equals("")); + + } + + } + + if (iRandom > 0) { + _youtubeVideos = random(_youtubeVideos, iRandom); + } + + _youtubeVideosBackup.addAll(_youtubeVideos); + + this.debugCount = debugCount; + + System.out.printf("Found %d files matching the pattern! 
\t Using Random: %d\n", _youtubeVideos.size(), iRandom); + _initialSize = _youtubeVideos.size(); + _docNumber = new AtomicInteger(0); + _currentMemorySize = new AtomicLong(0); + // 500 MB + _maxMemory = 500 * 1024 * 1024; + + progress = new AdvancedProgressMeter(_initialSize); + } + + private static int getRandomFromMode(AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE sampleMode, int sampleSize) { + if (sampleMode == AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE.SMALLEST) { + return sampleSize * -1; + } + return sampleSize; + } + + private static boolean getSortFromMode(AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE mode) { + if (mode == AsyncCollectionReader.DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE.RANDOM) { + return false; + } + return true; + } + + public static void addFilesToConcurrentList(File folder, String ending, ConcurrentLinkedQueue paths) { + File[] listOfFiles = folder.listFiles(); + + for (int i = 0; i < listOfFiles.length; i++) { + if (listOfFiles[i].isFile()) { + if (listOfFiles[i].getName().endsWith(ending)) { + paths.add(listOfFiles[i].getPath().toString()); + } + } else if (listOfFiles[i].isDirectory()) { + addFilesToConcurrentList(listOfFiles[i], ending, paths); + } + } + + } + + public static ConcurrentLinkedQueue sortBySize(ConcurrentLinkedQueue paths) { + + ConcurrentLinkedQueue rQueue = new ConcurrentLinkedQueue(); + + rQueue.addAll(paths.stream().sorted((s1, s2) -> { + Long firstLength = new File(s1).length(); + Long secondLength = new File(s2).length(); + + return firstLength.compareTo(secondLength) * -1; + }).collect(Collectors.toList())); + + return rQueue; + + } + + /** + * Skips files smaller than skipSmallerFiles + * + * @param paths paths to files + * @param skipSmallerFiles skip files smaller than this value in bytes + * @return filtered paths to files + */ + public static ConcurrentLinkedQueue skipBySize(ConcurrentLinkedQueue paths, int skipSmallerFiles) { + ConcurrentLinkedQueue 
rQueue = new ConcurrentLinkedQueue<>(); + + System.out.println("Skip files smaller than " + skipSmallerFiles + " bytes"); + System.out.println(" Number of files before skipping: " + paths.size()); + + rQueue.addAll(paths + .stream() + .filter(s -> new File(s).length() >= skipSmallerFiles) + .collect(Collectors.toList()) + ); + + System.out.println(" Number of files after skipping: " + rQueue.size()); + + return rQueue; + } + + public static ConcurrentLinkedQueue random(ConcurrentLinkedQueue videos, int iRandom) { + + ConcurrentLinkedQueue rQueue = new ConcurrentLinkedQueue(); + + Random nRandom = new Random(iRandom); + + ArrayList sList = new ArrayList<>(); + sList.addAll(videos); + + Collections.shuffle(sList, nRandom); + + if (iRandom > sList.size()) { + rQueue.addAll(sList.subList(0, sList.size())); + } else { + rQueue.addAll(sList.subList(0, iRandom)); + } + + + return rQueue; + + } + + @Override + public AdvancedProgressMeter getProgress() { + return this.progress; + } + + @Override + public void getNextCas(JCas empty) { + + YouTubeVideo result = _youtubeVideos.poll(); + + int val = _docNumber.addAndGet(1); + + progress.setDone(val); + progress.setLeft(_initialSize - val); + + if (_initialSize - progress.getCount() > debugCount) { + if (val % debugCount == 0 || val == 0) { + System.out.printf("%s \t %s\n", progress, result.getVideoUrl()); + } + } else { + System.out.printf("%s \t %s\n", progress, result.getVideoUrl()); + } + + try { + empty.reset(); + + JCas ytView; + try { + ytView = empty.getView(_viewName); + + }catch (Exception e){ + ytView = empty.createView(_viewName); + } + + ytView.setSofaDataString(result.getVideoUrl(), "text/x-uri"); + + if(_addMetadata) + setVideoMetadata(result, ytView); + + } catch (Exception e) { + e.printStackTrace(); + } + + if (_addMetadata) { + if (JCasUtil.select(empty, DocumentMetaData.class).size() == 0) { + DocumentMetaData dmd = DocumentMetaData.create(empty); + dmd.setDocumentId(result._id); + 
dmd.setDocumentTitle(result._title); + //dmd.setDocumentUri(result.getVideoUrl()); + dmd.addToIndexes(); + } + } + + if (_language != null && !_language.isEmpty()) { + empty.setDocumentLanguage(_language); + } + + } + + public void reset() { + _youtubeVideos = _youtubeVideosBackup; + _docNumber.set(0); + progress = new AdvancedProgressMeter(_initialSize); + } + + @Override + public boolean hasNext() { + return _youtubeVideos.size() > 0; + } + + @Override + public long getSize() { + return _youtubeVideos.size(); + } + + @Override + public long getDone() { + return _docNumber.get(); + } + + public String formatSize(long lSize) { + + int u = 0; + for (; lSize > 1024 * 1024; lSize >>= 10) { + u++; + } + if (lSize > 1024) + u++; + return String.format("%.1f %cB", lSize / 1024f, " kMGTPE".charAt(u)); + + } + + public enum DUUI_ASYNC_COLLECTION_READER_SAMPLE_MODE { + RANDOM, + SMALLEST, + LARGEST + } + + private JSONObject getPlaylistVideos(String playlistId, String pageToken) throws IOException, InterruptedException { + String url = "https://youtube.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=" + playlistId + "&key=" + _apiKey + "&maxResults=50&pageToken=" + pageToken; + HttpClient client = HttpClient.newHttpClient(); + HttpRequest request = HttpRequest.newBuilder() + .uri(URI.create(url)) + .build(); + + HttpResponse response = client.send(request, HttpResponse.BodyHandlers.ofString()); + + return new JSONObject(response.body().toString()); + } + + private String getChannelIdByHandle(String channelHandle) throws IOException, InterruptedException { + String url = "https://youtube.googleapis.com/youtube/v3/search?part=snippet&maxResults=1&q=" + channelHandle + "&type=channel&key=" + _apiKey; + HttpClient client = HttpClient.newHttpClient(); + HttpRequest request = HttpRequest.newBuilder() + .uri(URI.create(url)) + .build(); + + HttpResponse response = client.send(request, HttpResponse.BodyHandlers.ofString()); + + JSONObject jsonObject = new 
JSONObject(response.body().toString()); + JSONArray resultArray = jsonObject.getJSONArray("items"); + + if(resultArray.length() == 0) return null; + + return resultArray.getJSONObject(0).getJSONObject("id").getString("channelId"); + } + + private JSONObject getChannelVideosByChannelId(String channelId, String pageToken) throws IOException, InterruptedException { + String url = "https://www.googleapis.com/youtube/v3/search?key=" + _apiKey + "&channelId=" + channelId +"&part=id&order=date&maxResults=50&pageToken=" + pageToken; + + HttpClient client = HttpClient.newHttpClient(); + HttpRequest request = HttpRequest.newBuilder() + .uri(URI.create(url)) + .build(); + + HttpResponse response = client.send(request, HttpResponse.BodyHandlers.ofString()); + + return new JSONObject(response.body().toString()); + } + + private void setVideoMetadata(YouTubeVideo video, JCas jCas) throws IOException, InterruptedException{ + + YouTube youTube = new YouTube(jCas); + + if(_videosPlaylists.containsKey(video.getVideoId())){ + List playlistIds = _videosPlaylists.get(video.getVideoId()); + Playlist[] playlists = new Playlist[_videosPlaylists.get(video.getVideoId()).size()]; + + for (int i = 0; i < playlistIds.size(); i++){ + Playlist playlist = new Playlist(jCas); + + String playlistUrl = "https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=" + playlistIds.get(i) + "&key=" + _apiKey; + + HttpClient client = HttpClient.newHttpClient(); + HttpRequest request = HttpRequest.newBuilder() + .uri(URI.create(playlistUrl)) + .build(); + + HttpResponse response = client.send(request, HttpResponse.BodyHandlers.ofString()); + + + JSONObject playlistJsonObject = new JSONObject(response.body().toString()); + JSONObject playlistItem = playlistJsonObject.getJSONArray("items").getJSONObject(0); + JSONObject playlistSnippet = playlistItem.getJSONObject("snippet"); + + playlist.setName(playlistSnippet.getString("title")); + 
playlist.setCreateDate(youtubeDateToInt(playlistSnippet.getString("publishedAt"))); + playlist.setUrl("https://www.youtube.com/watch?v=" + video.getVideoId() + "&list=" + playlistIds.get(i)); + + playlists[i] = playlist; + } + + FSList list = FSList.create(jCas, playlists); + list.addToIndexes(); + } + + youTube.setUrl(video.getVideoUrl()); + youTube.setChannelName(video._channelName); + youTube.setChannelURL(video._channelUrl); + youTube.setLength(video._duration); + youTube.setViews(video._views); + youTube.setLikes(video._likes); + youTube.setDislikes(0); // Does not support dislikes + youTube.setCreateDate(video._createDate); + + youTube.setDownloadDate(currentDateToInt()); + youTube.addToIndexes(); + } + + private void generateMetadata(YouTubeVideo video) throws IOException, InterruptedException { + List videos = new LinkedList<>(); + videos.add(video); + generateBulkMetadata(videos); + } + + private void generateBulkMetadata(List videos) throws IOException, InterruptedException { + if(videos.isEmpty()) return; + + String ids = ""; + + for(YouTubeVideo video : videos){ + if(ids.equals("")){ + ids = video.getVideoId(); + }else{ + ids += "," + video.getVideoId(); + } + } + + + String url = "https://youtube.googleapis.com/youtube/v3/videos?part=snippet%2Cstatistics%2CcontentDetails&id=" + ids + "&key=" + _apiKey; + HttpClient client = HttpClient.newHttpClient(); + HttpRequest request = HttpRequest.newBuilder() + .uri(URI.create(url)) + .build(); + + HttpResponse response = client.send(request, HttpResponse.BodyHandlers.ofString()); + + JSONObject jsonObject = new JSONObject(response.body().toString()); + + JSONArray items = jsonObject.getJSONArray("items"); + + for(int i = 0; i < items.length(); i++){ + JSONObject snippet = items.getJSONObject(i).getJSONObject("snippet"); + JSONObject statistics = items.getJSONObject(i).getJSONObject("statistics"); + JSONObject contentDetails = items.getJSONObject(i).getJSONObject("contentDetails"); + + 
videos.get(i).setMetadata(snippet, statistics, contentDetails); + } + } + + + private int youtubeDateToInt(String youtubeDate){ + String[] dateElements = youtubeDate.split("T")[0].split("-"); // Seperate date and time + + if(dateElements[0].length() == 1) + dateElements[0] = "0" + dateElements[0]; + + if(dateElements[1].length() == 1) + dateElements[1] = "0" + dateElements[1]; + + int iCreateDate = Integer.parseInt(dateElements[2] + dateElements[1] + dateElements[0]); + return iCreateDate; + } + + private int currentDateToInt(){ + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("ddMMyyyy"); + return Integer.parseInt(ZonedDateTime.now().format(formatter)); + } + + private String readFile(File file) throws FileNotFoundException { + String result = ""; + Scanner myReader = new Scanner(file); + while (myReader.hasNextLine()) { + if(result == ""){ + result = myReader.nextLine(); + }else{ + result += "\n" + myReader.nextLine(); + } + } + + return result; + } + + class YouTubeVideo{ + private String _id; + private String _channelName; + private String _channelUrl; + private String _title; + private int _duration; + private int _views; + private int _likes; + private int _createDate; + + public YouTubeVideo(String id){ + _id = id; + } + + public String getVideoId(){ + return _id; + } + + public String getVideoUrl(){ + return "https://www.youtube.com/watch?v=" + _id; + } + + public void setMetadata(JSONObject snippet, JSONObject statistics, JSONObject contentDetails){ + + _title = snippet.getString("title"); + _channelName = snippet.getString("channelTitle"); + _channelUrl = "https://www.youtube.com/channel/" + snippet.getString("channelId"); + String sDuration = contentDetails.getString("duration").substring(2); + int iDuration = 0; + + if(sDuration.contains("H")){ + String[] hours = sDuration.split("H"); + String[] minutes = hours[1].split("M"); + String seconds = minutes[1].split("S")[0]; + + iDuration = Integer.parseInt(hours[0]) * 360 + 
Integer.parseInt(minutes[0]) * 60 + Integer.parseInt(seconds); + }else if(sDuration.contains("M")){ + String[] minutes = sDuration.split("M"); + String seconds = minutes[1].split("S")[0]; + + iDuration = Integer.parseInt(minutes[0]) * 60 + Integer.parseInt(seconds); + }else if(sDuration.contains("S")){ + String seconds = sDuration.split("S")[0]; + + iDuration = Integer.parseInt(seconds); + } + + _duration = iDuration; + _views = Integer.parseInt(statistics.getString("viewCount")); + _likes = Integer.parseInt(statistics.getString("likeCount")); + + _createDate = youtubeDateToInt(snippet.getString("publishedAt")); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/writer/AudioSegmentWriter.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/writer/AudioSegmentWriter.java new file mode 100644 index 00000000..5674f33d --- /dev/null +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/io/writer/AudioSegmentWriter.java @@ -0,0 +1,69 @@ +package org.texttechnologylab.DockerUnifiedUIMAInterface.io.writer; + +import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import org.apache.commons.io.FileUtils; +import org.apache.uima.cas.CASException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasFileWriter_ImplBase; +import org.texttechnologylab.DockerUnifiedUIMAInterface.tools.MultimodalUtil; +import org.texttechnologylab.annotation.type.AudioToken; + +import java.io.File; +import java.io.IOException; + +public class AudioSegmentWriter extends JCasFileWriter_ImplBase { + + public static final String PARAM_AUDIO_TOKEN_VIEW = "audioTokenView"; + @ConfigurationParameter(name = PARAM_AUDIO_TOKEN_VIEW, defaultValue = "_InitialView") + private String audioTokenView; + + 
public static final String PARAM_AUDIO_CONTENT_VIEW = "audioView"; + @ConfigurationParameter(name = PARAM_AUDIO_CONTENT_VIEW, defaultValue = "_InitialView") + private String audioView; + + /** Cuts the audio of the view named by {@code audioView} into one "wav" file per {@code AudioToken} found in the view named by {@code audioTokenView} (via {@code MultimodalUtil.getAllCoveredAudio}) and moves every file into the target location. File names are prefixed with the document id from {@code DocumentMetaData} followed by "_", or with "File_" when no metadata or no id is present. A {@code CASException} from view access and an {@code IOException} from moving a file are printed and otherwise ignored. */ + @Override + public void process(JCas jCas) { + + + try { + DocumentMetaData meta = null; + if (JCasUtil.select(jCas, DocumentMetaData.class).size() > 0) { + meta = DocumentMetaData.get(jCas); + } + + /* the lambda below may only capture an effectively final variable */ + DocumentMetaData finalMeta = meta; + + JCas audioFileView = jCas.getView(audioView); + + MultimodalUtil.getAllCoveredAudio(jCas.getView(audioTokenView), audioFileView, AudioToken.class, "wav").forEach(file -> { + + String moveTo = getTargetLocation(); + + /* make sure the target directory ends with a path separator before the file name is appended */ + if(!moveTo.endsWith("/") && !moveTo.endsWith("\\")){ + moveTo = moveTo + "/"; + } + + String documentName; + + if(finalMeta != null && finalMeta.getDocumentId() != null){ + documentName = finalMeta.getDocumentId() + "_"; + }else{ + documentName = "File_"; + } + + try { + FileUtils.moveFile(new File(file.getAbsolutePath()), new File(moveTo + documentName + file.getName())); + } catch (IOException e) { + e.printStackTrace(); + } + } + + ); + } catch (CASException e) { + e.printStackTrace(); + } + } +} diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/lua/DUUILuaCommunicationLayer.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/lua/DUUILuaCommunicationLayer.java index fcb99e1f..fb1777de 100644 --- a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/lua/DUUILuaCommunicationLayer.java +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/lua/DUUILuaCommunicationLayer.java @@ -1,6 +1,7 @@ package org.texttechnologylab.DockerUnifiedUIMAInterface.lua; import org.apache.commons.compress.compressors.CompressorException; +import org.apache.uima.cas.CASException; import org.apache.uima.jcas.JCas; import org.luaj.vm2.LuaTable; import org.luaj.vm2.lib.jse.CoerceJavaToLua; @@ -32,18 +33,37 @@ public DUUILuaCommunicationLayer(String script, String origin, DUUILuaContext gl 
_globalContext = globalContext; } - public void serialize(JCas jc, ByteArrayOutputStream out, Map parameters) throws CompressorException, IOException, SAXException { + public void serialize(JCas jc, ByteArrayOutputStream out, Map parameters) throws CompressorException, IOException, SAXException, CASException { + serialize(jc, out, parameters, "_InitialView"); + } + + public void serialize(JCas jc, ByteArrayOutputStream out, Map parameters, String sourceView) throws CompressorException, IOException, SAXException, CASException { LuaTable params = new LuaTable(); if (parameters != null) { for (String key : parameters.keySet()) { params.set(key, parameters.get(key)); } } - _file.call("serialize",CoerceJavaToLua.coerce(jc),CoerceJavaToLua.coerce(out), params); + + _file.call("serialize",CoerceJavaToLua.coerce(jc.getView(sourceView)),CoerceJavaToLua.coerce(out), params); + } + + public void deserialize(JCas jc, ByteArrayInputStream input) throws IOException, SAXException, CASException { + deserialize(jc, input, "_InitialView"); } - public void deserialize(JCas jc, ByteArrayInputStream input) throws IOException, SAXException { - _file.call("deserialize",CoerceJavaToLua.coerce(jc),CoerceJavaToLua.coerce(input)); + + public void deserialize(JCas jc, ByteArrayInputStream input, String targetView) throws IOException, SAXException, CASException { + + JCas tJc; + + try{ + tJc = jc.getView(targetView); + }catch (Exception e){ + tJc = jc.createView(targetView); + } + + _file.call("deserialize",CoerceJavaToLua.coerce(tJc),CoerceJavaToLua.coerce(input)); } diff --git a/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/tools/MultimodalUtil.java b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/tools/MultimodalUtil.java new file mode 100644 index 00000000..bc34867e --- /dev/null +++ b/src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/tools/MultimodalUtil.java @@ -0,0 +1,560 @@ +package org.texttechnologylab.DockerUnifiedUIMAInterface.tools; + 
+import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; +import org.apache.commons.codec.binary.Base64; +import org.apache.uima.cas.CASException; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.texttechnologylab.annotation.AnnotationComment; +import org.texttechnologylab.annotation.type.AudioToken; +import org.texttechnologylab.annotation.type.Coordinate; +import org.texttechnologylab.annotation.type.SubImage; + +import javax.imageio.ImageIO; +import java.awt.*; +import java.awt.geom.Path2D; +import java.awt.image.BufferedImage; +import java.awt.image.RenderedImage; +import java.io.*; +import java.nio.file.Files; +import java.util.*; +import java.util.List; + +public class MultimodalUtil { + + /** + * Converts each AudioTokens into its own audio snippet. + * @param audioTokenView The view in which the audio tokens are stored + * @param annotationClass TThe annotation the covered elements are derived from + * @return List of files, each file containing the audio content. + * @throws CASException + */ + public static List getAllCoveredAudio(JCas audioTokenView, Class annotationClass) throws CASException { + return getAllCoveredAudio(audioTokenView, null, annotationClass, "wav"); + } + + /** + * Converts each AudioTokens into its own audio snippet. + * @param audioTokenView The view in which the audio tokens are stored + * @param annotationClass The annotation the covered elements are derived from + * @param targetFormat File format for the output file. (like "wav" or "mp3") + * @return List of files, each file containing the audio content. + * @throws CASException + */ + public static List getAllCoveredAudio(JCas audioTokenView, Class annotationClass, String targetFormat) throws CASException { + return getAllCoveredAudio(audioTokenView, null, annotationClass, targetFormat); + } + + /** + * Converts each AudioTokens into its own audio snippet. 
+ * @param audioTokenView The view in which the audio tokens are stored + * @param annotationClass The annotation the covered elements are derived from + * @param audioFileView The view containing the entire audio file in its sofa string (If null, tries to auto-detect) + * @param targetFormat File format for the output file. (like "wav" or "mp3") + * @return List of files, each file containing the audio content. + * @throws CASException + */ + public static List getAllCoveredAudio(JCas audioTokenView, JCas audioFileView, Class annotationClass, String targetFormat) throws CASException { + + List files = new ArrayList<>(); + List commands = new LinkedList<>(); + + JCasUtil.select(audioTokenView, annotationClass).forEach(annotation -> { + + float startTime = Integer.MAX_VALUE; + float endTime = 0; + + List tokens = JCasUtil.selectOverlapping(AudioToken.class, annotation).stream().toList(); + + for(AudioToken token : tokens){ + System.out.println(token.getTimeStart() + " " + token.getTimeEnd()); + if(token.getTimeStart() < startTime) + startTime = token.getTimeStart(); + + if(token.getTimeEnd() > endTime) + endTime = token.getTimeEnd(); + } + + if(startTime == Integer.MAX_VALUE) { + return; + } + + commands.add(String.format("-ss %s -t %s %s", + startTime, + endTime - startTime, + getOutputName(audioTokenView, annotation, targetFormat))); + + + File file = new File(getOutputName(audioTokenView, annotation, targetFormat)); + file.deleteOnExit(); + files.add(file); + }); + + MultimodalUtil.getEveryAudioSegment(audioTokenView, audioFileView, commands); + + return files; + } + + /** + * Converts a AudioToken into its own audio snippet + * @param audioTokenView The view in which the audio tokens are stored + * @param audioToken The audio token class (like AudioToken) + * @return A file containing the audio segment + * @throws CASException + */ + public static File getCoveredAudio(JCas audioTokenView, AudioToken audioToken) throws CASException { + return 
getCoveredAudio(audioTokenView, null, audioToken, "wav"); + } + + /** + * Converts a AudioToken into its own audio snippet + * @param audioTokenView The view in which the audio tokens are stored + * @param audioToken The audio token class (like AudioToken) + * @param targetFormat File format for the output file. (like "wav" or "mp3") + * @return A file containing the audio segment + * @throws CASException + */ + public static File getCoveredAudio(JCas audioTokenView, AudioToken audioToken, String targetFormat) throws CASException { + return getCoveredAudio(audioTokenView, null, audioToken, targetFormat); + } + + /** + * Converts a AudioToken into its own audio snippet + * @param audioTokenView The view in which the audio tokens are stored + * @param audioToken The audio token class (like AudioToken) + * @param audioFileView The view containing the entire audio file in its sofa string (If null, tries to auto-detect) + * @param targetFormat File format for the output file. (like "wav" or "mp3") + * @return A file containing the audio segment + * @throws CASException + */ + public static File getCoveredAudio(JCas audioTokenView, JCas audioFileView, AudioToken audioToken, String targetFormat) throws CASException { + + if(audioFileView == null) + audioFileView = findAudioView(audioTokenView); + + String inputFileName = "temp_" + audioFileView.getViewName(); + String outputFileName = getOutputName(audioTokenView, audioToken, targetFormat); + + if(!new File(inputFileName).exists()) { + // Convert encoded string to file + OutputStream stream = null; + try { + stream = new FileOutputStream(inputFileName); + stream.write(Base64.decodeBase64(audioFileView.getSofaDataString())); + } catch (IOException e) { + e.printStackTrace(); + } + + if (stream != null) { + try { + stream.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + File inputFile = new File(inputFileName); + inputFile.deleteOnExit(); + + executeFFMpeg(inputFile.getAbsolutePath(), 
outputFileName, audioToken.getTimeStart(), audioToken.getTimeEnd()); + + inputFile.delete(); + + File outputFile = new File(outputFileName); + outputFile.deleteOnExit(); + + return outputFile; + } + + private static void getEveryAudioSegment(JCas audioTokenCas, JCas audioFileView, List commands) throws CASException { + + if(audioFileView == null) + audioFileView = findAudioView(audioTokenCas); + + String inputFileName = "temp_" + audioFileView.getViewName(); + + if(!new File(inputFileName).exists()) { + // Convert encoded string to file + OutputStream stream = null; + try { + stream = new FileOutputStream(inputFileName); + stream.write(Base64.decodeBase64(audioFileView.getSofaDataString())); + } catch (IOException e) { + e.printStackTrace(); + } + + if (stream != null) { + try { + stream.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + } + + File inputFile = new File(inputFileName); + inputFile.deleteOnExit(); + + executeFFMpeg(inputFile.getAbsolutePath(), commands); + inputFile.delete(); + } + + /** + * Tries to find the view which contains the entire audio file in its sofa string + * @param fromJCas JCas to search the views in + * @throws CASException + */ + public static JCas findAudioView(JCas fromJCas) throws CASException { + Iterator iter = fromJCas.getViewIterator(); + + while(iter.hasNext()){ + JCas view = iter.next(); + + if(view.getSofaMimeType().startsWith("audio/")){ + return view; + } + } + + return fromJCas; + } + + + /** + * Converts each AudioToken into its own video snippet. + * @param audioTokenView The view in which the audio tokens are stored + * @param annotationClass The annotation the covered elements are derived from + * @return List of files, each file containing the video content. 
+ * @throws CASException + */ + public static List getAllCoveredVideo(JCas audioTokenView, Class annotationClass) throws CASException { + return getAllCoveredVideo(null, audioTokenView, annotationClass); + } + + /** + * Converts each AudioToken into its own audio snippet. + * @param videoFileView The view containing the entire video file in its sofa string (If null, tries to auto-detect) + * @param audioTokenView The view in which the audio tokens are stored + * @param annotationClass The annotation the covered elements are derived from + * @return List of files, each file containing the audio content. + * @throws CASException + */ + public static List getAllCoveredVideo(JCas audioTokenView, JCas videoFileView, Class annotationClass) throws CASException { + return getAllCoveredVideo(videoFileView, audioTokenView, annotationClass, "mp4"); + } + + /** + * Converts each AudioToken into its own audio snippet. + * @param videoFileView The view containing the entire video file in its sofa string (If null, tries to auto-detect) + * @param audioTokenView The view in which the audio tokens are stored + * @param annotationClass The annotation the covered elements are derived from + * @param targetFormat File format for the output file. (like "mp4" or "webm") + * @return List of files, each file containing the audio content. 
+ * @throws CASException + */ + public static List getAllCoveredVideo(JCas audioTokenView, JCas videoFileView, Class annotationClass, String targetFormat) throws CASException { + + if(videoFileView == null){ + videoFileView = findVideoView(audioTokenView); + } + + List files = new ArrayList<>(); + List commands = new LinkedList<>(); + + JCasUtil.select(audioTokenView, annotationClass).forEach(annotation -> { + + float startTime = Integer.MAX_VALUE; + float endTime = 0; + + List tokens = JCasUtil.selectOverlapping(AudioToken.class, annotation).stream().toList(); + + for(AudioToken token : tokens){ + if(token.getTimeStart() < startTime) + startTime = token.getTimeStart(); + + if(token.getTimeEnd() > endTime) + endTime = token.getTimeEnd(); + } + + if(startTime == Integer.MAX_VALUE) + return; +/* + System.out.println("============================"); + System.out.println(getOutputName(audioTokenView, annotation, targetFormat)); + System.out.println(startTime + " " + endTime); + */ + + commands.add(String.format("-ss %s -t %s %s", + startTime, + endTime - startTime, + getOutputName(audioTokenView, annotation, targetFormat))); + + + File file = new File(getOutputName(audioTokenView, annotation, targetFormat)); + file.deleteOnExit(); + files.add(file); + }); + + MultimodalUtil.getEveryVideoSegment(videoFileView, commands); + + return files; + } + + + /** + * Converts each AudioTokens into its own audio snippet. + * @param videoFileView The view containing the entire video file in its sofa string (If null, tries to auto-detect) + * @param audioToken The audio token class (like AudioToken) + * @return List of files, each file containing the audio content. + * @throws CASException + */ + public static File getCoveredVideo(JCas videoFileView, AudioToken audioToken){ + return getCoveredVideo(videoFileView, audioToken, "mp4"); + } + + /** + * Converts each AudioTokens into its own audio snippet. 
+ * @param videoFileView The view containing the entire video file in its sofa string (If null, tries to auto-detect) + * @param audioToken The audio token class (like AudioToken) + * @param targetFormat File format for the output file. (like "mp4" or "webm") + * @return List of files, each file containing the audio content. + * @throws CASException + */ + public static File getCoveredVideo(JCas videoFileView, AudioToken audioToken, String targetFormat){ + + String inputFileName = "temp_" + videoFileView.getViewName(); + String outputFileName = getOutputName(videoFileView, audioToken, targetFormat); + + if(!new File("temp_" + videoFileView.getViewName()).exists()) { + // Convert encoded string to file + OutputStream stream = null; + try { + stream = new FileOutputStream(inputFileName); + stream.write(Base64.decodeBase64(videoFileView.getSofaDataString())); + } catch (IOException e) { + e.printStackTrace(); + } + + if (stream != null) { + try { + stream.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + File inputFile = new File(inputFileName); + inputFile.deleteOnExit(); + + executeFFMpeg(inputFile.getAbsolutePath(), outputFileName, audioToken.getTimeStart(), audioToken.getTimeEnd()); + + File outputFile = new File(outputFileName); + outputFile.deleteOnExit(); + + return outputFile; + } + + private static void getEveryVideoSegment(JCas videoViewCas, List commands){ + + String inputFileName = "temp_" + videoViewCas.getViewName(); + + if(!new File("temp_" + videoViewCas.getViewName()).exists()) { + // Convert encoded string to file + OutputStream stream = null; + try { + stream = new FileOutputStream(inputFileName); + stream.write(Base64.decodeBase64(videoViewCas.getSofaDataString())); + } catch (IOException e) { + e.printStackTrace(); + } + + if (stream != null) { + try { + stream.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + File inputFile = new File(inputFileName); + inputFile.deleteOnExit(); + + 
executeFFMpeg(inputFile.getAbsolutePath(), commands); + } + + /** + * Tries to find the view which contains the entire video file in its sofa string + * @param fromJCas JCas to search the views in + * @throws CASException + */ + public static JCas findVideoView(JCas fromJCas) throws CASException { + Iterator iter = fromJCas.getViewIterator(); + + while(iter.hasNext()){ + JCas view = iter.next(); + + if(view.getSofaMimeType().startsWith("video/")){ + return view; + } + } + + return fromJCas; + } + + private static void executeFFMpeg(String absoluteInputPath, String output, float startTime, float endTime){ + try { + // -ss: seeking (skipping forward x seconds) + // -t: duration + ProcessBuilder pb = new ProcessBuilder("ffmpeg", "-ss,", Float.toString(startTime), "-t", Float.toString(endTime - startTime), "-i", absoluteInputPath, output); + + pb.redirectOutput(ProcessBuilder.Redirect.INHERIT); + pb.redirectError(ProcessBuilder.Redirect.INHERIT); + Process p = pb.start(); + p.waitFor(); + + } catch (IOException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + + private static void executeFFMpeg(String absoluteInputPath, List commands){ + try { + // -ss: seeking (skipping forward x seconds) + // -t: duration + ProcessBuilder pb = new ProcessBuilder("ffmpeg", "-i", absoluteInputPath); + + for(String outputCommand : commands){ + pb.command().addAll(Arrays.stream(outputCommand.split(" ")).toList()); + } + + pb.redirectOutput(ProcessBuilder.Redirect.INHERIT); + pb.redirectError(ProcessBuilder.Redirect.INHERIT); + Process p = pb.start(); + p.waitFor(); + + } catch (IOException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + + public static List getSubImages(JCas jCas){ + return getSubImages(jCas, null); + } + + public static List getSubImages(JCas jCas, String overrideExtension) { + + List subImages = new ArrayList<>(); + + JCasUtil.select(jCas, SubImage.class).forEach(subImage 
-> { + byte[] base64Image = Base64.decodeBase64(subImage.getParent().getSrc()); + try { + BufferedImage bImage = ImageIO.read(new ByteArrayInputStream(base64Image)); + + Polygon polygon = new Polygon(); + + for(int i = 0; i < subImage.getCoordinates().size(); i++){ + + Coordinate coordniate = subImage.getCoordinates().get(i); + polygon.addPoint(coordniate.getX(), coordniate.getY()); + } + + Rectangle bounds = polygon.getBounds(); + + BufferedImage bSubImage = new BufferedImage(bounds.width, bounds.height, BufferedImage.TYPE_INT_RGB); + + Graphics2D graphics = bSubImage.createGraphics(); + + polygon.translate(-bounds.x, -bounds.y); + + graphics.setClip(polygon); + graphics.drawImage(bImage, -bounds.x, -bounds.y, null); + + // Create file + + File tempInputFile = new File("tempInputImage"); + tempInputFile.deleteOnExit(); + try (OutputStream stream = new FileOutputStream(tempInputFile)) { + stream.write(base64Image); + } + + String mimeType = Files.probeContentType(tempInputFile.toPath()); + String subType = ""; + + if(overrideExtension == null){ + if(mimeType != null && !mimeType.isEmpty()){ + subType = mimeType.split("/")[1]; + }else{ + subType = "jpg"; + } + }else{ + subType = overrideExtension; + if(subType.startsWith(".")){ + if(subType.length() > 1) + subType = subType.substring(1); + } + } + + File outputFile = new File(getOutputName(jCas, subImage, subType)); + outputFile.deleteOnExit(); + + RenderedImage rendImage = bSubImage; + ImageIO.write(rendImage, subType, outputFile); + + subImages.add(outputFile); + } catch (IOException e) { + e.printStackTrace(); + } + }); + + return subImages; + } + + private static String getOutputName(JCas jCas, AudioToken audioToken, String format){ + if(format.startsWith(".")){ + format = format.substring(1); + } + + String documentId = ""; + if (JCasUtil.select(jCas, DocumentMetaData.class).size() > 0) { + DocumentMetaData meta = DocumentMetaData.get(jCas); + documentId = meta.getDocumentId() + "_"; + } + + 
System.out.println("OUTPUT FILE: " + documentId + audioToken._id() + "_" + audioToken.getTimeStart() + "-" + audioToken.getTimeEnd() + "." + format); + return documentId + audioToken._id() + "_" + audioToken.getTimeStart() + "-" + audioToken.getTimeEnd() + "." + format; + } + + private static String getOutputName(JCas jCas, Annotation annotation, String format){ + if(format.startsWith(".")){ + format = format.substring(1); + } + + String documentId = ""; + if (JCasUtil.select(jCas, DocumentMetaData.class).size() > 0) { + DocumentMetaData meta = DocumentMetaData.get(jCas); + documentId = meta.getDocumentId() + "_"; + } + + return documentId + annotation._id() + "_" + annotation.getBegin()+ "-" + annotation.getEnd() + "." + format; + } + + private static String getOutputName(JCas jCas, SubImage subImage, String format){ + + String documentId = ""; + if (JCasUtil.select(jCas, DocumentMetaData.class).size() > 0) { + DocumentMetaData meta = DocumentMetaData.get(jCas); + documentId = meta.getDocumentId() + "_"; + } + + return documentId + subImage.getParent()._id() + "_" + subImage._id() + "." 
+ format; + } + +} diff --git a/src/test/java/TestDUUI.java b/src/test/java/TestDUUI.java index bef9a138..e8076dcb 100644 --- a/src/test/java/TestDUUI.java +++ b/src/test/java/TestDUUI.java @@ -1,11 +1,11 @@ import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.io.FileUtils; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.cas.SerialFormat; import org.apache.uima.cas.impl.XmiCasSerializer; -import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.pipeline.SimplePipeline; @@ -30,22 +30,30 @@ import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIPipelineDescription; import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.*; import org.texttechnologylab.DockerUnifiedUIMAInterface.io.AsyncCollectionReader; +import org.texttechnologylab.DockerUnifiedUIMAInterface.io.DUUIAsynchronousProcessor; +import org.texttechnologylab.DockerUnifiedUIMAInterface.io.DUUICollectionReader; +import org.texttechnologylab.DockerUnifiedUIMAInterface.io.reader.DUUIMultimodalCollectionReader; +import org.texttechnologylab.DockerUnifiedUIMAInterface.io.reader.DUUIYouTubeReader; +import org.texttechnologylab.DockerUnifiedUIMAInterface.io.writer.AudioSegmentWriter; import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaCommunicationLayer; import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaSandbox; import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.LuaConsts; import org.texttechnologylab.DockerUnifiedUIMAInterface.pipeline_storage.DUUIMockStorageBackend; import 
org.texttechnologylab.DockerUnifiedUIMAInterface.pipeline_storage.sqlite.DUUISqliteStorageBackend; +import org.texttechnologylab.DockerUnifiedUIMAInterface.tools.MultimodalUtil; +import org.texttechnologylab.annotation.type.*; import org.xml.sax.SAXException; import javax.script.*; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; +import java.io.*; import java.net.URISyntaxException; +import java.net.URL; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.util.HashSet; +import java.util.Set; import java.util.stream.Collectors; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; @@ -821,4 +829,370 @@ public void kubernetesTest() throws Exception { composer.run(pCorpusReader, "sentence"); } + @Test + public void differentViewsTest() throws Exception{ + + ClassLoader classLoader = getClass().getClassLoader(); + URL resource = classLoader.getResource("hf_key.txt"); + + File keyFile = new File(resource.toURI());; + + String content = ""; + try { + content += Files.readAllLines(keyFile.toPath(), StandardCharsets.UTF_8); + } catch (IOException e) { + e.printStackTrace(); + } + + String hfKey = content.substring(1, content.length() - 1); + + JCas aCas = JCasFactory.createJCas(); + + File videoFile = new File("D:/DUUIVideos/read/TBBT.mp4"); + if (videoFile.exists()) { + String encoded = org.apache.commons.codec.binary.Base64.encodeBase64String(org.apache.commons.io.FileUtils.readFileToByteArray(videoFile)); + String mimeType = Files.probeContentType(videoFile.toPath()); + System.out.println(mimeType); + aCas.setSofaDataString(encoded, mimeType); + //aCas.setSofaDataString("https://www.youtube.com/watch?v=dEMzzgbm6Ow", mimeType); + }else{ + System.out.println(videoFile.getAbsolutePath() + " not found"); + return; + } + + //aCas.setSofaDataString("https://www.youtube.com/watch?v=8qZYsYq_Ctw", "text/x-uri"); + + int iWorkers = 1; + + 
DUUILuaContext ctx = new DUUILuaContext().withJsonLibrary(); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) // wir überspringen die Verifikation aller Componenten =) + .withLuaContext(ctx) // wir setzen den definierten Kontext + .withWorkers(iWorkers); // wir geben dem Composer eine Anzahl an Threads mit. + + DUUIUIMADriver uima_driver = new DUUIUIMADriver(); + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + + // Hinzufügen der einzelnen Driver zum Composer + composer.addDriver(uima_driver, remoteDriver, dockerDriver); + + aCas.setDocumentLanguage("de"); + + /* + composer.add(new DUUIDockerDriver.Component("duui-yt-dlp:latest") // YT Downloader + .withScale(iWorkers) + .withTargetView("video_view") + .withParameter("withTranscription", "true") + .build()); + + composer.add(new DUUIDockerDriver.Component("duui-video-to-audio:latest") // Video to audio + .withScale(iWorkers) + .withSourceView("video_view") + .withTargetView("audio_view") + .build()); + + composer.add(new DUUIDockerDriver.Component("duui-whisper:latest") // Audio to text + .withScale(iWorkers) + .withSourceView("audio_view") + .withTargetView("text_view") + .build()); + + composer.add(new DUUIDockerDriver.Component("duui-pyannote:latest") // Audio to speaker + .withScale(iWorkers) + .withSourceView("audio_view") + .withParameter("token", hfKey) + .withTargetView("text_view") + .build()); */ + + composer.add(new DUUIDockerDriver.Component("duui-annotheia:latest") // Annotheia + .withScale(iWorkers) + .withTargetView("text_view") + .withName("annotheia") + //.withRunningAfterDestroy(true) + //.withParameter("device", "cuda") + .build()); + + composer.add(new DUUIDockerDriver.Component("duui-spacy:latest") // Spacy + .withScale(iWorkers) + .withView("text_view") + .withParameter("use_existing_sentences", "false") + .withParameter("use_existing_tokens", "false") + .build()); + + /*composer.add(new 
DUUIRemoteDriver.Component("http://localhost:9717") // Audio to speaker + .withScale(iWorkers) + .withSourceView("audio_view") + .withTargetView("text_view") + .withParameter("token", hfKey) + .withParameter("device", "cuda") + .build()); + */ + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, "C:/test/temp", + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP")) + .build()); + + composer.run(aCas); + + MultimodalUtil.getAllCoveredVideo(aCas.getView("text_view"), aCas, Sentence.class, "mp4").forEach(file -> { + try { + FileUtils.moveFile(new File(file.getAbsolutePath()), new File("C:/test/" + file.getName())); + } catch (IOException e) { + e.printStackTrace(); + } + } + ); + } + + @Test + public void youtubeReaderTest() throws Exception{ + + ClassLoader classLoader = getClass().getClassLoader(); + URL resource = classLoader.getResource("hf_key.txt"); + + File file = new File(resource.toURI());; + + String content = ""; + try { + content += Files.readAllLines(file.toPath(), StandardCharsets.UTF_8); + } catch (IOException e) { + e.printStackTrace(); + } + + String hfKey = content.substring(1, content.length() - 1); + + //CasIOUtils.save(aCas.getCas(), new FileOutputStream(new File("/tmp/audiotest.xmi")), SerialFormat.XMI_1_1); + int iWorkers = 1; + + DUUILuaContext ctx = new DUUILuaContext().withJsonLibrary(); + + // Instanziierung des Composers, mit einigen Parametern + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) // wir überspringen die Verifikation aller Componenten =) + .withLuaContext(ctx) // wir setzen den definierten Kontext + .withWorkers(iWorkers); // wir geben dem Composer eine Anzahl an Threads mit. 
+ + DUUIUIMADriver uima_driver = new DUUIUIMADriver(); + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + + + //DUUIYouTubeReader ytReader = new DUUIYouTubeReader("https://www.youtube.com/@Jules1/videos", "AIzaSyDycLCdJ1_jfkFL-pWnQuf1FzluJbX21Bw"); + //DUUIYouTubeReader ytReader = new DUUIYouTubeReader("https://www.youtube.com/watch?v=SV6NJ6PcGBs&list=PLh19WWr20745LHdlDAg2P_JT7I2Wx6axP", "AIzaSyDycLCdJ1_jfkFL-pWnQuf1FzluJbX21Bw"); + DUUIMultimodalCollectionReader multiReader = new DUUIMultimodalCollectionReader("D:/DUUIVideos/read", "mp4"); + + Set readers = new HashSet<>(); + + //readers.add(ytReader); + readers.add(multiReader); + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(readers); + + // Hinzufügen der einzelnen Driver zum Composer + composer.addDriver(uima_driver, remoteDriver); + + /*composer.add(new DUUIRemoteDriver.Component("http://localhost:9713") // Youtube downloader + .withScale(iWorkers) + .withTargetView("video_view") + .withParameter("withTranscription", "false") + .build()); + + composer.add(new DUUIRemoteDriver.Component("http://localhost:9714") // Video to audio + .withScale(iWorkers) + .withSourceView("video_view") + .withTargetView("audio_view") + .build()); + + composer.add(new DUUIRemoteDriver.Component("http://localhost:9715") // Audio to text + .withScale(iWorkers) + .withSourceView("audio_view") + .withTargetView("text_view") + .withParameter("device", "cuda") + .build());*/ + + composer.add(new DUUIRemoteDriver.Component("http://localhost:9717") // Annotheia + .withScale(iWorkers) + .withTargetView("text_view") + //.withParameter("token", hfKey) + .withParameter("device", "cuda") + .build()); + + /*composer.add(new DUUIRemoteDriver.Component("http://localhost:9720") // Spacy + .withScale(iWorkers) + .withSourceView("text_view") + .withTargetView("text_view") + .withParameter("use_existing_sentences", "false") + .withParameter("use_existing_tokens", "false") + .build());*/ + + composer.add(new 
DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, "C:/test/temp", + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP")) + .build()); + + /*composer.add(new DUUIUIMADriver.Component(createEngineDescription(AudioSegmentWriter.class, + AudioSegmentWriter.PARAM_TARGET_LOCATION, "C:/test", + AudioSegmentWriter.PARAM_AUDIO_CONTENT_VIEW, "audio_view", + AudioSegmentWriter.PARAM_AUDIO_TOKEN_VIEW, "text_view")) + .build()); */ + + + composer.run(processor, "test"); + + + //composer.run(aCas); + } + + @Test + public void multimodalFileReaderTest() throws Exception{ + + + int iWorkers = 1; + + DUUILuaContext ctx = new DUUILuaContext().withJsonLibrary(); + + // Instanziierung des Composers, mit einigen Parametern + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) // wir überspringen die Verifikation aller Componenten =) + .withLuaContext(ctx) // wir setzen den definierten Kontext + .withWorkers(iWorkers); // wir geben dem Composer eine Anzahl an Threads mit. 
+ + DUUIUIMADriver uima_driver = new DUUIUIMADriver(); + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + + + DUUIMultimodalCollectionReader multiReader = new DUUIMultimodalCollectionReader("D:/DUUIVideos/read", "gz"); + + Set readers = new HashSet<>(); + + readers.add(multiReader); + + DUUIAsynchronousProcessor processor = new DUUIAsynchronousProcessor(readers); + + // Hinzufügen der einzelnen Driver zum Composer + composer.addDriver(uima_driver, remoteDriver, dockerDriver); + /* + composer.add(new DUUIRemoteDriver.Component("http://localhost:9713") // Youtube downloader + .withScale(iWorkers) + .withTargetView("video_view") + .withParameter("withTranscription", "false") + .build()); + + composer.add(new DUUIRemoteDriver.Component("http://localhost:9714") // Video to audio + .withScale(iWorkers) + .withSourceView("video_view") + .withTargetView("audio_view") + .build()); + + composer.add(new DUUIRemoteDriver.Component("http://localhost:9715") // Audio to text + .withScale(iWorkers) + .withSourceView("audio_view") + .withTargetView("text_view") + .withParameter("device", "cuda") + .build()); + + composer.add(new DUUIRemoteDriver.Component("http://localhost:9717") // Audio to speaker + .withScale(iWorkers) + .withSourceView("audio_view") + .withTargetView("text_view") + .withParameter("token", hfKey) + .withParameter("device", "cuda") + .build());*/ + + /* composer.add(new DUUIDockerDriver.Component("duui-spacy:latest") // Spacy + .withScale(iWorkers) + .withView("text_view") + .withRunningAfterDestroy(true) + .withParameter("use_existing_sentences", "false") + .withParameter("use_existing_tokens", "false") + .build()); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, "C:/test/temp", + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP")) 
+ .build());*/ + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(AudioSegmentWriter.class, + AudioSegmentWriter.PARAM_TARGET_LOCATION, "C:/test", + //AudioSegmentWriter.PARAM_AUDIO_CONTENT_VIEW, "text_view", + AudioSegmentWriter.PARAM_AUDIO_TOKEN_VIEW, "text_view")) + .build()); + + + composer.run(processor, "test"); + + //composer.run(aCas); + } + + + + @Test + public void multimodalImageCutterTest() throws Exception{ + + JCas aCas = JCasFactory.createJCas(); + + File videoFile = new File("D:/DUUIVideos/read/India_Street.jpg"); + if (videoFile.exists()) { + String encoded = org.apache.commons.codec.binary.Base64.encodeBase64String(org.apache.commons.io.FileUtils.readFileToByteArray(videoFile)); + String mimeType = Files.probeContentType(videoFile.toPath()); + System.out.println(mimeType); + aCas.setSofaDataString(encoded, mimeType); + }else{ + System.out.println(videoFile.getAbsolutePath() + " not found"); + return; + } + + int iWorkers = 1; + + DUUILuaContext ctx = new DUUILuaContext().withJsonLibrary(); + + DUUIComposer composer = new DUUIComposer() + .withSkipVerification(true) // wir überspringen die Verifikation aller Componenten =) + .withLuaContext(ctx) // wir setzen den definierten Kontext + .withWorkers(iWorkers); // wir geben dem Composer eine Anzahl an Threads mit. 
+ + DUUIUIMADriver uima_driver = new DUUIUIMADriver(); + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + DUUIDockerDriver dockerDriver = new DUUIDockerDriver(); + + // Hinzufügen der einzelnen Driver zum Composer + composer.addDriver(uima_driver, remoteDriver, dockerDriver); + + composer.add(new DUUIDockerDriver.Component("duui-yolo:latest") // Image detection + .withScale(iWorkers) + .withRunningAfterDestroy(true) + .build()); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, "C:/test/temp", + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1", + XmiWriter.PARAM_COMPRESSION, "GZIP")) + .build()); + + composer.run(aCas); + + MultimodalUtil.getSubImages(aCas).forEach(file -> { + try { + FileUtils.moveFile(new File(file.getAbsolutePath()), new File("C:/test/" + file.getName())); + } catch (IOException e) { + e.printStackTrace(); + } + }); + + //composer.run(aCas); + } + }