From 3a56cb1f7143452ca75a556a61ed46bc03d8bef2 Mon Sep 17 00:00:00 2001 From: Michael Vorburger Date: Mon, 23 Sep 2024 06:53:35 +0200 Subject: [PATCH] feat (core): Tika 0.8 include text content in RDF, and add EPUB & PDF namespaces --- docs/concepts/tika.md | 18 +++-- java/dev/enola/common/BUILD | 2 - .../dev/enola/common/StringBuilderWriter.java | 77 +++++++++++++++++++ .../NamespaceRepositoryEnolaDefaults.java | 34 +++++++- java/dev/enola/format/tika/BUILD | 1 + .../enola/format/tika/TikaThingConverter.java | 18 +++-- .../format/tika/TikaThingConverterTest.java | 6 ++ models/enola.dev/enola.ttl | 6 +- test-cli.bash | 5 +- 9 files changed, 146 insertions(+), 21 deletions(-) create mode 100644 java/dev/enola/common/StringBuilderWriter.java diff --git a/docs/concepts/tika.md b/docs/concepts/tika.md index b893c6267..ee7f78b0c 100644 --- a/docs/concepts/tika.md +++ b/docs/concepts/tika.md @@ -24,20 +24,26 @@ All of these formats are supported e.g. for conversions with [Rosetta](../use/ro + + ## HTML -```bash cd .././.. -$ ./enola -v rosetta --in test/test.html --out="fd:2?mediaType=text/turtle" -... +```bash +./enola -v rosetta --in test/test.html --out="fd:2?mediaType=text/turtle" ``` This works for remote HTTP as well, of course: -```bash cd .././.. -$ ./enola rosetta --http-scheme --in https://docs.enola.dev --out="fd:2?mediaType=text/turtle" -... +```bash +./enola rosetta --http-scheme --in https://docs.enola.dev --out="fd:2?mediaType=text/turtle" ``` ## EPUB Any `*.epub` (`application/epub+zip`). 
+ +## Executable + +```bash +./enola -v rosetta --in /usr/lib64/libsane.so.1 --out="fd:2?mediaType=text/turtle" +``` diff --git a/java/dev/enola/common/BUILD b/java/dev/enola/common/BUILD index 7b1727cc5..af597b221 100644 --- a/java/dev/enola/common/BUILD +++ b/java/dev/enola/common/BUILD @@ -26,11 +26,9 @@ java_library( visibility = ["//:__subpackages__"], deps = [ "//tools/version", - "@maven//:com_google_errorprone_error_prone_annotations", "@maven//:com_google_guava_guava", "@maven//:dev_dirs_directories", "@maven//:org_jspecify_jspecify", - "@maven//:org_slf4j_slf4j_api", ], ) diff --git a/java/dev/enola/common/StringBuilderWriter.java b/java/dev/enola/common/StringBuilderWriter.java new file mode 100644 index 000000000..bc64e4a93 --- /dev/null +++ b/java/dev/enola/common/StringBuilderWriter.java @@ -0,0 +1,77 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * Copyright 2024 The Enola Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package dev.enola.common; + +import java.io.IOException; +import java.io.Writer; + +/** + * {@link Writer} implementation that outputs to a {@link StringBuilder}. + * + *

This is an alternative to {@link java.io.StringWriter}, which internally uses a {@link + * StringBuffer}. This class is faster (!) - at the expense of not (!) being thread-safe + * - which often is not required. + */ +public final class StringBuilderWriter extends Writer { + + private final StringBuilder builder = new StringBuilder(); + + @Override + public void write(String str) throws IOException { + builder.append(str); + } + + @Override + public void write(char[] cbuf) throws IOException { + builder.append(cbuf); + } + + @Override + public void write(char[] cbuf, int off, int len) throws IOException { + if (len == 0) return; + builder.append(cbuf, off, len); + } + + @Override + public void write(String str, int off, int len) throws IOException { + if (len == 0) return; + builder.append(str, off, off + len); // not off, len! + } + + @Override + public void write(int c) throws IOException { + builder.append((char) c); // sic! + } + + @Override + public Writer append(char c) throws IOException { + builder.append(c); + return this; + } + + @Override + public String toString() { + return builder.toString(); + } + + @Override + public void flush() throws IOException {} + + @Override + public void close() throws IOException {} +} diff --git a/java/dev/enola/common/io/iri/namespace/NamespaceRepositoryEnolaDefaults.java b/java/dev/enola/common/io/iri/namespace/NamespaceRepositoryEnolaDefaults.java index dc00d9d74..658f883e7 100644 --- a/java/dev/enola/common/io/iri/namespace/NamespaceRepositoryEnolaDefaults.java +++ b/java/dev/enola/common/io/iri/namespace/NamespaceRepositoryEnolaDefaults.java @@ -20,6 +20,7 @@ public class NamespaceRepositoryEnolaDefaults { // TODO Replace this with something which reads e.g. 
//models/enola.dev/namespaces.ttl + // Add Human Background Documentation Reference URLs to TTL, where IRI don't resolve to URL public static final NamespaceRepository INSTANCE = new NamespaceRepositoryBuilder() @@ -33,7 +34,12 @@ public class NamespaceRepositoryEnolaDefaults { .store("dcterms", "http://purl.org/dc/terms/") .store("owl", "http://www.w3.org/2002/07/owl#") .store("ex", "https://example.org/") - .store("epub", "http://www.idpf.org/2007/ops#") // TODO is it / or /# or # ?! + + // EPUB + // TODO Double check if correctly it is / or /# or # ?! + .store("epub", "http://www.idpf.org/2007/ops#") + .store("opf", "http://www.idpf.org/2007/opf#") + .store("calibre", "http://calibre.kovidgoyal.net/2009/metadata") // https://idpf.github.io/epub-prefixes/packages/ .store("a11y", "http://www.idpf.org/epub/vocab/package/a11y/#") .store("epubsc", "http://idpf.org/epub/vocab/sc/#") @@ -45,5 +51,31 @@ public class NamespaceRepositoryEnolaDefaults { .store( "prism", "http://www.prismstandard.org/specifications/3.0/PRISM_CV_Spec_3.0.htm#") + + // PDF, see https://developer.adobe.com/xmp/docs/XMPNamespaces/ + // https://developer.adobe.com/xmp/docs/XMPNamespaces/xmp/ + .store("xmp", "http://ns.adobe.com/xap/1.0/") + // https://developer.adobe.com/xmp/docs/XMPNamespaces/xmpMM/ + .store("xmpMM", "http://ns.adobe.com/xap/1.0/mm/") + // https://developer.adobe.com/xmp/docs/XMPNamespaces/xmpBJ/ + .store("xmpBJ", "http://ns.adobe.com/xap/1.0/bj/") + // https://developer.adobe.com/xmp/docs/XMPNamespaces/xmpTPg/ + .store("xmpTPg", "http://ns.adobe.com/xap/1.0/t/pg/") + // https://developer.adobe.com/xmp/docs/XMPNamespaces/xmpDM/ + .store("xmpDM", "http://ns.adobe.com/xmp/1.0/DynamicMedia/") + // https://developer.adobe.com/xmp/docs/XMPNamespaces/xmpRights/ + .store("xmpRights", "http://ns.adobe.com/xap/1.0/rights/") + // https://developer.adobe.com/xmp/docs/XMPNamespaces/pdf/ + .store("pdf", "http://ns.adobe.com/pdf/1.3/") + // 
https://developer.adobe.com/xmp/docs/XMPNamespaces/photoshop/ + .store("photoshop", "http://ns.adobe.com/photoshop/1.0/") + // https://developer.adobe.com/xmp/docs/XMPNamespaces/crs/ + .store("crs", "http://ns.adobe.com/camera-raw-settings/1.0/") + // http://ns.adobe.com/exif/1.0/ + .store("exif", "http://ns.adobe.com/exif/1.0/") + // https://developer.adobe.com/xmp/docs/XMPNamespaces/tiff/ + .store("tiff", "http://ns.adobe.com/tiff/1.0/") + // http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/ + .store("Iptc4xmpCore", "http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/") .build(); } diff --git a/java/dev/enola/format/tika/BUILD b/java/dev/enola/format/tika/BUILD index 60ceb9a6c..f8e9aff8e 100644 --- a/java/dev/enola/format/tika/BUILD +++ b/java/dev/enola/format/tika/BUILD @@ -29,6 +29,7 @@ java_library( "@maven//:org_apache_tika_tika_parsers_standard_package", ], deps = [ + "//java/dev/enola/common", "//java/dev/enola/common/convert", "//java/dev/enola/common/io", "//java/dev/enola/thing:thing_java", diff --git a/java/dev/enola/format/tika/TikaThingConverter.java b/java/dev/enola/format/tika/TikaThingConverter.java index 6fc250b6e..2ff7de5d6 100644 --- a/java/dev/enola/format/tika/TikaThingConverter.java +++ b/java/dev/enola/format/tika/TikaThingConverter.java @@ -19,6 +19,7 @@ import com.google.common.collect.ImmutableSet; +import dev.enola.common.StringBuilderWriter; import dev.enola.common.convert.ConversionException; import dev.enola.common.io.iri.URIs; import dev.enola.common.io.iri.namespace.NamespaceConverter; @@ -33,11 +34,12 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; -import org.xml.sax.ContentHandler; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; import java.io.IOException; +import java.io.Writer; import java.net.URI; import java.util.ArrayList; 
import java.util.HashMap; @@ -68,17 +70,17 @@ public boolean convertInto(ReadableResource resource, ThingsBuilder thingsBuilde throws ConversionException, IOException { if (resource.byteSource().isEmpty()) return false; - // TODO Content? - // For debugging, e.g. use: - // BufferedWriter stdOut = new BufferedWriter(new OutputStreamWriter(System.out)); - // BodyContentHandler handler = new BodyContentHandler(stdOut); - ContentHandler handler = new DefaultHandler(); + Writer sw = new StringBuilderWriter(); + BodyContentHandler handler = new BodyContentHandler(sw); try (var is = resource.byteSource().openBufferedStream()) { Metadata metadata = new Metadata(); - parser.parse(is, handler, metadata); + ParseContext parseContext = new ParseContext(); + // TODO How to pass e.g. current Locale from TLC, e.g. for XLS parsing? + parser.parse(is, handler, metadata, parseContext); var thing = thingsBuilder.get(resource.uri().toString()); convertMetadata(metadata, thing); + thing.set("https://enola.dev/content-as-text", sw.toString()); return true; } catch (TikaException | SAXException e) { diff --git a/java/dev/enola/format/tika/TikaThingConverterTest.java b/java/dev/enola/format/tika/TikaThingConverterTest.java index 2ba0b2a31..e46148018 100644 --- a/java/dev/enola/format/tika/TikaThingConverterTest.java +++ b/java/dev/enola/format/tika/TikaThingConverterTest.java @@ -51,6 +51,12 @@ public void png() throws IOException { check("test.png"); } + // TODO @Test public void jpeg() throws IOException { + + // TODO @Test public void tiff() throws IOException { + + // TODO @Test public void epubEBook() throws IOException { + private void check(String classpath) throws IOException { var tb = new ThingsBuilder(); var c = new TikaThingConverter(new ClasspathResource.Provider()); diff --git a/models/enola.dev/enola.ttl b/models/enola.dev/enola.ttl index 27064404f..53d87824d 100644 --- a/models/enola.dev/enola.ttl +++ b/models/enola.dev/enola.ttl @@ -160,5 +160,9 @@ enola:rfc a 
rdf:Property; rdfs:range xsd:positiveInteger; schema:url "https://datatracker.ietf.org/doc/rfc{VALUE}"^^enola:IRITemplate. -enola:seeAlso a rdfs:Property; +enola:seeAlso a rdf:Property; rdfs:subPropertyOf rdfs:seeAlso. + +enola:content-as-text a rdf:Property; + rdfs:range xsd:string; + schema:description "The textual content of e.g. a file, or remote resource, such as PDF, HTML page, EPUB; without any metadata, markup, etc.". diff --git a/test-cli.bash b/test-cli.bash index ad18e6ddc..74019e730 100755 --- a/test-cli.bash +++ b/test-cli.bash @@ -20,8 +20,7 @@ set -euox pipefail # This script tests Enola CLI invocations. # See also EnolaCLITest -# TODO Add missing documentation for this ./enola get --load test/test.html enola:/inline - -# TODO Add missing documentation for this ./enola -v rosetta --in test/test.html --out="fd:2?mediaType=text/turtle" + +# PS: Update tika.md with anything (of interest) added here