Skip to content

Commit

Permalink
feat (core): Tika 0.8 include text content in RDF, and add EPUB & PDF…
Browse files Browse the repository at this point in the history
… namespaces
  • Loading branch information
vorburger committed Sep 24, 2024
1 parent 8b8ca2a commit 3a56cb1
Show file tree
Hide file tree
Showing 9 changed files with 146 additions and 21 deletions.
18 changes: 12 additions & 6 deletions docs/concepts/tika.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,26 @@ All of these formats are supported e.g. for conversions with [Rosetta](../use/ro

<!-- TODO Markdown?! With links, not just Metadata? -->

<!-- NB: The following commands are not run through ExecMD! Add to test-cli.bash... -->

## HTML

```bash cd .././..
$ ./enola -v rosetta --in test/test.html --out="fd:2?mediaType=text/turtle"
...
```bash
./enola -v rosetta --in test/test.html --out="fd:2?mediaType=text/turtle"
```

This works for remote HTTP as well, of course:

```bash cd .././..
$ ./enola rosetta --http-scheme --in https://docs.enola.dev --out="fd:2?mediaType=text/turtle"
...
```bash
./enola rosetta --http-scheme --in https://docs.enola.dev --out="fd:2?mediaType=text/turtle"
```

## EPUB

Any `*.epub` file (media type `application/epub+zip`) is supported.

## Executable

```bash
./enola -v rosetta --in /usr/lib64/libsane.so.1 --out="fd:2?mediaType=text/turtle"
```
2 changes: 0 additions & 2 deletions java/dev/enola/common/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,9 @@ java_library(
visibility = ["//:__subpackages__"],
deps = [
"//tools/version",
"@maven//:com_google_errorprone_error_prone_annotations",
"@maven//:com_google_guava_guava",
"@maven//:dev_dirs_directories",
"@maven//:org_jspecify_jspecify",
"@maven//:org_slf4j_slf4j_api",
],
)

Expand Down
77 changes: 77 additions & 0 deletions java/dev/enola/common/StringBuilderWriter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2024 The Enola <https://enola.dev> Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dev.enola.common;

import java.io.IOException;
import java.io.Writer;

/**
 * {@link Writer} implementation that outputs to a {@link StringBuilder}.
 *
 * <p>This is an alternative to {@link java.io.StringWriter}, which internally uses a {@link
 * StringBuffer}. Writing to an unsynchronized {@link StringBuilder} is faster, at the expense of
 * this class <b>not</b> being thread-safe, which is often not required.
 *
 * <p>Use {@link #toString()} to obtain everything written so far. {@link #flush()} and {@link
 * #close()} are no-ops, and writing after {@link #close()} is still possible.
 */
public final class StringBuilderWriter extends Writer {

    private final StringBuilder builder = new StringBuilder();

    /** Appends the entire string; a {@code null} argument is appended as {@code "null"}. */
    @Override
    public void write(String str) throws IOException {
        builder.append(str);
    }

    /** Appends all characters of the array. */
    @Override
    public void write(char[] cbuf) throws IOException {
        builder.append(cbuf);
    }

    /** Appends {@code len} characters of the array, starting at index {@code off}. */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException {
        if (len == 0) return;
        builder.append(cbuf, off, len);
    }

    /** Appends {@code len} characters of the string, starting at index {@code off}. */
    @Override
    public void write(String str, int off, int len) throws IOException {
        if (len == 0) return;
        // StringBuilder#append(CharSequence, int, int) takes (start, end), not (offset, length)!
        builder.append(str, off, off + len);
    }

    /** Appends a single character, given by the 16 low-order bits of {@code c}. */
    @Override
    public void write(int c) throws IOException {
        // The cast is intentional; without it, append(int) would append the decimal digits.
        builder.append((char) c);
    }

    /** Appends a single character, and returns this writer to allow chaining. */
    @Override
    public StringBuilderWriter append(char c) throws IOException {
        builder.append(c);
        return this;
    }

    /**
     * Appends the character sequence directly, without the intermediate {@link String} copy which
     * the default {@link Writer#append(CharSequence)} makes. A {@code null} argument is appended
     * as {@code "null"}.
     */
    @Override
    public StringBuilderWriter append(CharSequence csq) throws IOException {
        builder.append(csq);
        return this;
    }

    /**
     * Appends the characters of the sequence from index {@code start} (inclusive) to {@code end}
     * (exclusive) directly, without the intermediate {@code subSequence} copy which the default
     * {@link Writer#append(CharSequence, int, int)} makes.
     *
     * @throws IndexOutOfBoundsException if {@code start} or {@code end} are out of range
     */
    @Override
    public StringBuilderWriter append(CharSequence csq, int start, int end) throws IOException {
        builder.append(csq, start, end);
        return this;
    }

    /** Returns everything that was written to this writer so far. */
    @Override
    public String toString() {
        return builder.toString();
    }

    /** Does nothing; there is no underlying stream to flush. */
    @Override
    public void flush() throws IOException {}

    /** Does nothing; there are no resources to release. */
    @Override
    public void close() throws IOException {}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
public class NamespaceRepositoryEnolaDefaults {

// TODO Replace this with something which reads e.g. //models/enola.dev/namespaces.ttl
// TODO Add human-readable background documentation reference URLs to that TTL, for IRIs which do not resolve to a URL

public static final NamespaceRepository INSTANCE =
new NamespaceRepositoryBuilder()
Expand All @@ -33,7 +34,12 @@ public class NamespaceRepositoryEnolaDefaults {
.store("dcterms", "http://purl.org/dc/terms/")
.store("owl", "http://www.w3.org/2002/07/owl#")
.store("ex", "https://example.org/")
.store("epub", "http://www.idpf.org/2007/ops#") // TODO is it / or /# or # ?!

// EPUB
// TODO Double check whether the correct namespace suffix is / or /# or #
.store("epub", "http://www.idpf.org/2007/ops#")
.store("opf", "http://www.idpf.org/2007/opf#")
.store("calibre", "http://calibre.kovidgoyal.net/2009/metadata")
// https://idpf.github.io/epub-prefixes/packages/
.store("a11y", "http://www.idpf.org/epub/vocab/package/a11y/#")
.store("epubsc", "http://idpf.org/epub/vocab/sc/#")
Expand All @@ -45,5 +51,31 @@ public class NamespaceRepositoryEnolaDefaults {
.store(
"prism",
"http://www.prismstandard.org/specifications/3.0/PRISM_CV_Spec_3.0.htm#")

// PDF, see https://developer.adobe.com/xmp/docs/XMPNamespaces/
// https://developer.adobe.com/xmp/docs/XMPNamespaces/xmp/
.store("xmp", "http://ns.adobe.com/xap/1.0/")
// https://developer.adobe.com/xmp/docs/XMPNamespaces/xmpMM/
.store("xmpMM", "http://ns.adobe.com/xap/1.0/mm/")
// https://developer.adobe.com/xmp/docs/XMPNamespaces/xmpBJ/
.store("xmpBJ", "http://ns.adobe.com/xap/1.0/bj/")
// https://developer.adobe.com/xmp/docs/XMPNamespaces/xmpTPg/
// NB: The namespace URI is not the same as the URL of the documentation page above.
.store("xmpTPg", "http://ns.adobe.com/xap/1.0/t/pg/")
// https://developer.adobe.com/xmp/docs/XMPNamespaces/xmpDM/
.store("xmpDM", "http://ns.adobe.com/xmp/1.0/DynamicMedia/")
// https://developer.adobe.com/xmp/docs/XMPNamespaces/xmpRights/
.store("xmpRights", "http://ns.adobe.com/xap/1.0/rights/")
// https://developer.adobe.com/xmp/docs/XMPNamespaces/pdf/
.store("pdf", "http://ns.adobe.com/pdf/1.3/")
// https://developer.adobe.com/xmp/docs/XMPNamespaces/photoshop/
.store("photoshop", "http://ns.adobe.com/photoshop/1.0/")
// https://developer.adobe.com/xmp/docs/XMPNamespaces/crs/
.store("crs", "http://ns.adobe.com/camera-raw-settings/1.0/")
// http://ns.adobe.com/exif/1.0/
.store("exif", "http://ns.adobe.com/exif/1.0/")
// https://developer.adobe.com/xmp/docs/XMPNamespaces/tiff/
.store("tiff", "http://ns.adobe.com/tiff/1.0/")
// http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/
.store("Iptc4xmpCore", "http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/")
.build();
}
1 change: 1 addition & 0 deletions java/dev/enola/format/tika/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ java_library(
"@maven//:org_apache_tika_tika_parsers_standard_package",
],
deps = [
"//java/dev/enola/common",
"//java/dev/enola/common/convert",
"//java/dev/enola/common/io",
"//java/dev/enola/thing:thing_java",
Expand Down
18 changes: 10 additions & 8 deletions java/dev/enola/format/tika/TikaThingConverter.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import com.google.common.collect.ImmutableSet;

import dev.enola.common.StringBuilderWriter;
import dev.enola.common.convert.ConversionException;
import dev.enola.common.io.iri.URIs;
import dev.enola.common.io.iri.namespace.NamespaceConverter;
Expand All @@ -33,11 +34,12 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.xml.sax.ContentHandler;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.io.IOException;
import java.io.Writer;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
Expand Down Expand Up @@ -68,17 +70,17 @@ public boolean convertInto(ReadableResource resource, ThingsBuilder thingsBuilde
throws ConversionException, IOException {
if (resource.byteSource().isEmpty()) return false;

// TODO Content?
// For debugging, e.g. use:
// BufferedWriter stdOut = new BufferedWriter(new OutputStreamWriter(System.out));
// BodyContentHandler handler = new BodyContentHandler(stdOut);
ContentHandler handler = new DefaultHandler();
Writer sw = new StringBuilderWriter();
BodyContentHandler handler = new BodyContentHandler(sw);

try (var is = resource.byteSource().openBufferedStream()) {
Metadata metadata = new Metadata();
parser.parse(is, handler, metadata);
ParseContext parseContext = new ParseContext();
// TODO How to pass e.g. current Locale from TLC, e.g. for XLS parsing?
parser.parse(is, handler, metadata, parseContext);
var thing = thingsBuilder.get(resource.uri().toString());
convertMetadata(metadata, thing);
thing.set("https://enola.dev/content-as-text", sw.toString());
return true;

} catch (TikaException | SAXException e) {
Expand Down
6 changes: 6 additions & 0 deletions java/dev/enola/format/tika/TikaThingConverterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ public void png() throws IOException {
check("test.png");
}

// TODO @Test public void jpeg() throws IOException {

// TODO @Test public void tiff() throws IOException {

// TODO @Test public void epubEBook() throws IOException {

private void check(String classpath) throws IOException {
var tb = new ThingsBuilder();
var c = new TikaThingConverter(new ClasspathResource.Provider());
Expand Down
6 changes: 5 additions & 1 deletion models/enola.dev/enola.ttl
Original file line number Diff line number Diff line change
Expand Up @@ -160,5 +160,9 @@ enola:rfc a rdf:Property;
rdfs:range xsd:positiveInteger;
schema:url "https://datatracker.ietf.org/doc/rfc{VALUE}"^^enola:IRITemplate.

enola:seeAlso a rdfs:Property;
enola:seeAlso a rdf:Property;
rdfs:subPropertyOf rdfs:seeAlso.

enola:content-as-text a rdf:Property;
rdfs:range xsd:string;
schema:description "The textual content of e.g. a file, or remote resource, such as PDF, HTML page, EPUB; without any metadata, markup, etc.".
5 changes: 2 additions & 3 deletions test-cli.bash
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ set -euox pipefail
# This script tests Enola CLI invocations.
# See also EnolaCLITest

# TODO Add missing documentation for this
./enola get --load test/test.html enola:/inline

# TODO Add missing documentation for this
./enola -v rosetta --in test/test.html --out="fd:2?mediaType=text/turtle"

# PS: Update tika.md with anything (of interest) added here

0 comments on commit 3a56cb1

Please sign in to comment.