From 8b8ca2af1a19775680ef72d817ec46353cbf28ce Mon Sep 17 00:00:00 2001
From: Michael Vorburger
Date: Mon, 23 Sep 2024 02:02:28 +0200
Subject: [PATCH] fix (core): Tika 0.7 Mapping Property Names to IRIs

---
Review notes (free-form commentary; `git am` discards everything between the
three-dash line above and the first file diff below):

* Line structure was rebuilt from a copy whose newlines had been collapsed.
  The added/removed/context line counts of every hunk were re-checked against
  the `@@` headers and the diffstat, but indentation and the exact position
  of blank context lines are a best guess -- TODO confirm against commit
  8b8ca2af before applying.
* The type arguments in `ImmutableMap<String, CleanMetadata>` and
  `HashMap<String, Object>` were missing in that copy and are restored here.
  With a raw ImmutableMap, `CleanMetadata.ALL.get(name)` is an Object and
  `toClean.iri()` does not compile. `Object` as the HashMap value type is
  inferred from the values that are put (a String or an ImmutableSet) --
  TODO confirm.
* The e-mail address on the `From:` header was already missing and has not
  been reconstructed.

What the change does, as far as the hunks below show:

* NamespaceRepositoryEnolaDefaults: adds `.store(prefix, IRI)` entries for
  dcterms, epub, a11y, epubsc, marc, media, onix, rendition, msv and prism.
* CleanMetadata (new record): pairs an optional IRI with the names of other
  metadata entries to remove; `ALL` is keyed by Tika metadata name.
* TikaThingConverter.convertMetadata: skips names starting with "X-TIKA";
  for names found in CleanMetadata.ALL it stores the value under the mapped
  IRI (nothing is stored when that IRI is null) and removes the listed
  names; for all other names it uses namespaceConverter.toIRI(name) when the
  result differs from the name, else "https://enola.dev/tika/" + the encoded
  name. Properties are collected in a map and set on the Thing at the end.
* KIRI: adds E.MEDIA_TYPE, DC.CREATOR and DC.LANGUAGE.

 .../NamespaceRepositoryEnolaDefaults.java     | 13 +++++
 java/dev/enola/format/tika/CleanMetadata.java | 50 +++++++++++++++++++
 .../enola/format/tika/TikaThingConverter.java | 48 ++++++++++++++----
 java/dev/enola/thing/KIRI.java                |  5 ++
 4 files changed, 105 insertions(+), 11 deletions(-)
 create mode 100644 java/dev/enola/format/tika/CleanMetadata.java

diff --git a/java/dev/enola/common/io/iri/namespace/NamespaceRepositoryEnolaDefaults.java b/java/dev/enola/common/io/iri/namespace/NamespaceRepositoryEnolaDefaults.java
index c8d21cffe..dc00d9d74 100644
--- a/java/dev/enola/common/io/iri/namespace/NamespaceRepositoryEnolaDefaults.java
+++ b/java/dev/enola/common/io/iri/namespace/NamespaceRepositoryEnolaDefaults.java
@@ -30,7 +30,20 @@ public class NamespaceRepositoryEnolaDefaults {
                     .store("rdfs", "http://www.w3.org/2000/01/rdf-schema#")
                     .store("foaf", "http://xmlns.com/foaf/0.1/")
                     .store("dc", "http://purl.org/dc/elements/1.1/")
+                    .store("dcterms", "http://purl.org/dc/terms/")
                     .store("owl", "http://www.w3.org/2002/07/owl#")
                     .store("ex", "https://example.org/")
+                    .store("epub", "http://www.idpf.org/2007/ops#") // TODO is it / or /# or # ?!
+                    // https://idpf.github.io/epub-prefixes/packages/
+                    .store("a11y", "http://www.idpf.org/epub/vocab/package/a11y/#")
+                    .store("epubsc", "http://idpf.org/epub/vocab/sc/#")
+                    .store("marc", "http://id.loc.gov/vocabulary/")
+                    .store("media", "http://www.idpf.org/epub/vocab/overlays/#")
+                    .store("onix", "http://www.editeur.org/ONIX/book/codelists/current.html#")
+                    .store("rendition", "http://www.idpf.org/vocab/rendition/#")
+                    .store("msv", "http://www.idpf.org/epub/vocab/structure/magazine/#")
+                    .store(
+                            "prism",
+                            "http://www.prismstandard.org/specifications/3.0/PRISM_CV_Spec_3.0.htm#")
                     .build();
 }
diff --git a/java/dev/enola/format/tika/CleanMetadata.java b/java/dev/enola/format/tika/CleanMetadata.java
new file mode 100644
index 000000000..efd6679fa
--- /dev/null
+++ b/java/dev/enola/format/tika/CleanMetadata.java
@@ -0,0 +1,50 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Copyright 2024 The Enola Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package dev.enola.format.tika;
+
+import com.google.common.collect.ImmutableMap;
+
+import dev.enola.thing.KIRI;
+
+import org.jspecify.annotations.Nullable;
+
+record CleanMetadata(@Nullable String iri, String... removeNames) {
+
+    CleanMetadata() {
+        this(null);
+    }
+
+    // TODO Read this from a configuration file (in TTL) loaded into the Store
+
+    static ImmutableMap<String, CleanMetadata> ALL =
+            ImmutableMap.of(
+                    "dc:description",
+                    new CleanMetadata(KIRI.DC.DESCRIPTION, "description"),
+                    "dc:creator",
+                    new CleanMetadata(KIRI.DC.CREATOR, "author"),
+                    "Content-Language",
+                    new CleanMetadata(KIRI.DC.LANGUAGE),
+                    "Content-Type",
+                    new CleanMetadata(KIRI.E.MEDIA_TYPE),
+                    "Content-Encoding",
+                    new CleanMetadata(),
+                    "viewport",
+                    new CleanMetadata(),
+                    "generator",
+                    new CleanMetadata("https://enola.dev/html/generator"));
+}
diff --git a/java/dev/enola/format/tika/TikaThingConverter.java b/java/dev/enola/format/tika/TikaThingConverter.java
index 3679daf0a..6fc250b6e 100644
--- a/java/dev/enola/format/tika/TikaThingConverter.java
+++ b/java/dev/enola/format/tika/TikaThingConverter.java
@@ -21,6 +21,9 @@
 
 import dev.enola.common.convert.ConversionException;
 import dev.enola.common.io.iri.URIs;
+import dev.enola.common.io.iri.namespace.NamespaceConverter;
+import dev.enola.common.io.iri.namespace.NamespaceConverterWithRepository;
+import dev.enola.common.io.iri.namespace.NamespaceRepositoryEnolaDefaults;
 import dev.enola.common.io.resource.ReadableResource;
 import dev.enola.common.io.resource.ResourceProvider;
 import dev.enola.thing.Thing;
@@ -30,24 +33,27 @@
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 import java.io.IOException;
 import java.net.URI;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
 
 public class TikaThingConverter implements UriIntoThingConverter {
-    private static final Logger LOG = LoggerFactory.getLogger(TikaThingConverter.class);
-
     private static final AutoDetectParser parser = new AutoDetectParser();
 
     private final ResourceProvider rp;
 
+    private final NamespaceConverter namespaceConverter;
+
     public TikaThingConverter(ResourceProvider resourceProvider) {
         this.rp = resourceProvider;
+        this.namespaceConverter =
+                new NamespaceConverterWithRepository(NamespaceRepositoryEnolaDefaults.INSTANCE);
     }
 
     @Override
@@ -81,18 +87,38 @@ public boolean convertInto(ReadableResource resource, ThingsBuilder thingsBuilde
     }
 
     private void convertMetadata(Metadata metadata, Thing.Builder thing) {
-        // TODO Do better IRI conversions of some well-known names
-        // ...
+        final var properties = new HashMap<String, Object>();
+        final var names = new ArrayList<>(List.of(metadata.names()));
+        while (!names.isEmpty()) {
+            final var name = names.remove(0);
+            if (name.startsWith("X-TIKA")) continue;
 
-        // Fallback
-        for (var name : metadata.names()) {
-            var value =
+            final var value =
                     metadata.isMultiValued(name)
                             ? ImmutableSet.copyOf(metadata.getValues(name))
                             : metadata.get(name);
 
-            var predicate = "https://enola.dev/tika/" + URIs.encode(name);
-            thing.set(predicate, value);
+            final var toClean = CleanMetadata.ALL.get(name);
+            if (toClean != null) {
+                var iri = toClean.iri();
+                if (iri != null) properties.put(iri, value);
+
+                var removeNames = List.of(toClean.removeNames());
+                for (var removeName : removeNames) {
+                    names.remove(removeName);
+                    properties.remove(tikaMetadataNameToEnolaIRI(removeName));
+                }
+
+            } else {
+                var iri = namespaceConverter.toIRI(name);
+                if (!iri.equals(name)) properties.put(iri, value);
+                else properties.put(tikaMetadataNameToEnolaIRI(name), value);
+            }
         }
+        properties.forEach(thing::set);
+    }
+
+    private String tikaMetadataNameToEnolaIRI(String name) {
+        return "https://enola.dev/tika/" + URIs.encode(name);
     }
 }
diff --git a/java/dev/enola/thing/KIRI.java b/java/dev/enola/thing/KIRI.java
index ee3684863..ed3358204 100644
--- a/java/dev/enola/thing/KIRI.java
+++ b/java/dev/enola/thing/KIRI.java
@@ -56,6 +56,8 @@ public static final class E {
          */
         public static final String EMOJI = NS + "emoji";
 
+        public static final String MEDIA_TYPE = NS + "mediaType";
+
         /**
          * URI of what something is 'based on', e.g. where it 'comes from' (source), such as where
          * e.g. a Thing was originally "loaded" from. This may be a list.
@@ -131,6 +133,9 @@ public static final class DC {
         public static final String TITLE = NS + "title";
         public static final String DESCRIPTION = NS + "description";
 
+        public static final String CREATOR = NS + "creator";
+        public static final String LANGUAGE = NS + "language";
+
         private DC() {}
     }
 