Skip to content

Commit

Permalink
fix (core): Tika 0.7 Mapping Property Names to IRIs
Browse files Browse the repository at this point in the history
  • Loading branch information
vorburger committed Sep 23, 2024
1 parent 3bbb11a commit 8b8ca2a
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,20 @@ public class NamespaceRepositoryEnolaDefaults {
.store("rdfs", "http://www.w3.org/2000/01/rdf-schema#")
.store("foaf", "http://xmlns.com/foaf/0.1/")
.store("dc", "http://purl.org/dc/elements/1.1/")
.store("dcterms", "http://purl.org/dc/terms/")
.store("owl", "http://www.w3.org/2002/07/owl#")
.store("ex", "https://example.org/")
.store("epub", "http://www.idpf.org/2007/ops#") // TODO is it / or /# or # ?!
// https://idpf.github.io/epub-prefixes/packages/
.store("a11y", "http://www.idpf.org/epub/vocab/package/a11y/#")
.store("epubsc", "http://idpf.org/epub/vocab/sc/#")
.store("marc", "http://id.loc.gov/vocabulary/")
.store("media", "http://www.idpf.org/epub/vocab/overlays/#")
.store("onix", "http://www.editeur.org/ONIX/book/codelists/current.html#")
.store("rendition", "http://www.idpf.org/vocab/rendition/#")
.store("msv", "http://www.idpf.org/epub/vocab/structure/magazine/#")
.store(
"prism",
"http://www.prismstandard.org/specifications/3.0/PRISM_CV_Spec_3.0.htm#")
.build();
}
50 changes: 50 additions & 0 deletions java/dev/enola/format/tika/CleanMetadata.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2024 The Enola <https://enola.dev> Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dev.enola.format.tika;

import com.google.common.collect.ImmutableMap;

import dev.enola.thing.KIRI;

import org.jspecify.annotations.Nullable;

/**
 * Rule describing how a single Tika metadata name is cleaned up when it is converted into Thing
 * properties (see {@code TikaThingConverter#convertMetadata}).
 *
 * @param iri IRI of the property under which the metadata value is stored; when {@code null}, the
 *     converter stores nothing for this metadata name (i.e. it is dropped)
 * @param removeNames other metadata names which the converter discards whenever this rule
 *     matches
 */
record CleanMetadata(@Nullable String iri, String... removeNames) {

    /** Creates a rule with no IRI and no names to remove. */
    CleanMetadata() {
        this(null);
    }

    // TODO Read this from a configuration file (in TTL) loaded into the Store

    /**
     * All known clean-up rules, keyed by Tika metadata name.
     *
     * <p>Declared {@code final} so that this shared lookup table cannot be re-assigned.
     */
    static final ImmutableMap<String, CleanMetadata> ALL =
            ImmutableMap.of(
                    "dc:description",
                    new CleanMetadata(KIRI.DC.DESCRIPTION, "description"),
                    "dc:creator",
                    new CleanMetadata(KIRI.DC.CREATOR, "author"),
                    "Content-Language",
                    new CleanMetadata(KIRI.DC.LANGUAGE),
                    "Content-Type",
                    new CleanMetadata(KIRI.E.MEDIA_TYPE),
                    "Content-Encoding",
                    new CleanMetadata(),
                    "viewport",
                    new CleanMetadata(),
                    "generator",
                    new CleanMetadata("https://enola.dev/html/generator"));
}
48 changes: 37 additions & 11 deletions java/dev/enola/format/tika/TikaThingConverter.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@

import dev.enola.common.convert.ConversionException;
import dev.enola.common.io.iri.URIs;
import dev.enola.common.io.iri.namespace.NamespaceConverter;
import dev.enola.common.io.iri.namespace.NamespaceConverterWithRepository;
import dev.enola.common.io.iri.namespace.NamespaceRepositoryEnolaDefaults;
import dev.enola.common.io.resource.ReadableResource;
import dev.enola.common.io.resource.ResourceProvider;
import dev.enola.thing.Thing;
Expand All @@ -30,24 +33,27 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

public class TikaThingConverter implements UriIntoThingConverter {

// SLF4J logger for this class.
private static final Logger LOG = LoggerFactory.getLogger(TikaThingConverter.class);

// One Tika AutoDetectParser instance, shared by all instances of this class (static).
private static final AutoDetectParser parser = new AutoDetectParser();
// The ResourceProvider which was passed to the constructor.
private final ResourceProvider rp;

// Set up in the constructor from NamespaceRepositoryEnolaDefaults.INSTANCE;
// convertMetadata uses it to turn metadata names into IRIs.
private final NamespaceConverter namespaceConverter;

public TikaThingConverter(ResourceProvider resourceProvider) {
this.rp = resourceProvider;
this.namespaceConverter =
new NamespaceConverterWithRepository(NamespaceRepositoryEnolaDefaults.INSTANCE);
}

@Override
Expand Down Expand Up @@ -81,18 +87,38 @@ public boolean convertInto(ReadableResource resource, ThingsBuilder thingsBuilde
}

private void convertMetadata(Metadata metadata, Thing.Builder<?> thing) {
// TODO Do better IRI conversions of some well-known names
// ...
final var properties = new HashMap<String, Object>();
final var names = new ArrayList<>(List.of(metadata.names()));
while (!names.isEmpty()) {
final var name = names.remove(0);
if (name.startsWith("X-TIKA")) continue;

// Fallback
for (var name : metadata.names()) {
var value =
final var value =
metadata.isMultiValued(name)
? ImmutableSet.copyOf(metadata.getValues(name))
: metadata.get(name);

var predicate = "https://enola.dev/tika/" + URIs.encode(name);
thing.set(predicate, value);
final var toClean = CleanMetadata.ALL.get(name);
if (toClean != null) {
var iri = toClean.iri();
if (iri != null) properties.put(iri, value);

var removeNames = List.of(toClean.removeNames());
for (var removeName : removeNames) {
names.remove(removeName);
properties.remove(tikaMetadataNameToEnolaIRI(removeName));
}

} else {
var iri = namespaceConverter.toIRI(name);
if (!iri.equals(name)) properties.put(iri, value);
else properties.put(tikaMetadataNameToEnolaIRI(name), value);
}
}
properties.forEach(thing::set);
}

/**
 * Builds the fallback property IRI for a Tika metadata name.
 *
 * @param name the Tika metadata name
 * @return {@code https://enola.dev/tika/} followed by the result of {@code URIs.encode(name)}
 */
private String tikaMetadataNameToEnolaIRI(String name) {
    final var encodedName = URIs.encode(name);
    return "https://enola.dev/tika/" + encodedName;
}
}
5 changes: 5 additions & 0 deletions java/dev/enola/thing/KIRI.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ public static final class E {
*/
public static final String EMOJI = NS + "emoji";

public static final String MEDIA_TYPE = NS + "mediaType";

/**
* URI of what something is 'based on', e.g. where it 'comes from' (source), such as where
* e.g. a Thing was originally "loaded" from. This may be a list.
Expand Down Expand Up @@ -131,6 +133,9 @@ public static final class DC {
public static final String TITLE = NS + "title";
public static final String DESCRIPTION = NS + "description";

public static final String CREATOR = NS + "creator";
public static final String LANGUAGE = NS + "language";

private DC() {}
}

Expand Down

0 comments on commit 8b8ca2a

Please sign in to comment.