Skip to content

Commit

Permalink
feat (core): Introduce XmlThingConverter
Browse files Browse the repository at this point in the history
  • Loading branch information
vorburger committed Sep 28, 2024
1 parent 626a42f commit c763dd6
Show file tree
Hide file tree
Showing 38 changed files with 660 additions and 25 deletions.
4 changes: 4 additions & 0 deletions ToDo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
label: Enola.dev ToDo
type: https://enola.dev/todo/List
items:
- Update W3C Wiki with links to Enola docs:
links:
- https://www.w3.org/wiki/ConverterToRdf

- Last `null` NPE, for Enola: &lastNPE
links:
- https://github.com/enola-dev/enola/issues/845
Expand Down
1 change: 1 addition & 0 deletions java/dev/enola/cli/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ java_binary(
"//java/dev/enola/data",
"//java/dev/enola/datatype",
"//java/dev/enola/format/tika",
"//java/dev/enola/format/xml",
"//java/dev/enola/model",
"//java/dev/enola/rdf/io",
"//java/dev/enola/thing:thing_java",
Expand Down
2 changes: 2 additions & 0 deletions java/dev/enola/cli/CommandWithResourceProviderAndLoader.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package dev.enola.cli;

import dev.enola.format.tika.TikaThingConverter;
import dev.enola.format.xml.XmlThingConverter;
import dev.enola.model.enola.files.FileThingConverter;
import dev.enola.rdf.io.RdfResourceIntoThingConverter;
import dev.enola.thing.io.Loader;
Expand Down Expand Up @@ -53,6 +54,7 @@ public abstract class CommandWithResourceProviderAndLoader extends CommandWithRe
protected Loader loader() {
var uriIntoThingConverters = new ArrayList<UriIntoThingConverter>(2);
uriIntoThingConverters.add(new RdfResourceIntoThingConverter<>());
uriIntoThingConverters.add(new XmlThingConverter(rp));
if (fileLoader) uriIntoThingConverters.add(new FileThingConverter());
if (tikaLoader) uriIntoThingConverters.add(new TikaThingConverter(rp));

Expand Down
6 changes: 4 additions & 2 deletions java/dev/enola/cli/EnolaCLI.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,10 @@ public int handleExecutionException(
cmd.getErr().print(cmd.getColorScheme().optionText(intro));
Throwable e = ex;
while (e != null) {
var msg = /*e.getClass().getSimpleName() + ": " + */ e.getMessage();
cmd.getErr().println(cmd.getColorScheme().errorText(msg));
var type = e.getClass().getSimpleName();
var msg = e.getMessage();
var full = type + (msg != null ? ": " + msg : "");
cmd.getErr().println(cmd.getColorScheme().errorText(full));
e = e.getCause();
if (e != null) {
cmd.getErr().print("caused by: ");
Expand Down
1 change: 1 addition & 0 deletions java/dev/enola/common/io/iri/URIs.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public static MediaTypeAndOrCharset getMediaTypeAndCharset(URI uri) {
var queryMap = getQueryMap(uri);
var charsetParameter = queryMap.get(CHARSET);
var mediaTypeParameter = queryMap.get(MEDIA_TYPE.toLowerCase());
if (mediaTypeParameter == null) mediaTypeParameter = queryMap.get(MEDIA_TYPE);
return new MediaTypeAndOrCharset(mediaTypeParameter, charsetParameter);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@
*/
package dev.enola.common.io.mediatype;

import static com.google.common.net.MediaType.JSON_UTF_8;
import static com.google.common.net.MediaType.OCTET_STREAM;
import static com.google.common.net.MediaType.PLAIN_TEXT_UTF_8;
import static com.google.common.net.MediaType.*;
import static com.google.common.truth.Truth.assertThat;

import static dev.enola.common.io.mediatype.YamlMediaType.YAML_UTF_8;
Expand Down
1 change: 1 addition & 0 deletions java/dev/enola/common/xml/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ java_library(
plugins = ["//tools/bazel/java_plugin:autoservice"],
visibility = ["//:__subpackages__"],
deps = [
"//java/dev/enola/common/convert",
"//java/dev/enola/common/io",
"@maven//:com_google_auto_service_auto_service_annotations",
"@maven//:com_google_guava_guava",
Expand Down
3 changes: 3 additions & 0 deletions java/dev/enola/common/xml/XML.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ public static void canonicalize(ReadableResource in, WritableResource out, boole

private static String normalizeXML(InputStream inputStream, boolean format)
throws ParserConfigurationException, IOException, SAXException, TransformerException {

// TODO Use streaming SAX instead of DOM; and break this up... use XmlResourceParser

DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setAttribute(XMLConstants.FEATURE_SECURE_PROCESSING, true); // #security
DocumentBuilder builder = factory.newDocumentBuilder();
Expand Down
13 changes: 13 additions & 0 deletions java/dev/enola/common/xml/XmlMediaTypeTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

import org.junit.Test;

import java.net.URI;

public class XmlMediaTypeTest {

@Test
Expand All @@ -36,4 +38,15 @@ public void xmlMediaType() {
var mediaType = resource.mediaType();
assertThat(normalizedNoParamsEquals(resource.mediaType(), MediaType.XML_UTF_8)).isTrue();
}

@Test
public void testStrangeBug() {
    var provider = new ClasspathResource.Provider();

    // Baseline: this resource has always been detected as XML.
    var nestedUri = URI.create("classpath:/greeting1-nested.xml");
    var nestedMediaType = provider.getReadableResource(nestedUri).mediaType();
    assertThat(normalizedNoParamsEquals(nestedMediaType, MediaType.XML_UTF_8)).isTrue();

    // Regression check: this file used to start with an <!-- comment, without an <?xml
    // declaration, and was then detected as HTML instead of XML.
    var attributeUri = URI.create("classpath:/greeting1-attribute.xml");
    var attributeMediaType = provider.getReadableResource(attributeUri).mediaType();
    assertThat(normalizedNoParamsEquals(attributeMediaType, MediaType.XML_UTF_8)).isTrue();
}
}
51 changes: 51 additions & 0 deletions java/dev/enola/common/xml/XmlResourceParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2024 The Enola <https://enola.dev> Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dev.enola.common.xml;

import com.google.common.net.MediaType;

import dev.enola.common.convert.CatchingConverterInto;
import dev.enola.common.io.mediatype.MediaTypes;
import dev.enola.common.io.resource.ReadableResource;

import org.xml.sax.helpers.DefaultHandler;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

/**
 * Parses an XML {@link ReadableResource} with a SAX parser, pushing the parse events into the
 * given {@link DefaultHandler}.
 *
 * <p>Resources whose media type is not XML are declined (the conversion returns {@code false}).
 * Empty XML resources are accepted without invoking the parser at all.
 *
 * <p>The parser is hardened against XXE and entity expansion attacks, because the resources
 * parsed here may come from arbitrary locations.
 */
public class XmlResourceParser implements CatchingConverterInto<ReadableResource, DefaultHandler> {

    /**
     * Parses {@code from} and feeds the resulting SAX events into {@code into}.
     *
     * @param from the resource to read; only parsed if its media type is XML
     * @param into the SAX handler which receives the parse events
     * @return {@code false} if the resource is not XML (nothing was done), {@code true} if it
     *     was XML (including when it was empty, in which case the handler is never called)
     * @throws Exception if the parser cannot be configured, or reading or parsing fails
     */
    @Override
    public boolean convertIntoThrows(ReadableResource from, DefaultHandler into) throws Exception {
        if (!MediaTypes.normalizedNoParamsEquals(from.mediaType(), MediaType.XML_UTF_8)) {
            return false;
        }

        // An empty document is not well-formed XML, so the parser would reject it; skip it.
        if (from.byteSource().isEmpty()) {
            return true;
        }

        SAXParser saxParser = newSecureFactory().newSAXParser();
        try (var is = from.byteSource().openStream()) {
            saxParser.parse(is, into);
        }
        return true;
    }

    /** Creates a namespace-aware, non-validating SAX parser factory hardened against XXE. */
    private static SAXParserFactory newSecureFactory()
            throws javax.xml.parsers.ParserConfigurationException, org.xml.sax.SAXException {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setNamespaceAware(true);
        factory.setXIncludeAware(false);
        factory.setValidating(false);

        // #security: Enforce the JAXP processing limits (e.g. on entity expansion).
        factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
        // #security: Never resolve external entities or the external DTD subset (XXE). These
        // are standard SAX2 feature names, so any compliant parser recognizes them.
        factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        return factory;
    }
}
4 changes: 3 additions & 1 deletion java/dev/enola/core/rosetta/Rosetta.java
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,9 @@ public Rosetta(ResourceProvider rp, Loader loader) {
new GraphvizResourceConverter(loader, new GraphvizGenerator(tmp)),
new GexfResourceConverter(loader, new GexfGenerator(tmp)),
new CharResourceConverter(),
new IdempotentCopyingResourceNonConverter()));
new IdempotentCopyingResourceNonConverter()
// TODO XML... with XmlResourceParser & XMLToThingHandler
));
}

@Override
Expand Down
9 changes: 8 additions & 1 deletion java/dev/enola/data/Repository.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
package dev.enola.data;

import com.google.common.collect.Iterables;
import com.google.common.collect.Streams;

import java.util.stream.Stream;

/**
* Repository is a Provider which, in addition to being able to get a single T given an IRI, can
Expand All @@ -40,6 +43,10 @@ public interface Repository<T> extends ProviderFromIRI<T> {
* implementation?
*/
default Iterable<T> list() {
return Iterables.transform(listIRI(), iri -> get(iri));
return Iterables.transform(listIRI(), this::get);
}

/**
 * Returns a {@link Stream} of the T of this repository.
 *
 * <p>This default implementation streams the IRIs from {@code listIRI()} and maps each of them
 * through {@code get}.
 *
 * @return a stream with one element per IRI returned by {@code listIRI()}
 */
default Stream<T> stream() {
return Streams.stream(listIRI()).map(this::get);
}
}
7 changes: 6 additions & 1 deletion java/dev/enola/datatype/DatatypeRepository.java
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,12 @@ public Iterable<String> listIRI() {

@Override
public @Nullable Datatype<?> get(String iri) {
return TLC.get(DatatypeRepository.class).get(iri);
return TLC.optional(DatatypeRepository.class)
.orElseThrow(
() ->
new IllegalStateException(
"Need DatatypeRepository in TLC for: " + iri))
.get(iri);
}
};
}
1 change: 1 addition & 0 deletions java/dev/enola/format/tika/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ java_library(
"//java/dev/enola/common",
"//java/dev/enola/common/convert",
"//java/dev/enola/common/io",
"//java/dev/enola/format/xml",
"//java/dev/enola/thing:thing_java",
"@maven//:com_google_auto_service_auto_service_annotations",
"@maven//:com_google_errorprone_error_prone_annotations",
Expand Down
24 changes: 20 additions & 4 deletions java/dev/enola/format/tika/TikaThingConverter.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package dev.enola.format.tika;

import com.google.common.collect.ImmutableSet;
import com.google.common.net.MediaType;

import dev.enola.common.StringBuilderWriter;
import dev.enola.common.convert.ConversionException;
Expand All @@ -36,6 +37,8 @@
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import java.io.IOException;
Expand All @@ -44,9 +47,17 @@
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Set;

public class TikaThingConverter implements UriIntoThingConverter {

// Media types (compared without parameters) for which convertInto(ReadableResource, ...)
// returns false without invoking Tika: the XML and Turtle variants listed here.
private static final Set<MediaType> IGNORED =
ImmutableSet.of(
MediaType.XML_UTF_8.withoutParameters(),
MediaType.parse("application/xml"),
MediaType.parse("text/turtle"),
MediaType.parse("application/x-turtle"));

private static final AutoDetectParser parser = new AutoDetectParser();
private final ResourceProvider rp;

Expand All @@ -69,19 +80,24 @@ public boolean convertInto(URI from, ThingsBuilder thingsBuilder)
public boolean convertInto(ReadableResource resource, ThingsBuilder thingsBuilder)
throws ConversionException, IOException {
if (resource.byteSource().isEmpty()) return false;
// TODO Improve detection of when Tika can actually process content...
if (IGNORED.contains(resource.mediaType().withoutParameters())) return false;

var thingBuilder = thingsBuilder.getBuilder(resource.uri().toString());
var iri = resource.uri().toString();
Writer sw = new StringBuilderWriter();
BodyContentHandler handler = new BodyContentHandler(sw);
// var thingsHandler = new XMLToThingHandler(iri, thingBuilder);
ContentHandler handler =
new TeeContentHandler(new BodyContentHandler(sw) /* TODO, thingsHandler*/);

try (var is = resource.byteSource().openBufferedStream()) {
Metadata metadata = new Metadata();
ParseContext parseContext = new ParseContext();
// TODO How to pass e.g. current Locale from TLC, e.g. for XLS parsing?
parser.parse(is, handler, metadata, parseContext);
var thing = thingsBuilder.getBuilder(resource.uri().toString());
convertMetadata(metadata, thing);
convertMetadata(metadata, thingBuilder);
var text = sw.toString().trim();
if (!text.isEmpty()) thing.set("https://enola.dev/content-as-text", text);
if (!text.isEmpty()) thingBuilder.set("https://enola.dev/content-as-text", text);
return true;

} catch (TikaException | SAXException e) {
Expand Down
61 changes: 61 additions & 0 deletions java/dev/enola/format/xml/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# SPDX-License-Identifier: Apache-2.0
#
# Copyright 2023 The Enola <https://enola.dev> Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

load("@rules_java//java:defs.bzl", "java_library")
load("//tools/bazel:junit.bzl", "junit_tests")

# Library with all non-test Java sources of this package.
java_library(
name = "xml",
# All *.java files here, except for test sources (*Test.java and Test*.java).
srcs = glob(
["*.java"],
exclude = [
"*Test.java",
"Test*.java",
],
),
plugins = ["//tools/bazel/java_plugin:autoservice"],
resource_strip_prefix = "java/",
# NOTE(review): This bundles *.html files as resources; confirm that this package
# actually contains any (this may have been copied from another BUILD file).
resources = glob(["**/*.html"]),
visibility = ["//:__subpackages__"],
deps = [
"//java/dev/enola/common/context",
"//java/dev/enola/common/convert",
"//java/dev/enola/common/io",
"//java/dev/enola/common/xml",
"//java/dev/enola/thing:thing_java",
"@maven//:com_google_auto_service_auto_service_annotations",
"@maven//:com_google_errorprone_error_prone_annotations",
"@maven//:com_google_errorprone_error_prone_type_annotations",
"@maven//:com_google_guava_guava",
"@maven//:org_jspecify_jspecify",
"@maven//:org_slf4j_slf4j_api",
],
)

# JUnit tests of this package, built from all *Test.java sources and run against ":xml".
junit_tests(
name = "tests",
srcs = glob([
"*Test.java",
]),
deps = [
":xml",
"//java/dev/enola/common/context",
"//java/dev/enola/common/io",
"//java/dev/enola/thing:thing_java",
"//java/dev/enola/thing/testlib",
"//test",
],
)
Loading

0 comments on commit c763dd6

Please sign in to comment.