Skip to content

Commit

Permalink
feat (core): Introduce XmlThingConverter
Browse files Browse the repository at this point in the history
  • Loading branch information
vorburger committed Sep 28, 2024
1 parent 2b7f151 commit 2f25ea9
Show file tree
Hide file tree
Showing 29 changed files with 538 additions and 15 deletions.
4 changes: 4 additions & 0 deletions ToDo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
label: Enola.dev ToDo
type: https://enola.dev/todo/List
items:
- Update W3C Wiki with links to Enola docs:
links:
- https://www.w3.org/wiki/ConverterToRdf

- Last `null` NPE, for Enola: &lastNPE
links:
- https://github.com/enola-dev/enola/issues/845
Expand Down
1 change: 1 addition & 0 deletions java/dev/enola/common/io/iri/URIs.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public static MediaTypeAndOrCharset getMediaTypeAndCharset(URI uri) {
var queryMap = getQueryMap(uri);
var charsetParameter = queryMap.get(CHARSET);
var mediaTypeParameter = queryMap.get(MEDIA_TYPE.toLowerCase());
if (mediaTypeParameter == null) mediaTypeParameter = queryMap.get(MEDIA_TYPE);
return new MediaTypeAndOrCharset(mediaTypeParameter, charsetParameter);
}

Expand Down
1 change: 1 addition & 0 deletions java/dev/enola/common/xml/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ java_library(
plugins = ["//tools/bazel/java_plugin:autoservice"],
visibility = ["//:__subpackages__"],
deps = [
"//java/dev/enola/common/convert",
"//java/dev/enola/common/io",
"@maven//:com_google_auto_service_auto_service_annotations",
"@maven//:com_google_guava_guava",
Expand Down
3 changes: 3 additions & 0 deletions java/dev/enola/common/xml/XML.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ public static void canonicalize(ReadableResource in, WritableResource out, boole

private static String normalizeXML(InputStream inputStream, boolean format)
throws ParserConfigurationException, IOException, SAXException, TransformerException {

// TODO Use streaming SAX instead of DOM; and break this up... use XmlResourceParser

DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setAttribute(XMLConstants.FEATURE_SECURE_PROCESSING, true); // #security
DocumentBuilder builder = factory.newDocumentBuilder();
Expand Down
51 changes: 51 additions & 0 deletions java/dev/enola/common/xml/XmlResourceParser.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2024 The Enola <https://enola.dev> Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dev.enola.common.xml;

import com.google.common.net.MediaType;

import dev.enola.common.convert.CatchingConverterInto;
import dev.enola.common.io.mediatype.MediaTypes;
import dev.enola.common.io.resource.ReadableResource;

import org.xml.sax.helpers.DefaultHandler;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

/**
 * Parses an XML {@link ReadableResource} with a namespace-aware, non-validating SAX parser and
 * feeds the resulting SAX events into the given {@link DefaultHandler}.
 *
 * <p>The parser is hardened against XML External Entity (XXE) attacks: secure processing is
 * enabled, and external general and parameter entities (which includes the external DTD subset)
 * are not resolved.
 */
public class XmlResourceParser implements CatchingConverterInto<ReadableResource, DefaultHandler> {

    /**
     * Parses {@code from} into {@code into}.
     *
     * @param from the resource to read; only handled if its media type matches XML, as decided
     *     by {@code MediaTypes.normalizedNoParamsEquals} against {@code MediaType.XML_UTF_8}
     * @param into the SAX handler which receives the parsing events
     * @return {@code false} if the resource's media type is not XML (nothing is read); {@code
     *     true} if the resource is empty (handler is never invoked) or was parsed
     * @throws Exception if the parser cannot be configured, or if reading or parsing fails
     */
    @Override
    public boolean convertIntoThrows(ReadableResource from, DefaultHandler into) throws Exception {
        if (!MediaTypes.normalizedNoParamsEquals(from.mediaType(), MediaType.XML_UTF_8)) {
            return false;
        }

        // An empty resource is accepted as-is; a SAX parser would reject it as malformed.
        if (from.byteSource().isEmpty()) {
            return true;
        }

        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setNamespaceAware(true);
        factory.setXIncludeAware(false);
        factory.setValidating(false);

        // #security: The resource content may be untrusted, so prevent XXE & entity expansion.
        factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
        factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);

        SAXParser saxParser = factory.newSAXParser();
        try (var is = from.byteSource().openStream()) {
            saxParser.parse(is, into);
        }
        return true;
    }
}
4 changes: 3 additions & 1 deletion java/dev/enola/core/rosetta/Rosetta.java
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,9 @@ public Rosetta(ResourceProvider rp, Loader loader) {
new GraphvizResourceConverter(loader, new GraphvizGenerator(tmp)),
new GexfResourceConverter(loader, new GexfGenerator(tmp)),
new CharResourceConverter(),
new IdempotentCopyingResourceNonConverter()));
new IdempotentCopyingResourceNonConverter()
// TODO XML... with XmlResourceParser & XMLToThingHandler
));
}

@Override
Expand Down
9 changes: 8 additions & 1 deletion java/dev/enola/data/Repository.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
package dev.enola.data;

import com.google.common.collect.Iterables;
import com.google.common.collect.Streams;

import java.util.stream.Stream;

/**
* Repository is a Provider which, in addition to being able to get a single T given an IRI, can
Expand All @@ -40,6 +43,10 @@ public interface Repository<T> extends ProviderFromIRI<T> {
* implementation?
*/
default Iterable<T> list() {
return Iterables.transform(listIRI(), iri -> get(iri));
return Iterables.transform(listIRI(), this::get);
}

default Stream<T> stream() {
return Streams.stream(listIRI()).map(this::get);
}
}
7 changes: 6 additions & 1 deletion java/dev/enola/datatype/DatatypeRepository.java
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,12 @@ public Iterable<String> listIRI() {

@Override
public @Nullable Datatype<?> get(String iri) {
return TLC.get(DatatypeRepository.class).get(iri);
return TLC.optional(DatatypeRepository.class)
.orElseThrow(
() ->
new IllegalStateException(
"Need DatatypeRepository in TLC for: " + iri))
.get(iri);
}
};
}
1 change: 1 addition & 0 deletions java/dev/enola/format/tika/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ java_library(
"//java/dev/enola/common",
"//java/dev/enola/common/convert",
"//java/dev/enola/common/io",
"//java/dev/enola/format/xml",
"//java/dev/enola/thing:thing_java",
"@maven//:com_google_auto_service_auto_service_annotations",
"@maven//:com_google_errorprone_error_prone_annotations",
Expand Down
13 changes: 9 additions & 4 deletions java/dev/enola/format/tika/TikaThingConverter.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import dev.enola.common.io.iri.namespace.NamespaceRepositoryEnolaDefaults;
import dev.enola.common.io.resource.ReadableResource;
import dev.enola.common.io.resource.ResourceProvider;
import dev.enola.format.xml.XMLToThingHandler;
import dev.enola.thing.Thing;
import dev.enola.thing.io.UriIntoThingConverter;
import dev.enola.thing.repo.ThingsBuilder;
Expand All @@ -36,6 +37,8 @@
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import java.io.IOException;
Expand Down Expand Up @@ -69,19 +72,21 @@ public boolean convertInto(URI from, ThingsBuilder thingsBuilder)
public boolean convertInto(ReadableResource resource, ThingsBuilder thingsBuilder)
throws ConversionException, IOException {
if (resource.byteSource().isEmpty()) return false;
var thingBuilder = thingsBuilder.getBuilder(resource.uri().toString());

var iri = resource.uri().toString();
Writer sw = new StringBuilderWriter();
BodyContentHandler handler = new BodyContentHandler(sw);
var thingsHandler = new XMLToThingHandler(iri, thingBuilder);
ContentHandler handler = new TeeContentHandler(new BodyContentHandler(sw), thingsHandler);

try (var is = resource.byteSource().openBufferedStream()) {
Metadata metadata = new Metadata();
ParseContext parseContext = new ParseContext();
// TODO How to pass e.g. current Locale from TLC, e.g. for XLS parsing?
parser.parse(is, handler, metadata, parseContext);
var thing = thingsBuilder.getBuilder(resource.uri().toString());
convertMetadata(metadata, thing);
convertMetadata(metadata, thingBuilder);
var text = sw.toString().trim();
if (!text.isEmpty()) thing.set("https://enola.dev/content-as-text", text);
if (!text.isEmpty()) thingBuilder.set("https://enola.dev/content-as-text", text);
return true;

} catch (TikaException | SAXException e) {
Expand Down
59 changes: 59 additions & 0 deletions java/dev/enola/format/xml/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: Apache-2.0
#
# Copyright 2023 The Enola <https://enola.dev> Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

load("@rules_java//java:defs.bzl", "java_library")
load("//tools/bazel:junit.bzl", "junit_tests")

# Library target ":xml", built from every *.java file in this package
# except the test sources (*Test.java and Test*.java).
java_library(
name = "xml",
srcs = glob(
["*.java"],
exclude = [
"*Test.java",
"Test*.java",
],
),
# Java compiler plugin from this repository (presumably the AutoService
# annotation processor, going by its label - confirm in //tools/bazel/java_plugin).
plugins = ["//tools/bazel/java_plugin:autoservice"],
# Any *.html files below this package are bundled as resources,
# with the leading "java/" removed from their resource path.
resource_strip_prefix = "java/",
resources = glob(["**/*.html"]),
# Usable from the root package and all of its subpackages.
visibility = ["//:__subpackages__"],
deps = [
"//java/dev/enola/common/convert",
"//java/dev/enola/common/io",
"//java/dev/enola/common/xml",
"//java/dev/enola/thing:thing_java",
"@maven//:com_google_auto_service_auto_service_annotations",
"@maven//:com_google_errorprone_error_prone_annotations",
"@maven//:com_google_errorprone_error_prone_type_annotations",
"@maven//:com_google_guava_guava",
"@maven//:org_jspecify_jspecify",
"@maven//:org_slf4j_slf4j_api",
],
)

# Test target ":tests", declared with the junit_tests macro loaded from
# //tools/bazel:junit.bzl; it covers every *Test.java file in this package
# and depends on the ":xml" library defined in this same BUILD file.
junit_tests(
name = "tests",
srcs = glob([
"*Test.java",
]),
deps = [
":xml",
"//java/dev/enola/common/io",
"//java/dev/enola/thing:thing_java",
"//java/dev/enola/thing/testlib",
"//test",
],
)
135 changes: 135 additions & 0 deletions java/dev/enola/format/xml/XMLToThingHandler.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2024 The Enola <https://enola.dev> Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dev.enola.format.xml;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;

import dev.enola.common.io.iri.namespace.NamespaceRepository;
import dev.enola.common.io.iri.namespace.NamespaceRepositoryBuilder;
import dev.enola.thing.Thing;
import dev.enola.thing.impl.IImmutablePredicatesObjects;
import dev.enola.thing.impl.ImmutablePredicatesObjects;

import org.jspecify.annotations.Nullable;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

import java.util.ArrayDeque;
import java.util.Deque;

/**
* XML SAX {@link ContentHandler} which transforms XML into a {@link dev.enola.thing.Thing}.
*
* <p>This is NOT thread-safe!
*/
public class XMLToThingHandler extends DefaultHandler {

    // TODO Consider implementing this via & through the existing JSON support instead?

    // NOTE(review): The element nesting logic below is work in progress. Tracing it by hand,
    // <a><b/></a> is handled, but e.g. <a><b><c/></b></a> ends in an IllegalStateException at
    // </a> (currentListBuilder was already reset by </b>), and <a><b/><c/></a> attaches the
    // wrong builder. It needs a redesign before being relied upon for arbitrary documents.

    /** IRI of the resource being parsed; used as base for elements without a namespace URI. */
    private final String iri;

    /** Collects all the prefix to namespace URI mappings which the parser reports. */
    private final NamespaceRepositoryBuilder nrb = new NamespaceRepositoryBuilder();

    /** The qName of the most recently <i>started</i> element (it is never reset on end). */
    private @Nullable String previousElementQName = null;

    /**
     * Builders in progress. The builder passed to the constructor is always the FIRST element;
     * builders for nested XML elements are appended to, and removed from, the LAST position.
     */
    private final Deque<IImmutablePredicatesObjects.Builder<IImmutablePredicatesObjects>>
            thingBuilders = new ArrayDeque<>();

    /** List under construction; null once it was consumed by {@link #endElement}. */
    private ImmutableList.@Nullable Builder<IImmutablePredicatesObjects> currentListBuilder;

    /**
     * Creates a handler which writes into a {@link Thing.Builder}.
     *
     * @param iri the IRI of the XML resource; see {@link #XMLToThingHandler(String,
     *     IImmutablePredicatesObjects.Builder)}
     * @param thingBuilder the builder to populate; it is cast (unchecked) to an {@link
     *     IImmutablePredicatesObjects.Builder}
     */
    @SuppressWarnings("unchecked")
    public XMLToThingHandler(String iri, Thing.Builder<?> thingBuilder) {
        this(iri, (IImmutablePredicatesObjects.Builder<IImmutablePredicatesObjects>) thingBuilder);
    }

    /**
     * Creates a handler which writes into the given builder.
     *
     * @param iri the IRI of the XML resource; it is used instead of the namespace URI for
     *     elements which are not in any namespace
     * @param thingBuilder the root builder to populate
     */
    public XMLToThingHandler(
            String iri,
            IImmutablePredicatesObjects.Builder<IImmutablePredicatesObjects> thingBuilder) {
        thingBuilders.addFirst(thingBuilder);
        this.iri = iri;
    }

    /**
     * Returns the namespaces seen so far.
     *
     * @return a repository built from all prefix mappings reported until now
     */
    public NamespaceRepository getNamespaces() {
        return nrb.build();
    }

    @Override
    public void startPrefixMapping(String prefix, String uri) {
        nrb.store(prefix, uri);
    }

    @Override
    public void endPrefixMapping(String prefix) {
        // Ignore.
    }

    /**
     * Builds the predicate IRI for an XML element, as {@code uri + "/" + localName}.
     *
     * @param uri the element's namespace URI; if null or empty, this handler's IRI is used
     * @param localName the element's local name
     * @param qName the element's qualified name; only used in the exception message
     * @return the predicate IRI
     * @throws IllegalStateException if localName is null or empty
     */
    private String iri(String uri, String localName, String qName) {
        if (Strings.isNullOrEmpty(uri)) uri = iri;
        if (Strings.isNullOrEmpty(localName)) throw new IllegalStateException(uri + " " + qName);
        return uri + "/" + localName;
    }

    /**
     * Appends a new nested builder for the element, and starts a new list if the element's
     * qName differs from that of the previously started element.
     *
     * @throws IllegalStateException if qName is null or empty
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) {
        if (Strings.isNullOrEmpty(qName)) throw new IllegalStateException(localName);
        if (!qName.equals(previousElementQName)) {
            // Nested element (a repeated same-level element keeps using the current list)
            currentListBuilder = ImmutableList.builder();
        }
        previousElementQName = qName;

        var nested = ImmutablePredicatesObjects.builder();
        thingBuilders.addLast(nested);

        // TODO Attributes, with #!
    }

    /**
     * When the ending element is not the most recently started one, this wraps the last nested
     * builder into a list, sets that list on the builder before it, and then sets the result
     * on the root builder; otherwise this does nothing.
     *
     * @throws IllegalStateException if qName is null or empty, or if the internal state does
     *     not permit closing this element (see the review note at the top of this class)
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if (Strings.isNullOrEmpty(qName)) throw new IllegalStateException(localName);
        if (qName.equals(previousElementQName)) {
            // Same level element
            // NOOP.
            return;
        }

        // Nested element
        if (currentListBuilder == null) {
            throw new IllegalStateException("No list in progress at end of element: " + qName);
        }
        currentListBuilder.add(thingBuilders.removeLast().build());
        var list = currentListBuilder.build();

        var predicate = iri(uri, localName, qName);
        var nested = thingBuilders.removeLast().set(predicate, list);

        // The first element is the root builder, unless it was itself just removed above.
        var root = thingBuilders.peekFirst();
        if (root == null) {
            throw new IllegalStateException("No builder left to set: " + predicate);
        }
        root.set(predicate, nested.build());
        currentListBuilder = null;
    }

    @Override
    public void characters(char[] ch, int start, int length) {
        // TODO set https://enola.dev/content-as-text ?
    }
}
Loading

0 comments on commit 2f25ea9

Please sign in to comment.