diff --git a/metafacture-html/build.gradle b/metafacture-html/build.gradle new file mode 100644 index 000000000..e8c8ab2f0 --- /dev/null +++ b/metafacture-html/build.gradle @@ -0,0 +1,30 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +ext.mavenName = 'Metafacture HTML' +description = 'Modules for processing HTML documents' + +dependencies { + api project(':metafacture-framework') + implementation project(':metafacture-commons') + implementation 'org.slf4j:slf4j-api:1.7.21' + implementation 'org.apache.commons:commons-compress:1.12' + implementation 'commons-io:commons-io:2.6' + implementation 'org.jsoup:jsoup:1.12.1' + testImplementation 'junit:junit:4.12' + testImplementation 'org.mockito:mockito-core:2.5.5' + testRuntimeOnly 'org.slf4j:slf4j-simple:1.7.21' +} diff --git a/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java b/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java new file mode 100644 index 000000000..7a3719910 --- /dev/null +++ b/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java @@ -0,0 +1,78 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.metafacture.html; + +import java.io.IOException; +import java.io.Reader; +import java.util.UUID; + +import org.apache.commons.io.IOUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Attributes; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.metafacture.framework.FluxCommand; +import org.metafacture.framework.StreamReceiver; +import org.metafacture.framework.annotations.Description; +import org.metafacture.framework.annotations.In; +import org.metafacture.framework.annotations.Out; +import org.metafacture.framework.helpers.DefaultObjectPipe; + +/** + * Decode HTML to metadata events. Each input document represents one record. + * + * @author Fabian Steeg (fsteeg) + * + */ +@Description("Decode HTML to metadata events") +@In(Reader.class) +@Out(StreamReceiver.class) +@FluxCommand("decode-html") +public class HtmlDecoder extends DefaultObjectPipe { + + @Override + public void process(final Reader reader) { + try { + StreamReceiver receiver = getReceiver(); + receiver.startRecord(UUID.randomUUID().toString()); + Document document = Jsoup.parse(IOUtils.toString(reader)); + process(document, receiver); + receiver.endRecord(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + private void process(Element parent, StreamReceiver receiver) { + for (Element element : parent.children()) { + receiver.startEntity(element.nodeName()); + Attributes attributes = element.attributes(); + for (Attribute attribute : attributes) { + receiver.literal(attribute.getKey(), attribute.getValue()); + } + if (element.children().isEmpty()) { + String text = element.text().trim(); + String value = text.isEmpty() ? element.data() : text; + if (!value.isEmpty()) { + receiver.literal("value", value); + } + } + process(element, receiver); + receiver.endEntity(); + } + } +} diff --git a/metafacture-html/src/main/java/org/metafacture/html/ScriptExtractor.java b/metafacture-html/src/main/java/org/metafacture/html/ScriptExtractor.java new file mode 100644 index 000000000..fdcecbfcb --- /dev/null +++ b/metafacture-html/src/main/java/org/metafacture/html/ScriptExtractor.java @@ -0,0 +1,52 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.metafacture.html; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.commons.io.IOUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.metafacture.framework.FluxCommand; +import org.metafacture.framework.ObjectReceiver; +import org.metafacture.framework.annotations.Description; +import org.metafacture.framework.annotations.In; +import org.metafacture.framework.annotations.Out; +import org.metafacture.framework.helpers.DefaultObjectPipe; + +/** + * Extracts the first script from an HTML document + * + * @author Fabian Steeg + */ +@Description("Extracts the first script from an HTML document") +@In(Reader.class) +@Out(String.class) +@FluxCommand("extract-script") +public class ScriptExtractor extends DefaultObjectPipe> { + @Override + public void process(final Reader reader) { + try { + Document document = Jsoup.parse(IOUtils.toString(reader)); + Element firstScript = document.select("script").first(); + getReceiver().process(firstScript.data()); + } catch (IOException e) { + e.printStackTrace(); + } + } +} diff --git a/metafacture-html/src/main/resources/flux-commands.properties b/metafacture-html/src/main/resources/flux-commands.properties new file mode 100644 index 000000000..95f4e031c --- /dev/null +++ b/metafacture-html/src/main/resources/flux-commands.properties @@ -0,0 +1,17 @@ +# +# Copyright 2020 Fabian Steeg, hbz +# +# Licensed under the Apache License, Version 2.0 the "License"; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +decode-html org.metafacture.html.HtmlDecoder +extract-script org.metafacture.html.ScriptExtractor diff --git a/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java b/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java new file mode 100644 index 000000000..1ca0b3129 --- /dev/null +++ b/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java @@ -0,0 +1,100 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.metafacture.html; + +import static org.mockito.Mockito.inOrder; +import static org.mockito.Mockito.times; + +import java.io.StringReader; + +import org.junit.Before; +import org.junit.Test; +import org.metafacture.framework.StreamReceiver; +import org.mockito.InOrder; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +/** + * Tests for class {@link HtmlDecoder}. + * + * @author Fabian Steeg + * + */ +public final class HtmlDecoderTest { + + @Mock + private StreamReceiver receiver; + + private HtmlDecoder htmlDecoder; + + @Before + public void setup() { + MockitoAnnotations.initMocks(this); + htmlDecoder = new HtmlDecoder(); + htmlDecoder.setReceiver(receiver); + } + + @Test + public void htmlElementsAsEntities() { + htmlDecoder.process(new StringReader("

Header

Paragraph

")); + final InOrder ordered = inOrder(receiver); + ordered.verify(receiver).startEntity("html"); + ordered.verify(receiver).startEntity("head"); + ordered.verify(receiver).endEntity(); + ordered.verify(receiver).startEntity("body"); + ordered.verify(receiver).startEntity("h1"); + ordered.verify(receiver).literal("value", "Header"); + ordered.verify(receiver).endEntity(); + ordered.verify(receiver).startEntity("p"); + ordered.verify(receiver).literal("value", "Paragraph"); + ordered.verify(receiver, times(3)).endEntity(); + } + + @Test + public void nestedEntities() { + htmlDecoder.process(new StringReader("")); + final InOrder ordered = inOrder(receiver); + ordered.verify(receiver).startEntity("ul"); + ordered.verify(receiver).startEntity("li"); + ordered.verify(receiver).literal("value", "Item"); + // elements above plus body, html + ordered.verify(receiver, times(4)).endEntity(); + + } + + @Test + public void htmlAttributesAsLiterals() { + htmlDecoder.process(new StringReader("

Text")); + final InOrder ordered = inOrder(receiver); + ordered.verify(receiver).startEntity("p"); + ordered.verify(receiver).literal("class", "lead"); + ordered.verify(receiver).literal("value", "Text"); + // elements above plus body, html + ordered.verify(receiver, times(3)).endEntity(); + } + + @Test + public void htmlScriptElementData() { + htmlDecoder.process(new StringReader("")); + final InOrder ordered = inOrder(receiver); + ordered.verify(receiver).startEntity("script"); + ordered.verify(receiver).literal("type", "application/ld+json"); + ordered.verify(receiver).literal("value", "{\"id\":\"theId\"}"); + // elements above plus body, html + ordered.verify(receiver, times(4)).endEntity(); + } + +} diff --git a/metafacture-html/src/test/java/org/metafacture/html/ScriptExtractorTest.java b/metafacture-html/src/test/java/org/metafacture/html/ScriptExtractorTest.java new file mode 100644 index 000000000..6045fad7a --- /dev/null +++ b/metafacture-html/src/test/java/org/metafacture/html/ScriptExtractorTest.java @@ -0,0 +1,64 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.metafacture.html; + +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoMoreInteractions; + +import java.io.StringReader; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.metafacture.framework.ObjectReceiver; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +/** + * Tests for {@link ScriptExtractor}. + * + * @author Fabian Steeg + * + */ +public final class ScriptExtractorTest { + + private static final StringReader IN = new StringReader("