From 1dfddcab765d4db4f5ace09ccce97ae1fc4befea Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Thu, 23 Jan 2020 17:17:13 +0100 Subject: [PATCH 1/7] Basic HtmlReader with `html-to-xml` flux command Parse HTML with jsoup, write XML. See example in test. See https://github.com/metafacture/metafacture-core/issues/312 --- metafacture-html/build.gradle | 30 +++++++++ .../java/org/metafacture/html/HtmlReader.java | 51 +++++++++++++++ .../main/resources/flux-commands.properties | 16 +++++ .../org/metafacture/html/HtmlReaderTest.java | 64 +++++++++++++++++++ .../test/resources/simplelogger.properties | 15 +++++ settings.gradle | 1 + 6 files changed, 177 insertions(+) create mode 100644 metafacture-html/build.gradle create mode 100644 metafacture-html/src/main/java/org/metafacture/html/HtmlReader.java create mode 100644 metafacture-html/src/main/resources/flux-commands.properties create mode 100644 metafacture-html/src/test/java/org/metafacture/html/HtmlReaderTest.java create mode 100644 metafacture-html/src/test/resources/simplelogger.properties diff --git a/metafacture-html/build.gradle b/metafacture-html/build.gradle new file mode 100644 index 000000000..e8c8ab2f0 --- /dev/null +++ b/metafacture-html/build.gradle @@ -0,0 +1,30 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +ext.mavenName = 'Metafacture HTML' +description = 'Modules for processing HTML documents' + +dependencies { + api project(':metafacture-framework') + implementation project(':metafacture-commons') + implementation 'org.slf4j:slf4j-api:1.7.21' + implementation 'org.apache.commons:commons-compress:1.12' + implementation 'commons-io:commons-io:2.6' + implementation 'org.jsoup:jsoup:1.12.1' + testImplementation 'junit:junit:4.12' + testImplementation 'org.mockito:mockito-core:2.5.5' + testRuntimeOnly 'org.slf4j:slf4j-simple:1.7.21' +} diff --git a/metafacture-html/src/main/java/org/metafacture/html/HtmlReader.java b/metafacture-html/src/main/java/org/metafacture/html/HtmlReader.java new file mode 100644 index 000000000..bc81e3597 --- /dev/null +++ b/metafacture-html/src/main/java/org/metafacture/html/HtmlReader.java @@ -0,0 +1,51 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.metafacture.html; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.commons.io.IOUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.metafacture.framework.FluxCommand; +import org.metafacture.framework.ObjectReceiver; +import org.metafacture.framework.annotations.Description; +import org.metafacture.framework.annotations.In; +import org.metafacture.framework.annotations.Out; +import org.metafacture.framework.helpers.DefaultObjectPipe; + +/** + * Parses HTML to X(HT)ML + * + * @author Fabian Steeg + */ +@Description("Parses HTML to X(HT)ML") +@In(Reader.class) +@Out(String.class) +@FluxCommand("html-to-xml") +public class HtmlReader extends DefaultObjectPipe> { + @Override + public void process(final Reader reader) { + try { + Document document = Jsoup.parse(IOUtils.toString(reader)); + document.outputSettings().prettyPrint(false).syntax(Document.OutputSettings.Syntax.xml); + getReceiver().process(document.html()); + } catch (IOException e) { + e.printStackTrace(); + } + } +} diff --git a/metafacture-html/src/main/resources/flux-commands.properties b/metafacture-html/src/main/resources/flux-commands.properties new file mode 100644 index 000000000..24ebc5c10 --- /dev/null +++ b/metafacture-html/src/main/resources/flux-commands.properties @@ -0,0 +1,16 @@ +# +# Copyright 2020 Fabian Steeg, hbz +# +# Licensed under the Apache License, Version 2.0 the "License"; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +html-to-xml org.metafacture.html.HtmlReader diff --git a/metafacture-html/src/test/java/org/metafacture/html/HtmlReaderTest.java b/metafacture-html/src/test/java/org/metafacture/html/HtmlReaderTest.java new file mode 100644 index 000000000..f60f88a67 --- /dev/null +++ b/metafacture-html/src/test/java/org/metafacture/html/HtmlReaderTest.java @@ -0,0 +1,64 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.metafacture.html; + +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoMoreInteractions; + +import java.io.StringReader; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.metafacture.framework.ObjectReceiver; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +/** + * Tests for {@link HtmlReader}. + * + * @author Fabian Steeg + * + */ +public final class HtmlReaderTest { + + private static final StringReader IN = new StringReader("hi"); + private static final String OUT = "hi"; + + private HtmlReader htmlReader; + + @Mock + private ObjectReceiver receiver; + + @Before + public void setup() { + MockitoAnnotations.initMocks(this); + htmlReader = new HtmlReader(); + htmlReader.setReceiver(receiver); + } + + @Test + public void testShouldProcessRecordsFollowedbySeparator() { + htmlReader.process(IN); + verify(receiver).process(OUT); + verifyNoMoreInteractions(receiver); + } + + @After + public void cleanup() { + htmlReader.closeStream(); + } +} diff --git a/metafacture-html/src/test/resources/simplelogger.properties b/metafacture-html/src/test/resources/simplelogger.properties new file mode 100644 index 000000000..5fb5904d6 --- /dev/null +++ b/metafacture-html/src/test/resources/simplelogger.properties @@ -0,0 +1,15 @@ +# Copyright 2020 Fabian Steeg, hbz +# +# Licensed under the Apache License, Version 2.0 the "License"; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +org.slf4j.simpleLogger.defaultLogLevel = DEBUG diff --git a/settings.gradle b/settings.gradle index 738d3f046..d83bdc584 100644 --- a/settings.gradle +++ b/settings.gradle @@ -28,6 +28,7 @@ include ':metafacture-strings' include ':metafacture-formeta' include ':metafacture-formatting' include ':metafacture-xml' +include ':metafacture-html' include ':metafacture-triples' include ':metafacture-statistics' include ':metafacture-io' From 4cc023d4297214cb0c6e8edf224f16535f5412a7 Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Fri, 24 Jan 2020 13:15:28 +0100 Subject: [PATCH 2/7] Output to Reader, not String To use with decode-xml, but how to test? See https://github.com/metafacture/metafacture-core/issues/312 --- .../src/main/java/org/metafacture/html/HtmlReader.java | 7 ++++--- .../src/test/java/org/metafacture/html/HtmlReaderTest.java | 7 +++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/metafacture-html/src/main/java/org/metafacture/html/HtmlReader.java b/metafacture-html/src/main/java/org/metafacture/html/HtmlReader.java index bc81e3597..0129a5976 100644 --- a/metafacture-html/src/main/java/org/metafacture/html/HtmlReader.java +++ b/metafacture-html/src/main/java/org/metafacture/html/HtmlReader.java @@ -17,6 +17,7 @@ import java.io.IOException; import java.io.Reader; +import java.io.StringReader; import org.apache.commons.io.IOUtils; import org.jsoup.Jsoup; @@ -35,15 +36,15 @@ */ @Description("Parses HTML to X(HT)ML") @In(Reader.class) -@Out(String.class) +@Out(Reader.class) @FluxCommand("html-to-xml") -public class HtmlReader extends DefaultObjectPipe> { +public class HtmlReader extends DefaultObjectPipe> { @Override public void process(final Reader reader) { try { Document document = Jsoup.parse(IOUtils.toString(reader)); document.outputSettings().prettyPrint(false).syntax(Document.OutputSettings.Syntax.xml); - getReceiver().process(document.html()); + getReceiver().process(new StringReader(document.html())); } catch (IOException e) { e.printStackTrace(); } diff --git a/metafacture-html/src/test/java/org/metafacture/html/HtmlReaderTest.java b/metafacture-html/src/test/java/org/metafacture/html/HtmlReaderTest.java index f60f88a67..ec5f0eb76 100644 --- a/metafacture-html/src/test/java/org/metafacture/html/HtmlReaderTest.java +++ b/metafacture-html/src/test/java/org/metafacture/html/HtmlReaderTest.java @@ -18,10 +18,12 @@ import static org.mockito.Mockito.verify; import static org.mockito.Mockito.verifyNoMoreInteractions; +import java.io.Reader; import java.io.StringReader; import org.junit.After; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import org.metafacture.framework.ObjectReceiver; import org.mockito.Mock; @@ -36,12 +38,12 @@ public final class HtmlReaderTest { private static final StringReader IN = new StringReader("hi"); - private static final String OUT = "hi"; + private static final StringReader OUT = new StringReader("hi"); private HtmlReader htmlReader; @Mock - private ObjectReceiver receiver; + private ObjectReceiver receiver; @Before public void setup() { @@ -51,6 +53,7 @@ public void setup() { } @Test + @Ignore public void testShouldProcessRecordsFollowedbySeparator() { htmlReader.process(IN); verify(receiver).process(OUT); From f2ae2e9f279dbb5d0597150e9f92c42f6e1bab9d Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Tue, 4 Feb 2020 12:33:08 +0100 Subject: [PATCH 3/7] Add HtmlDecoder and tests With `decode-html` flux command See https://github.com/metafacture/metafacture-core/issues/312 --- .../org/metafacture/html/HtmlDecoder.java | 66 +++++++++++++ .../main/resources/flux-commands.properties | 1 + .../org/metafacture/html/HtmlDecoderTest.java | 92 +++++++++++++++++++ 3 files changed, 159 insertions(+) create mode 100644 metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java create mode 100644 metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java diff --git a/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java b/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java new file mode 100644 index 000000000..28f491989 --- /dev/null +++ b/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java @@ -0,0 +1,66 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.metafacture.html; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.commons.io.IOUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Attributes; +import org.jsoup.nodes.Element; +import org.metafacture.framework.FluxCommand; +import org.metafacture.framework.StreamReceiver; +import org.metafacture.framework.annotations.Description; +import org.metafacture.framework.annotations.In; +import org.metafacture.framework.annotations.Out; +import org.metafacture.framework.helpers.DefaultObjectPipe; + +/** + * Decode HTML to metadata events. Each input document represents one record. + * + * @author Fabian Steeg (fsteeg) + * + */ +@Description("Decode HTML to metadata events") +@In(Reader.class) +@Out(StreamReceiver.class) +@FluxCommand("decode-html") +public class HtmlDecoder extends DefaultObjectPipe { + + @Override + public void process(final Reader reader) { + try { + StreamReceiver receiver = getReceiver(); + receiver.startRecord(null); + String html = IOUtils.toString(reader); + for (Element element : Jsoup.parse(html).getAllElements()) { + receiver.startEntity(element.nodeName()); + Attributes attributes = element.attributes(); + for (Attribute attribute : attributes) { + receiver.literal(attribute.getKey(), attribute.getValue()); + } + String text = element.text().trim(); + receiver.literal("value", text.isEmpty() ? element.data() : text); + receiver.endEntity(); + } + receiver.endRecord(); + } catch (IOException e) { + e.printStackTrace(); + } + } +} diff --git a/metafacture-html/src/main/resources/flux-commands.properties b/metafacture-html/src/main/resources/flux-commands.properties index 24ebc5c10..96bef061b 100644 --- a/metafacture-html/src/main/resources/flux-commands.properties +++ b/metafacture-html/src/main/resources/flux-commands.properties @@ -14,3 +14,4 @@ # limitations under the License. # html-to-xml org.metafacture.html.HtmlReader +decode-html org.metafacture.html.HtmlDecoder diff --git a/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java b/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java new file mode 100644 index 000000000..eaa7f76ea --- /dev/null +++ b/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java @@ -0,0 +1,92 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.metafacture.html; + +import static org.mockito.Mockito.inOrder; + +import java.io.StringReader; + +import org.junit.Before; +import org.junit.Test; +import org.metafacture.framework.StreamReceiver; +import org.mockito.InOrder; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +/** + * Tests for class {@link HtmlDecoder}. + * + * @author Fabian Steeg + * + */ +public final class HtmlDecoderTest { + + @Mock + private StreamReceiver receiver; + + private HtmlDecoder htmlDecoder; + + @Before + public void setup() { + MockitoAnnotations.initMocks(this); + htmlDecoder = new HtmlDecoder(); + htmlDecoder.setReceiver(receiver); + } + + @Test + public void htmlElementsAsEntities() { + htmlDecoder.process(new StringReader("

Header

Paragraph

")); + final InOrder ordered = inOrder(receiver); + ordered.verify(receiver).startEntity("h1"); + ordered.verify(receiver).literal("value", "Header"); + ordered.verify(receiver).endEntity(); + ordered.verify(receiver).startEntity("p"); + ordered.verify(receiver).literal("value", "Paragraph"); + ordered.verify(receiver).endEntity(); + } + + @Test + public void nestedEntities() { + htmlDecoder.process(new StringReader("
  • Item
    • ")); + final InOrder ordered = inOrder(receiver); + ordered.verify(receiver).startEntity("ul"); + ordered.verify(receiver).startEntity("li"); + ordered.verify(receiver).literal("value", "Item"); + ordered.verify(receiver).endEntity(); + ordered.verify(receiver).endEntity(); + } + + @Test + public void htmlAttributesAsLiterals() { + htmlDecoder.process(new StringReader("

      Text")); + final InOrder ordered = inOrder(receiver); + ordered.verify(receiver).startEntity("p"); + ordered.verify(receiver).literal("class", "lead"); + ordered.verify(receiver).literal("value", "Text"); + ordered.verify(receiver).endEntity(); + } + + @Test + public void htmlScriptElementData() { + htmlDecoder.process(new StringReader("")); + final InOrder ordered = inOrder(receiver); + ordered.verify(receiver).startEntity("script"); + ordered.verify(receiver).literal("type", "application/ld+json"); + ordered.verify(receiver).literal("value", "{\"id\":\"theId\"}"); + ordered.verify(receiver).endEntity(); + } + +} From 6605128d20461855588a45a5f5275f800f5c7085 Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Tue, 4 Feb 2020 14:58:39 +0100 Subject: [PATCH 4/7] Process document recursively Set generated record ID, only process content of leaf nodes See https://github.com/metafacture/metafacture-core/issues/312 --- .../org/metafacture/html/HtmlDecoder.java | 36 ++++++++++++------- .../org/metafacture/html/HtmlDecoderTest.java | 22 ++++++++---- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java b/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java index 28f491989..7a3719910 100644 --- a/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java +++ b/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java @@ -17,11 +17,13 @@ import java.io.IOException; import java.io.Reader; +import java.util.UUID; import org.apache.commons.io.IOUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; +import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.metafacture.framework.FluxCommand; import org.metafacture.framework.StreamReceiver; @@ -46,21 +48,31 @@ public class HtmlDecoder extends DefaultObjectPipe { public void process(final Reader reader) { try { StreamReceiver receiver = getReceiver(); - receiver.startRecord(null); - String html = IOUtils.toString(reader); - for (Element element : Jsoup.parse(html).getAllElements()) { - receiver.startEntity(element.nodeName()); - Attributes attributes = element.attributes(); - for (Attribute attribute : attributes) { - receiver.literal(attribute.getKey(), attribute.getValue()); - } - String text = element.text().trim(); - receiver.literal("value", text.isEmpty() ? element.data() : text); - receiver.endEntity(); - } + receiver.startRecord(UUID.randomUUID().toString()); + Document document = Jsoup.parse(IOUtils.toString(reader)); + process(document, receiver); receiver.endRecord(); } catch (IOException e) { e.printStackTrace(); } } + + private void process(Element parent, StreamReceiver receiver) { + for (Element element : parent.children()) { + receiver.startEntity(element.nodeName()); + Attributes attributes = element.attributes(); + for (Attribute attribute : attributes) { + receiver.literal(attribute.getKey(), attribute.getValue()); + } + if (element.children().isEmpty()) { + String text = element.text().trim(); + String value = text.isEmpty() ? element.data() : text; + if (!value.isEmpty()) { + receiver.literal("value", value); + } + } + process(element, receiver); + receiver.endEntity(); + } + } } diff --git a/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java b/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java index eaa7f76ea..1ca0b3129 100644 --- a/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java +++ b/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java @@ -16,6 +16,7 @@ package org.metafacture.html; import static org.mockito.Mockito.inOrder; +import static org.mockito.Mockito.times; import java.io.StringReader; @@ -50,23 +51,28 @@ public void setup() { public void htmlElementsAsEntities() { htmlDecoder.process(new StringReader("

      Header

      Paragraph

      ")); final InOrder ordered = inOrder(receiver); + ordered.verify(receiver).startEntity("html"); + ordered.verify(receiver).startEntity("head"); + ordered.verify(receiver).endEntity(); + ordered.verify(receiver).startEntity("body"); ordered.verify(receiver).startEntity("h1"); ordered.verify(receiver).literal("value", "Header"); ordered.verify(receiver).endEntity(); ordered.verify(receiver).startEntity("p"); ordered.verify(receiver).literal("value", "Paragraph"); - ordered.verify(receiver).endEntity(); + ordered.verify(receiver, times(3)).endEntity(); } @Test public void nestedEntities() { - htmlDecoder.process(new StringReader("
      • Item
        • ")); + htmlDecoder.process(new StringReader("
          • Item
          ")); final InOrder ordered = inOrder(receiver); ordered.verify(receiver).startEntity("ul"); ordered.verify(receiver).startEntity("li"); ordered.verify(receiver).literal("value", "Item"); - ordered.verify(receiver).endEntity(); - ordered.verify(receiver).endEntity(); + // elements above plus body, html + ordered.verify(receiver, times(4)).endEntity(); + } @Test @@ -76,9 +82,10 @@ public void htmlAttributesAsLiterals() { ordered.verify(receiver).startEntity("p"); ordered.verify(receiver).literal("class", "lead"); ordered.verify(receiver).literal("value", "Text"); - ordered.verify(receiver).endEntity(); + // elements above plus body, html + ordered.verify(receiver, times(3)).endEntity(); } - + @Test public void htmlScriptElementData() { htmlDecoder.process(new StringReader("")); @@ -86,7 +93,8 @@ public void htmlScriptElementData() { ordered.verify(receiver).startEntity("script"); ordered.verify(receiver).literal("type", "application/ld+json"); ordered.verify(receiver).literal("value", "{\"id\":\"theId\"}"); - ordered.verify(receiver).endEntity(); + // elements above plus body, html + ordered.verify(receiver, times(4)).endEntity(); } } From 47a5ba79ef72edd8dbc2e2fb58908b26359210ef Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Thu, 6 Feb 2020 16:23:36 +0100 Subject: [PATCH 5/7] Add decode-json Flux command See https://github.com/metafacture/metafacture-core/issues/314 --- metafacture-json/src/main/resources/flux-commands.properties | 1 + 1 file changed, 1 insertion(+) diff --git a/metafacture-json/src/main/resources/flux-commands.properties b/metafacture-json/src/main/resources/flux-commands.properties index c193a02fb..b3c5a2687 100644 --- a/metafacture-json/src/main/resources/flux-commands.properties +++ b/metafacture-json/src/main/resources/flux-commands.properties @@ -14,3 +14,4 @@ # limitations under the License. # encode-json org.metafacture.json.JsonEncoder +decode-json org.metafacture.json.JsonDecoder \ No newline at end of file From 34472466255ee97ed71edd59284e95c14ed6f2ce Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Thu, 6 Feb 2020 16:25:00 +0100 Subject: [PATCH 6/7] Add ScriptExtractor with extract-script Flux command See https://github.com/metafacture/metafacture-core/issues/312 See https://github.com/hbz/oerindex/issues/3 --- .../org/metafacture/html/ScriptExtractor.java | 52 +++++++++++++++ .../main/resources/flux-commands.properties | 1 + .../metafacture/html/ScriptExtractorTest.java | 64 +++++++++++++++++++ 3 files changed, 117 insertions(+) create mode 100644 metafacture-html/src/main/java/org/metafacture/html/ScriptExtractor.java create mode 100644 metafacture-html/src/test/java/org/metafacture/html/ScriptExtractorTest.java diff --git a/metafacture-html/src/main/java/org/metafacture/html/ScriptExtractor.java b/metafacture-html/src/main/java/org/metafacture/html/ScriptExtractor.java new file mode 100644 index 000000000..fdcecbfcb --- /dev/null +++ b/metafacture-html/src/main/java/org/metafacture/html/ScriptExtractor.java @@ -0,0 +1,52 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.metafacture.html; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.commons.io.IOUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.metafacture.framework.FluxCommand; +import org.metafacture.framework.ObjectReceiver; +import org.metafacture.framework.annotations.Description; +import org.metafacture.framework.annotations.In; +import org.metafacture.framework.annotations.Out; +import org.metafacture.framework.helpers.DefaultObjectPipe; + +/** + * Extracts the first script from an HTML document + * + * @author Fabian Steeg + */ +@Description("Extracts the first script from an HTML document") +@In(Reader.class) +@Out(String.class) +@FluxCommand("extract-script") +public class ScriptExtractor extends DefaultObjectPipe> { + @Override + public void process(final Reader reader) { + try { + Document document = Jsoup.parse(IOUtils.toString(reader)); + Element firstScript = document.select("script").first(); + getReceiver().process(firstScript.data()); + } catch (IOException e) { + e.printStackTrace(); + } + } +} diff --git a/metafacture-html/src/main/resources/flux-commands.properties b/metafacture-html/src/main/resources/flux-commands.properties index 96bef061b..cfab0be69 100644 --- a/metafacture-html/src/main/resources/flux-commands.properties +++ b/metafacture-html/src/main/resources/flux-commands.properties @@ -15,3 +15,4 @@ # html-to-xml org.metafacture.html.HtmlReader decode-html org.metafacture.html.HtmlDecoder +extract-script org.metafacture.html.ScriptExtractor diff --git a/metafacture-html/src/test/java/org/metafacture/html/ScriptExtractorTest.java b/metafacture-html/src/test/java/org/metafacture/html/ScriptExtractorTest.java new file mode 100644 index 000000000..6045fad7a --- /dev/null +++ b/metafacture-html/src/test/java/org/metafacture/html/ScriptExtractorTest.java @@ -0,0 +1,64 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.metafacture.html; + +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoMoreInteractions; + +import java.io.StringReader; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.metafacture.framework.ObjectReceiver; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +/** + * Tests for {@link ScriptExtractor}. + * + * @author Fabian Steeg + * + */ +public final class ScriptExtractorTest { + + private static final StringReader IN = new StringReader("