From 442f447605b5170eb62a27870b1446fffa39b7c4 Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Wed, 11 Nov 2020 09:46:14 +0100 Subject: [PATCH 1/2] Add extract-element flux command based on CSS-style selector See: https://github.com/metafacture/metafacture-core/issues/312 https://gitlab.com/oersi/oersi-etl/-/issues/27 https://jsoup.org/cookbook/extracting-data/selector-syntax --- .../metafacture/html/ElementExtractor.java | 61 +++++++++++++++++ .../main/resources/flux-commands.properties | 1 + .../html/ElementExtractorTest.java | 67 +++++++++++++++++++ 3 files changed, 129 insertions(+) create mode 100644 metafacture-html/src/main/java/org/metafacture/html/ElementExtractor.java create mode 100644 metafacture-html/src/test/java/org/metafacture/html/ElementExtractorTest.java diff --git a/metafacture-html/src/main/java/org/metafacture/html/ElementExtractor.java b/metafacture-html/src/main/java/org/metafacture/html/ElementExtractor.java new file mode 100644 index 000000000..19acdc868 --- /dev/null +++ b/metafacture-html/src/main/java/org/metafacture/html/ElementExtractor.java @@ -0,0 +1,61 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.metafacture.html; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.commons.io.IOUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.metafacture.framework.FluxCommand; +import org.metafacture.framework.ObjectReceiver; +import org.metafacture.framework.annotations.Description; +import org.metafacture.framework.annotations.In; +import org.metafacture.framework.annotations.Out; +import org.metafacture.framework.helpers.DefaultObjectPipe; + +/** + * Extracts the data of the first element matching a CSS-style selector from an HTML document. + * Fails with a NullPointerException naming the selector if no element matches. + * @author Fabian Steeg + */ +@Description("Extracts the specified element from an HTML document") +@In(Reader.class) +@Out(String.class) +@FluxCommand("extract-element") +public class ElementExtractor extends DefaultObjectPipe<Reader, ObjectReceiver<String>> { + private final String selector; + + /** + * @param selector The CSS-style jsoup selector, see https://jsoup.org/cookbook/extracting-data/selector-syntax + */ + public ElementExtractor(final String selector) { + this.selector = selector; + } + + @Override + public void process(final Reader reader) { + try { + Document document = Jsoup.parse(IOUtils.toString(reader)); + Element firstElement = document.select(selector).first(); /* null when nothing matches the selector */ + getReceiver().process(java.util.Objects.requireNonNull(firstElement, "No element matches selector: " + selector).data()); + } catch (IOException e) { + e.printStackTrace(); + } + } +} diff --git a/metafacture-html/src/main/resources/flux-commands.properties b/metafacture-html/src/main/resources/flux-commands.properties index 95f4e031c..3d1443da4 100644 --- a/metafacture-html/src/main/resources/flux-commands.properties +++ b/metafacture-html/src/main/resources/flux-commands.properties @@ -15,3 +15,4 @@ # decode-html org.metafacture.html.HtmlDecoder extract-script org.metafacture.html.ScriptExtractor +extract-element org.metafacture.html.ElementExtractor diff --git a/metafacture-html/src/test/java/org/metafacture/html/ElementExtractorTest.java 
b/metafacture-html/src/test/java/org/metafacture/html/ElementExtractorTest.java new file mode 100644 index 000000000..860af9f34 --- /dev/null +++ b/metafacture-html/src/test/java/org/metafacture/html/ElementExtractorTest.java @@ -0,0 +1,67 @@ +/* + * Copyright 2020 Fabian Steeg, hbz + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.metafacture.html; + +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoMoreInteractions; + +import java.io.StringReader; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.metafacture.framework.ObjectReceiver; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +/** + * Tests for {@link ElementExtractor}. + * + * @author Fabian Steeg + * + */ +public final class ElementExtractorTest { + + private static final StringReader IN = new StringReader("" + + "" + + "