diff --git a/metafacture-html/src/main/java/org/metafacture/html/ScriptExtractor.java b/metafacture-html/src/main/java/org/metafacture/html/ElementExtractor.java similarity index 69% rename from metafacture-html/src/main/java/org/metafacture/html/ScriptExtractor.java rename to metafacture-html/src/main/java/org/metafacture/html/ElementExtractor.java index fdcecbfcb..19acdc868 100644 --- a/metafacture-html/src/main/java/org/metafacture/html/ScriptExtractor.java +++ b/metafacture-html/src/main/java/org/metafacture/html/ElementExtractor.java @@ -30,21 +30,30 @@ import org.metafacture.framework.helpers.DefaultObjectPipe; /** - * Extracts the first script from an HTML document + * Extracts the specified element from an HTML document * * @author Fabian Steeg */ -@Description("Extracts the first script from an HTML document") +@Description("Extracts the specified element from an HTML document") @In(Reader.class) @Out(String.class) -@FluxCommand("extract-script") -public class ScriptExtractor extends DefaultObjectPipe> { +@FluxCommand("extract-element") +public class ElementExtractor extends DefaultObjectPipe> { + private String selector; + + /** + * @param selector The CSS-style jsoup selector, see https://jsoup.org/cookbook/extracting-data/selector-syntax + */ + public ElementExtractor(final String selector) { + this.selector = selector; + } + @Override public void process(final Reader reader) { try { Document document = Jsoup.parse(IOUtils.toString(reader)); - Element firstScript = document.select("script").first(); - getReceiver().process(firstScript.data()); + Element firstElement = document.select(selector).first(); + getReceiver().process(firstElement.data()); } catch (IOException e) { e.printStackTrace(); } diff --git a/metafacture-html/src/main/resources/flux-commands.properties b/metafacture-html/src/main/resources/flux-commands.properties index 95f4e031c..e6046cd3f 100644 --- a/metafacture-html/src/main/resources/flux-commands.properties +++ 
b/metafacture-html/src/main/resources/flux-commands.properties @@ -14,4 +14,4 @@ # limitations under the License. # decode-html org.metafacture.html.HtmlDecoder -extract-script org.metafacture.html.ScriptExtractor +extract-element org.metafacture.html.ElementExtractor diff --git a/metafacture-html/src/test/java/org/metafacture/html/ScriptExtractorTest.java b/metafacture-html/src/test/java/org/metafacture/html/ElementExtractorTest.java similarity index 74% rename from metafacture-html/src/test/java/org/metafacture/html/ScriptExtractorTest.java rename to metafacture-html/src/test/java/org/metafacture/html/ElementExtractorTest.java index 6045fad7a..860af9f34 100644 --- a/metafacture-html/src/test/java/org/metafacture/html/ScriptExtractorTest.java +++ b/metafacture-html/src/test/java/org/metafacture/html/ElementExtractorTest.java @@ -28,17 +28,20 @@ import org.mockito.MockitoAnnotations; /** - * Tests for {@link ScriptExtractor}. + * Tests for {@link ElementExtractor}. * * @author Fabian Steeg * */ -public final class ScriptExtractorTest { +public final class ElementExtractorTest { - private static final StringReader IN = new StringReader("" + + "