Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add HTML input support #313

Merged
merged 7 commits into from
Feb 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions metafacture-html/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
* Copyright 2020 Fabian Steeg, hbz
*
* Licensed under the Apache License, Version 2.0 the "License";
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// Human-readable module name, stored as an extra project property.
// NOTE(review): presumably read by the root build when publishing -- confirm.
ext.mavenName = 'Metafacture HTML'
description = 'Modules for processing HTML documents'

dependencies {
// The framework is declared as `api`, so it is exposed to consumers of this
// module; everything declared as `implementation` below stays internal.
api project(':metafacture-framework')
implementation project(':metafacture-commons')
// NOTE(review): slf4j-api and commons-compress are not referenced by the
// sources visible in this change -- confirm they are actually needed.
implementation 'org.slf4j:slf4j-api:1.7.21'
implementation 'org.apache.commons:commons-compress:1.12'
// IOUtils.toString(Reader) is used to read the whole input document.
implementation 'commons-io:commons-io:2.6'
// HTML parser used by HtmlDecoder and ScriptExtractor.
implementation 'org.jsoup:jsoup:1.12.1'
testImplementation 'junit:junit:4.12'
testImplementation 'org.mockito:mockito-core:2.5.5'
// SLF4J binding for test runs only.
testRuntimeOnly 'org.slf4j:slf4j-simple:1.7.21'
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Copyright 2020 Fabian Steeg, hbz
*
* Licensed under the Apache License, Version 2.0 the "License";
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.metafacture.html;

import java.io.IOException;
import java.io.Reader;
import java.util.UUID;

import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.metafacture.framework.FluxCommand;
import org.metafacture.framework.StreamReceiver;
import org.metafacture.framework.annotations.Description;
import org.metafacture.framework.annotations.In;
import org.metafacture.framework.annotations.Out;
import org.metafacture.framework.helpers.DefaultObjectPipe;

/**
 * Decodes HTML to metadata events. Each input document represents one record.
 * <p>
 * The record id is a random UUID. Every element of the parsed document is
 * emitted as an entity named after the element; its attributes are emitted as
 * literals. An element without child elements additionally gets a literal
 * named {@code value} holding its trimmed text or, if that is empty, its data
 * (e.g. the content of a {@code script} element), provided it is not empty.
 *
 * @author Fabian Steeg (fsteeg)
 *
 */
@Description("Decode HTML to metadata events")
@In(Reader.class)
@Out(StreamReceiver.class)
@FluxCommand("decode-html")
public class HtmlDecoder extends DefaultObjectPipe<Reader, StreamReceiver> {

    /**
     * Reads the complete HTML document from the reader and emits it as one
     * record.
     *
     * @param reader the source of the HTML document
     * @throws java.io.UncheckedIOException if the reader cannot be read
     */
    @Override
    public void process(final Reader reader) {
        final Document document;
        try {
            // Read and parse before emitting any event, so that a failed read
            // cannot leave the receiver with a started but never ended record.
            document = Jsoup.parse(IOUtils.toString(reader));
        } catch (final IOException e) {
            // Propagate instead of printing the stack trace and silently
            // dropping the document.
            throw new java.io.UncheckedIOException("Failed to read HTML input", e);
        }
        final StreamReceiver receiver = getReceiver();
        receiver.startRecord(UUID.randomUUID().toString());
        process(document, receiver);
        receiver.endRecord();
    }

    /**
     * Recursively emits all child elements of the given element as entities.
     *
     * @param parent   the element whose children are emitted
     * @param receiver the receiver of the events
     */
    private void process(final Element parent, final StreamReceiver receiver) {
        for (final Element element : parent.children()) {
            receiver.startEntity(element.nodeName());
            final Attributes attributes = element.attributes();
            for (final Attribute attribute : attributes) {
                receiver.literal(attribute.getKey(), attribute.getValue());
            }
            // Only leaf elements carry a value literal.
            if (element.children().isEmpty()) {
                final String text = element.text().trim();
                // Fall back to the element's data when it has no text.
                final String value = text.isEmpty() ? element.data() : text;
                if (!value.isEmpty()) {
                    receiver.literal("value", value);
                }
            }
            process(element, receiver);
            receiver.endEntity();
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright 2020 Fabian Steeg, hbz
*
* Licensed under the Apache License, Version 2.0 the "License";
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.metafacture.html;

import java.io.IOException;
import java.io.Reader;

import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.metafacture.framework.FluxCommand;
import org.metafacture.framework.ObjectReceiver;
import org.metafacture.framework.annotations.Description;
import org.metafacture.framework.annotations.In;
import org.metafacture.framework.annotations.Out;
import org.metafacture.framework.helpers.DefaultObjectPipe;

/**
 * Extracts the first script from an HTML document.
 * <p>
 * The data of the first {@code script} element is passed to the receiver.
 * Documents that contain no {@code script} element produce no output.
 *
 * @author Fabian Steeg
 */
@Description("Extracts the first script from an HTML document")
@In(Reader.class)
@Out(String.class)
@FluxCommand("extract-script")
public class ScriptExtractor extends DefaultObjectPipe<Reader, ObjectReceiver<String>> {

    /**
     * Reads the complete HTML document from the reader and forwards the data
     * of its first {@code script} element, if there is one.
     *
     * @param reader the source of the HTML document
     * @throws java.io.UncheckedIOException if the reader cannot be read
     */
    @Override
    public void process(final Reader reader) {
        final Document document;
        try {
            document = Jsoup.parse(IOUtils.toString(reader));
        } catch (final IOException e) {
            // Propagate instead of printing the stack trace and silently
            // dropping the document.
            throw new java.io.UncheckedIOException("Failed to read HTML input", e);
        }
        final Element firstScript = document.select("script").first();
        // first() yields null when the selection is empty; without this guard
        // a document without any script element caused a NullPointerException.
        if (firstScript != null) {
            getReceiver().process(firstScript.data());
        }
    }
}
17 changes: 17 additions & 0 deletions metafacture-html/src/main/resources/flux-commands.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#
# Copyright 2020 Fabian Steeg, hbz
#
# Licensed under the Apache License, Version 2.0 the "License";
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Flux command name, followed by the fully qualified name of the implementing class
decode-html org.metafacture.html.HtmlDecoder
extract-script org.metafacture.html.ScriptExtractor
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Copyright 2020 Fabian Steeg, hbz
*
* Licensed under the Apache License, Version 2.0 the "License";
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.metafacture.html;

import static org.mockito.Mockito.inOrder;
import static org.mockito.Mockito.times;

import java.io.StringReader;

import org.junit.Before;
import org.junit.Test;
import org.metafacture.framework.StreamReceiver;
import org.mockito.InOrder;
import org.mockito.Mock;
import org.mockito.MockitoAnnotations;

/**
 * Tests for class {@link HtmlDecoder}.
 * <p>
 * Each test feeds an HTML snippet to the decoder and checks the order of the
 * events arriving at a mocked {@link StreamReceiver}.
 *
 * @author Fabian Steeg
 *
 */
public final class HtmlDecoderTest {

    @Mock
    private StreamReceiver receiver;

    private HtmlDecoder htmlDecoder;

    @Before
    public void setup() {
        MockitoAnnotations.initMocks(this);
        htmlDecoder = new HtmlDecoder();
        htmlDecoder.setReceiver(receiver);
    }

    @Test
    public void htmlElementsAsEntities() {
        final InOrder events = decode("<h1>Header</h1><p>Paragraph</p>");
        events.verify(receiver).startEntity("html");
        events.verify(receiver).startEntity("head");
        events.verify(receiver).endEntity();
        events.verify(receiver).startEntity("body");
        events.verify(receiver).startEntity("h1");
        events.verify(receiver).literal("value", "Header");
        events.verify(receiver).endEntity();
        events.verify(receiver).startEntity("p");
        events.verify(receiver).literal("value", "Paragraph");
        // closes p, body and html
        events.verify(receiver, times(3)).endEntity();
    }

    @Test
    public void nestedEntities() {
        final InOrder events = decode("<ul><li>Item</li></ul>");
        events.verify(receiver).startEntity("ul");
        events.verify(receiver).startEntity("li");
        events.verify(receiver).literal("value", "Item");
        // closes li and ul, plus the enclosing body and html
        events.verify(receiver, times(4)).endEntity();
    }

    @Test
    public void htmlAttributesAsLiterals() {
        final InOrder events = decode("<p class=lead>Text");
        events.verify(receiver).startEntity("p");
        events.verify(receiver).literal("class", "lead");
        events.verify(receiver).literal("value", "Text");
        // closes p, plus the enclosing body and html
        events.verify(receiver, times(3)).endEntity();
    }

    @Test
    public void htmlScriptElementData() {
        final InOrder events = decode("<script type=application/ld+json>{\"id\":\"theId\"}</script>");
        events.verify(receiver).startEntity("script");
        events.verify(receiver).literal("type", "application/ld+json");
        events.verify(receiver).literal("value", "{\"id\":\"theId\"}");
        // closes script and the remaining elements of the parsed document
        // (presumably head, body and html -- confirm against the parser)
        events.verify(receiver, times(4)).endEntity();
    }

    /**
     * Runs the decoder on the given HTML and returns an in-order verifier for
     * the events that reached the mocked receiver.
     */
    private InOrder decode(final String html) {
        htmlDecoder.process(new StringReader(html));
        return inOrder(receiver);
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright 2020 Fabian Steeg, hbz
*
* Licensed under the Apache License, Version 2.0 the "License";
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.metafacture.html;

import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.verifyNoMoreInteractions;

import java.io.StringReader;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.metafacture.framework.ObjectReceiver;
import org.mockito.Mock;
import org.mockito.MockitoAnnotations;

/**
 * Tests for {@link ScriptExtractor}.
 *
 * @author Fabian Steeg
 *
 */
public final class ScriptExtractorTest {

    // Kept as a String rather than a shared static StringReader: a reader is
    // stateful and can be consumed only once, so a shared instance would be
    // exhausted for every test after the first one.
    private static final String IN = "<html><script>{\"code\":\"yo\"}";
    private static final String OUT = "{\"code\":\"yo\"}";

    private ScriptExtractor scriptExtractor;

    @Mock
    private ObjectReceiver<String> receiver;

    @Before
    public void setup() {
        MockitoAnnotations.initMocks(this);
        scriptExtractor = new ScriptExtractor();
        scriptExtractor.setReceiver(receiver);
    }

    /**
     * Checks that the content of the first script element is passed to the
     * receiver and that nothing else is. The method name is kept unchanged
     * for compatibility although it does not describe this scenario.
     */
    @Test
    public void testShouldProcessRecordsFollowedbySeparator() {
        scriptExtractor.process(new StringReader(IN));
        verify(receiver).process(OUT);
        verifyNoMoreInteractions(receiver);
    }

    @After
    public void cleanup() {
        scriptExtractor.closeStream();
    }
}
15 changes: 15 additions & 0 deletions metafacture-html/src/test/resources/simplelogger.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright 2020 Fabian Steeg, hbz
#
# Licensed under the Apache License, Version 2.0 the "License";
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Default log level of the SLF4J simple logger
org.slf4j.simpleLogger.defaultLogLevel = DEBUG
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@
# limitations under the License.
#
encode-json org.metafacture.json.JsonEncoder
decode-json org.metafacture.json.JsonDecoder
1 change: 1 addition & 0 deletions settings.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ include ':metafacture-strings'
include ':metafacture-formeta'
include ':metafacture-formatting'
include ':metafacture-xml'
include ':metafacture-html'
include ':metafacture-triples'
include ':metafacture-statistics'
include ':metafacture-io'
Expand Down