Skip to content

Commit

Permalink
Merge pull request #313 from metafacture/312-html
Browse files Browse the repository at this point in the history
Add HTML input support
  • Loading branch information
fsteeg authored Feb 12, 2020
2 parents e0fdd23 + 3427a67 commit 3c9998c
Show file tree
Hide file tree
Showing 9 changed files with 358 additions and 0 deletions.
30 changes: 30 additions & 0 deletions metafacture-html/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
* Copyright 2020 Fabian Steeg, hbz
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// Build configuration for the metafacture-html module.
// Human-readable name and description of the published artifact.
ext.mavenName = 'Metafacture HTML'
description = 'Modules for processing HTML documents'

dependencies {
// Framework types (pipes, receivers, annotations) appear in the public API of the HTML modules.
api project(':metafacture-framework')
implementation project(':metafacture-commons')
// NOTE(review): no usage of slf4j-api or commons-compress is visible in the classes of this module - confirm they are needed.
implementation 'org.slf4j:slf4j-api:1.7.21'
implementation 'org.apache.commons:commons-compress:1.12'
// IOUtils.toString(Reader) is used to read the complete HTML input.
implementation 'commons-io:commons-io:2.6'
// HTML parser used by HtmlDecoder and ScriptExtractor.
implementation 'org.jsoup:jsoup:1.12.1'
// Test dependencies: JUnit 4 tests with Mockito mocks; slf4j-simple as logging backend at test runtime.
testImplementation 'junit:junit:4.12'
testImplementation 'org.mockito:mockito-core:2.5.5'
testRuntimeOnly 'org.slf4j:slf4j-simple:1.7.21'
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Copyright 2020 Fabian Steeg, hbz
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.metafacture.html;

import java.io.IOException;
import java.io.Reader;
import java.util.UUID;

import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.metafacture.framework.FluxCommand;
import org.metafacture.framework.StreamReceiver;
import org.metafacture.framework.annotations.Description;
import org.metafacture.framework.annotations.In;
import org.metafacture.framework.annotations.Out;
import org.metafacture.framework.helpers.DefaultObjectPipe;

/**
 * Decode HTML to metadata events. Each input document represents one record.
 * <p>
 * The record identifier is a random UUID. Every HTML element becomes an
 * entity named after the element, every attribute becomes a literal named
 * after the attribute, and the text (or, if there is no text, the data
 * content, e.g. of a script element) of an element without child elements
 * becomes a literal named "value".
 *
 * @author Fabian Steeg (fsteeg)
 *
 */
@Description("Decode HTML to metadata events")
@In(Reader.class)
@Out(StreamReceiver.class)
@FluxCommand("decode-html")
public class HtmlDecoder extends DefaultObjectPipe<Reader, StreamReceiver> {

    /**
     * Reads the complete HTML document from the reader and emits it as one
     * record.
     *
     * @param reader the source of the HTML document
     * @throws java.io.UncheckedIOException if the document cannot be read
     */
    @Override
    public void process(final Reader reader) {
        // Read and parse before emitting any event, so that a failing read
        // never leaves a started but unfinished record in the receiver.
        final Document document;
        try {
            document = Jsoup.parse(IOUtils.toString(reader));
        } catch (final IOException e) {
            // Propagate instead of printing the stack trace and continuing
            // as if the document had been processed.
            throw new java.io.UncheckedIOException("Could not read HTML input", e);
        }
        final StreamReceiver receiver = getReceiver();
        receiver.startRecord(UUID.randomUUID().toString());
        process(document, receiver);
        receiver.endRecord();
    }

    /**
     * Recursively emits the child elements of the given element as entities.
     *
     * @param parent   the element whose children are emitted
     * @param receiver the receiver of the events
     */
    private void process(final Element parent, final StreamReceiver receiver) {
        for (final Element element : parent.children()) {
            receiver.startEntity(element.nodeName());
            final Attributes attributes = element.attributes();
            for (final Attribute attribute : attributes) {
                receiver.literal(attribute.getKey(), attribute.getValue());
            }
            if (element.children().isEmpty()) {
                // Leaf element: prefer its text, fall back to its data
                // content (used e.g. by script elements).
                final String text = element.text().trim();
                final String value = text.isEmpty() ? element.data() : text;
                if (!value.isEmpty()) {
                    receiver.literal("value", value);
                }
            } else {
                process(element, receiver);
            }
            receiver.endEntity();
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright 2020 Fabian Steeg, hbz
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.metafacture.html;

import java.io.IOException;
import java.io.Reader;

import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.metafacture.framework.FluxCommand;
import org.metafacture.framework.ObjectReceiver;
import org.metafacture.framework.annotations.Description;
import org.metafacture.framework.annotations.In;
import org.metafacture.framework.annotations.Out;
import org.metafacture.framework.helpers.DefaultObjectPipe;

/**
 * Extracts the first script from an HTML document.
 * <p>
 * The data content of the first {@code script} element is passed to the
 * receiver. Documents without any {@code script} element produce no output.
 *
 * @author Fabian Steeg
 */
@Description("Extracts the first script from an HTML document")
@In(Reader.class)
@Out(String.class)
@FluxCommand("extract-script")
public class ScriptExtractor extends DefaultObjectPipe<Reader, ObjectReceiver<String>> {

    /**
     * Reads the complete HTML document from the reader and emits the content
     * of its first script element, if there is one.
     *
     * @param reader the source of the HTML document
     * @throws java.io.UncheckedIOException if the document cannot be read
     */
    @Override
    public void process(final Reader reader) {
        final Document document;
        try {
            document = Jsoup.parse(IOUtils.toString(reader));
        } catch (final IOException e) {
            // Propagate instead of printing the stack trace and continuing
            // as if the document had been processed.
            throw new java.io.UncheckedIOException("Could not read HTML input", e);
        }
        // first() yields null for an empty selection, i.e. when the document
        // contains no script element at all.
        final Element firstScript = document.select("script").first();
        if (firstScript != null) {
            getReceiver().process(firstScript.data());
        }
    }
}
17 changes: 17 additions & 0 deletions metafacture-html/src/main/resources/flux-commands.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#
# Copyright 2020 Fabian Steeg, hbz
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
decode-html org.metafacture.html.HtmlDecoder
extract-script org.metafacture.html.ScriptExtractor
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Copyright 2020 Fabian Steeg, hbz
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.metafacture.html;

import static org.mockito.Mockito.inOrder;
import static org.mockito.Mockito.times;

import java.io.StringReader;

import org.junit.Before;
import org.junit.Test;
import org.metafacture.framework.StreamReceiver;
import org.mockito.InOrder;
import org.mockito.Mock;
import org.mockito.MockitoAnnotations;

/**
 * Tests for class {@link HtmlDecoder}: each test decodes a small HTML snippet
 * and checks the order of the events sent to a mocked receiver.
 *
 * @author Fabian Steeg
 *
 */
public final class HtmlDecoderTest {

    @Mock
    private StreamReceiver receiver;

    private HtmlDecoder htmlDecoder;

    @Before
    public void setup() {
        MockitoAnnotations.initMocks(this);
        htmlDecoder = new HtmlDecoder();
        htmlDecoder.setReceiver(receiver);
    }

    @Test
    public void htmlElementsAsEntities() {
        final InOrder events = decode("<h1>Header</h1><p>Paragraph</p>");
        events.verify(receiver).startEntity("html");
        events.verify(receiver).startEntity("head");
        events.verify(receiver).endEntity();
        events.verify(receiver).startEntity("body");
        events.verify(receiver).startEntity("h1");
        events.verify(receiver).literal("value", "Header");
        events.verify(receiver).endEntity();
        events.verify(receiver).startEntity("p");
        events.verify(receiver).literal("value", "Paragraph");
        // closes p, body and html
        events.verify(receiver, times(3)).endEntity();
    }

    @Test
    public void nestedEntities() {
        final InOrder events = decode("<ul><li>Item</li></ul>");
        events.verify(receiver).startEntity("ul");
        events.verify(receiver).startEntity("li");
        events.verify(receiver).literal("value", "Item");
        // closes li and ul as well as the enclosing body and html
        events.verify(receiver, times(4)).endEntity();
    }

    @Test
    public void htmlAttributesAsLiterals() {
        final InOrder events = decode("<p class=lead>Text");
        events.verify(receiver).startEntity("p");
        events.verify(receiver).literal("class", "lead");
        events.verify(receiver).literal("value", "Text");
        // closes p as well as the enclosing body and html
        events.verify(receiver, times(3)).endEntity();
    }

    @Test
    public void htmlScriptElementData() {
        final InOrder events = decode("<script type=application/ld+json>{\"id\":\"theId\"}</script>");
        events.verify(receiver).startEntity("script");
        events.verify(receiver).literal("type", "application/ld+json");
        events.verify(receiver).literal("value", "{\"id\":\"theId\"}");
        // four entities are closed after the script content: the script
        // itself plus the remaining elements of the parsed document
        events.verify(receiver, times(4)).endEntity();
    }

    /**
     * Feeds the given HTML to the decoder and returns an in-order verifier
     * for the events that the mocked receiver got.
     */
    private InOrder decode(final String html) {
        htmlDecoder.process(new StringReader(html));
        return inOrder(receiver);
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright 2020 Fabian Steeg, hbz
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.metafacture.html;

import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.verifyNoMoreInteractions;

import java.io.StringReader;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.metafacture.framework.ObjectReceiver;
import org.mockito.Mock;
import org.mockito.MockitoAnnotations;

/**
 * Tests for {@link ScriptExtractor}.
 *
 * @author Fabian Steeg
 *
 */
public final class ScriptExtractorTest {

    // Kept as a String: a Reader is stateful and can only be consumed once,
    // so every test creates its own StringReader from this constant.
    private static final String IN = "<html><script>{\"code\":\"yo\"}";
    private static final String OUT = "{\"code\":\"yo\"}";

    private ScriptExtractor scriptExtractor;

    @Mock
    private ObjectReceiver<String> receiver;

    @Before
    public void setup() {
        MockitoAnnotations.initMocks(this);
        scriptExtractor = new ScriptExtractor();
        scriptExtractor.setReceiver(receiver);
    }

    /**
     * Verifies that the content of the first script element is the only
     * object passed to the receiver.
     */
    @Test
    public void testShouldProcessRecordsFollowedbySeparator() {
        scriptExtractor.process(new StringReader(IN));
        verify(receiver).process(OUT);
        verifyNoMoreInteractions(receiver);
    }

    @After
    public void cleanup() {
        scriptExtractor.closeStream();
    }
}
15 changes: 15 additions & 0 deletions metafacture-html/src/test/resources/simplelogger.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright 2020 Fabian Steeg, hbz
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
org.slf4j.simpleLogger.defaultLogLevel = DEBUG
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@
# limitations under the License.
#
encode-json org.metafacture.json.JsonEncoder
decode-json org.metafacture.json.JsonDecoder
1 change: 1 addition & 0 deletions settings.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ include ':metafacture-strings'
include ':metafacture-formeta'
include ':metafacture-formatting'
include ':metafacture-xml'
include ':metafacture-html'
include ':metafacture-triples'
include ':metafacture-statistics'
include ':metafacture-io'
Expand Down

0 comments on commit 3c9998c

Please sign in to comment.