Skip to content

Commit

Permalink
Getting ready for boilerpipe 2.0
Browse files Browse the repository at this point in the history
Moved from Ant to Maven.
Automatic code formatting (mvn java-formatter:format)
Automatic license headers (mvn license:format)
Move NekoHTML into private package to prevent dependency collisions.
Some cosmetic changes (javadocs).
New package name.
  • Loading branch information
kohlschuetter committed Dec 1, 2014
1 parent 8db67cf commit c10c4e0
Show file tree
Hide file tree
Showing 661 changed files with 6,925 additions and 120,355 deletions.
10 changes: 0 additions & 10 deletions .classpath

This file was deleted.

6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.project
.settings
.classpath
target
.DS_Store
dependency-reduced-pom.xml
17 changes: 0 additions & 17 deletions .project

This file was deleted.

2 changes: 0 additions & 2 deletions INSTALL.txt

This file was deleted.

15 changes: 15 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
boilerpipe

Copyright (c) 2009, 2014 Christian Kohlschütter

The author licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
18 changes: 0 additions & 18 deletions LICENSE.txt

This file was deleted.

15 changes: 15 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
boilerpipe

Copyright (c) 2009, 2014 Christian Kohlschütter

The author licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
24 changes: 0 additions & 24 deletions NOTICE.txt

This file was deleted.

6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,9 @@ boilerpipe
==========

Boilerplate Removal and Fulltext Extraction from HTML pages

NOTE: This is a work-in-progress migration from Google Code.

The latest stable version of boilerpipe is available at [https://code.google.com/p/boilerpipe](https://code.google.com/p/boilerpipe).


35 changes: 35 additions & 0 deletions boilerpipe-common/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<artifactId>boilerpipe-common</artifactId>
<packaging>jar</packaging>
<parent>
<groupId>com.kohlschutter.boilerpipe</groupId>
<artifactId>boilerpipe-parent</artifactId>
<version>2.0-SNAPSHOT</version>
</parent>
<name>boilerpipe-common</name>
<properties>
<boilerpipe.parent.base.directory>${project.parent.basedir}</boilerpipe.parent.base.directory>
</properties>
<dependencies>
<dependency>
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
<version>2.9.1</version>
</dependency>
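<!-- Superseded by the nekohtml-relocated dependency below: NekoHTML was moved into a private package to prevent dependency collisions.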
<dependency>
<groupId>net.sourceforge.nekohtml</groupId>
<artifactId>nekohtml</artifactId>
<version>1.9.13</version>
</dependency>
-->
<dependency>
<groupId>com.kohlschutter.boilerpipe</groupId>
<artifactId>nekohtml-relocated</artifactId>
<version>1.9.13</version>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/**
* boilerpipe
*
* Copyright (c) 2009, 2014 Christian Kohlschütter
*
* The author licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.kohlschutter.boilerpipe;

import com.kohlschutter.boilerpipe.document.TextDocument;

/**
* Something that can be represented as a {@link TextDocument}.
*/
public interface BoilerpipeDocumentSource {
/**
* Returns this source's representation as a {@link TextDocument}.
*
* @return The {@link TextDocument} representation.
* @throws BoilerpipeProcessingException Declared so that implementations can signal a failure.
*/
TextDocument toTextDocument() throws BoilerpipeProcessingException;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/**
* boilerpipe
*
* Copyright (c) 2009, 2014 Christian Kohlschütter
*
* The author licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.kohlschutter.boilerpipe;

import java.io.Reader;

import org.xml.sax.InputSource;

import com.kohlschutter.boilerpipe.document.TextDocument;

/**
 * Describes a complete filter pipeline.
 *
 * <p>
 * In addition to the {@link BoilerpipeFilter} contract it extends, an extractor can return the
 * extracted text directly from several kinds of input.
 */
public interface BoilerpipeExtractor extends BoilerpipeFilter {
  /**
   * Extracts text from the HTML code given as a String.
   *
   * @param html The HTML code as a String.
   * @return The extracted text.
   * @throws BoilerpipeProcessingException If processing fails.
   */
  String getText(String html) throws BoilerpipeProcessingException;

  /**
   * Extracts text from the HTML code available from the given {@link InputSource}.
   *
   * @param is The InputSource containing the HTML.
   * @return The extracted text.
   * @throws BoilerpipeProcessingException If processing fails.
   */
  String getText(InputSource is) throws BoilerpipeProcessingException;

  /**
   * Extracts text from the HTML code available from the given {@link Reader}.
   *
   * @param r The Reader containing the HTML.
   * @return The extracted text.
   * @throws BoilerpipeProcessingException If processing fails.
   */
  String getText(Reader r) throws BoilerpipeProcessingException;

  /**
   * Extracts text from the given {@link TextDocument} object.
   *
   * @param doc The {@link TextDocument}.
   * @return The extracted text.
   * @throws BoilerpipeProcessingException If processing fails.
   */
  String getText(TextDocument doc) throws BoilerpipeProcessingException;
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* boilerpipe
*
* Copyright (c) 2009 Christian Kohlschütter
* Copyright (c) 2009, 2014 Christian Kohlschütter
*
* The author licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
Expand All @@ -15,26 +15,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.l3s.boilerpipe;
package com.kohlschutter.boilerpipe;

import de.l3s.boilerpipe.document.TextDocument;
import com.kohlschutter.boilerpipe.document.TextDocument;

/**
* A generic {@link BoilerpipeFilter}. Takes a {@link TextDocument} and
* processes it somehow.
*
* @author Christian Kohlschütter
* A generic {@link BoilerpipeFilter}. Takes a {@link TextDocument} and processes it somehow.
*/
public interface BoilerpipeFilter {
/**
* Processes the given document <code>doc</code>.
*
* @param doc
* The {@link TextDocument} that is to be processed.
* @return <code>true</code> if changes have been made to the
* {@link TextDocument}.
* @throws BoilerpipeProcessingException
*/
boolean process(final TextDocument doc)
throws BoilerpipeProcessingException;
/**
* Processes the given document <code>doc</code>.
*
* @param doc The {@link TextDocument} that is to be processed.
* @return <code>true</code> if changes have been made to the {@link TextDocument}.
* @throws BoilerpipeProcessingException
*/
boolean process(final TextDocument doc) throws BoilerpipeProcessingException;
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* boilerpipe
*
* Copyright (c) 2009 Christian Kohlschütter
* Copyright (c) 2009, 2014 Christian Kohlschütter
*
* The author licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
Expand All @@ -15,21 +15,19 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.l3s.boilerpipe;
package com.kohlschutter.boilerpipe;

import de.l3s.boilerpipe.document.TextDocument;
import com.kohlschutter.boilerpipe.document.TextDocument;

/**
* A source that returns {@link TextDocument}s.
*
* @author Christian Kohlschütter
*/
public interface BoilerpipeInput {
/**
* Returns (somehow) a {@link TextDocument}.
*
* @return A {@link TextDocument}.
* @throws BoilerpipeProcessingException
*/
TextDocument getTextDocument() throws BoilerpipeProcessingException;
/**
* Returns (somehow) a {@link TextDocument}.
*
* @return A {@link TextDocument}.
* @throws BoilerpipeProcessingException
*/
TextDocument getTextDocument() throws BoilerpipeProcessingException;
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* boilerpipe
*
* Copyright (c) 2009 Christian Kohlschütter
* Copyright (c) 2009, 2014 Christian Kohlschütter
*
* The author licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
Expand All @@ -15,29 +15,27 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.l3s.boilerpipe;
package com.kohlschutter.boilerpipe;

/**
* Exception for signaling failure in the processing pipeline.
*
* @author Christian Kohlschütter
*/
public class BoilerpipeProcessingException extends Exception {
private static final long serialVersionUID = 1L;
private static final long serialVersionUID = 1L;

public BoilerpipeProcessingException() {
super();
}
public BoilerpipeProcessingException() {
super();
}

public BoilerpipeProcessingException(String message, Throwable cause) {
super(message, cause);
}
public BoilerpipeProcessingException(String message, Throwable cause) {
super(message, cause);
}

public BoilerpipeProcessingException(String message) {
super(message);
}
public BoilerpipeProcessingException(String message) {
super(message);
}

public BoilerpipeProcessingException(Throwable cause) {
super(cause);
}
public BoilerpipeProcessingException(Throwable cause) {
super(cause);
}
}
Loading

0 comments on commit c10c4e0

Please sign in to comment.