Skip to content

Commit

Permalink
OPENNLP-1594 Add stricter tests for Summarizer component
Browse files Browse the repository at this point in the history
- adds further, stricter tests
- clarifies, at API level, the semantics and constraints of parameters
- separates tests so that each test class has a clear responsibility for its class under test
- removes binary model files from test/resources folder
- improves / enhances the JavaDoc further
  • Loading branch information
mawiesne committed Jul 19, 2024
1 parent 78af085 commit b130395
Show file tree
Hide file tree
Showing 48 changed files with 1,374 additions and 658 deletions.
10 changes: 9 additions & 1 deletion summarizer/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,18 @@
<name>Apache OpenNLP Summarizer</name>

<properties>
<wordnet.version>2.4.0</wordnet.version>
<wordnet-dict.version>3.1</wordnet-dict.version>

<maven.download.plugin>1.9.0</maven.download.plugin>
</properties>
<repositories>
<repository>
<id>maven.aksw.org</id>
<url>https://maven.aksw.org/repository/internal/</url>
<releases/>
</repository>
</repositories>

<dependencies>
<dependency>
Expand All @@ -45,7 +53,7 @@
<dependency>
<groupId>edu.mit</groupId>
<artifactId>jwi</artifactId>
<version>2.2.3</version>
<version>${wordnet.version}</version>
</dependency>

<dependency>
Expand Down
14 changes: 11 additions & 3 deletions summarizer/src/main/java/opennlp/summarization/DocProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,20 @@
public interface DocProcessor {

/**
* Extracts sentences from a string representing an article.
* Extracts {@link Sentence sentences} from a string representing an article.
*
* @param text The text to process; if {@code null} or empty, an empty list is returned.
*
* @return The resulting list of detected {@link Sentence sentences}.
*/
List<Sentence> getSentencesFromStr(String text);
List<Sentence> getSentences(String text);

/**
* Parses out words from a specified {@link String sent}.
* Extracts words from a specified {@link String sent}.
*
* @param sent The sentence to process; if {@code null} or empty, a zero-length array is returned.
*
* @return An array of tokens (words) contained in the given {@code sent}.
*/
String[] getWords(String sent);

Expand Down
8 changes: 4 additions & 4 deletions summarizer/src/main/java/opennlp/summarization/Score.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,15 @@
package opennlp.summarization;

/**
* Stores the score of a sentence for ranking sentences within a document.
* Encapsulates the score of a sentence for the purpose of ranking sentences within a document.
*/
public class Score implements Comparable<Score> {
private int sentId;
private double score;

public Score() {
score = 0;
public Score(int sentId, double score) {
this.sentId = sentId;
this.score = score;
}

public int getSentId() {
Expand All @@ -46,7 +47,6 @@ public void setScore(double score) {

@Override
public int compareTo(Score o) {

if (o.score > score) return 1;
else if (o.score < score) return -1;
return 0;
Expand Down
98 changes: 55 additions & 43 deletions summarizer/src/main/java/opennlp/summarization/Sentence.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,34 +32,44 @@
public class Sentence {

private static final String SPACE = " ";
private final List<Sentence> links;
private final List<Sentence> links = new ArrayList<>();
private final int sentId;

// sentId is always position of sentence in doc.
private int sentId;
private String stringVal;
private Score pageRankScore;
private int paragraph;
private int paraPos;
private boolean hasQuote;
private double wordWt = 0;
private int wordCnt;

public Sentence() {
links = new ArrayList<>();
}
private double wordWeight = 0;
private int wordCound = 0;

/**
* Instantiates a plain {@link Sentence} via a set of parameters.
*
* @param id A numeric identifier; must not be negative.
* @param stringVal The string representation of the sentence.
* @param paragraph TODO clarify exact meaning of and constraints for this parameter.
* @param paraPos TODO clarify exact meaning of and constraints for this parameter.
* @throws IllegalArgumentException Thrown if parameters are invalid.
*/
public Sentence(int id, String stringVal, int paragraph, int paraPos) {
if (id < 0) throw new IllegalArgumentException("Parameter 'id' cannot be negative");
if (stringVal == null || stringVal.isBlank())
throw new IllegalArgumentException("Parameter 'stringVal' must not be null");
if (paragraph < 0) throw new IllegalArgumentException("Parameter 'paragraph' cannot be negative");
if (paraPos < 0) throw new IllegalArgumentException("Parameter 'paraPos' cannot be negative");

public Sentence(int id) {
this();
this.sentId = id;
}
setParagraph(paragraph);
setStringVal(stringVal);
setParaPos(paraPos);
};

public int getSentId() {
return sentId;
}

public void setSentId(int sentId) {
this.sentId = sentId;
}

public Score getPageRankScore() {
return pageRankScore;
}
Expand All @@ -84,7 +94,7 @@ public void setParaPos(int paraPos) {
this.paraPos = paraPos;
}

private int calcWrdCnt(String stringVal2) {
private int calcWordCount(String stringVal2) {
int ret = 0;
StopWords sw = StopWords.getInstance();
String[] wrds = stringVal.split("\\s+");
Expand All @@ -102,7 +112,7 @@ public String getStringVal() {
public void setStringVal(String stringVal) {
this.stringVal = stringVal;
if (stringVal.contains("\"")) this.hasQuote = true;
this.wordCnt = calcWrdCnt(stringVal);
this.wordCound = calcWordCount(stringVal);
}

public void addLink(Sentence s) {
Expand All @@ -113,38 +123,21 @@ public List<Sentence> getLinks() {
return this.links;
}

public double getWordWt() {
return wordWt;
public double getWordWeight() {
return wordWeight;
}

public void setWordWt(double wordWt) {
this.wordWt = wordWt;
}

public int getWordCnt() {
return wordCnt == 0 ? this.getStringVal().split("\\s+").length : wordCnt;
}

// Should add an article id to the sentence class. For now returns true if the ids are the same.

@Override
public final boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof Sentence sentence)) return false;

return sentId == sentence.sentId;
public void setWordWeight(double wordWt) {
this.wordWeight = wordWt;
}

@Override
public int hashCode() {
return Objects.hash(sentId);
}

@Override
public String toString() {
return this.stringVal;//+ "("+ this.paragraph +", "+this.paraPos+")";
public int getWordCount() {
return wordCound;
}

/**
* Applies stemming to each word of this sentence.
*
* @return The fully-stemmed representation of the sentence.
*/
public String stem() {
PorterStemmer stemmer = new PorterStemmer();
StopWords sw = StopWords.getInstance();
Expand All @@ -167,4 +160,23 @@ public String stem() {
}
return b.toString();
}

// Should add an article id to the sentence class. For now returns true if the ids are the same.
@Override
public final boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof Sentence sentence)) return false;

return sentId == sentence.sentId;
}

@Override
public int hashCode() {
return Objects.hash(sentId);
}

@Override
public String toString() {
return this.stringVal; // + "("+ this.paragraph +", "+this.paraPos+")";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,18 @@

package opennlp.summarization;

/**
* Describes the API of a component which summarizes the content of news, articles or books.
*/
public interface Summarizer {

/**
* Summarizes a given {@code article}. The length of the summary is
* Summarizes a given {@code text}. The length of the summary is
* influenced by the specified {@code maxWords} parameter.
*
* @param article The text to summarize. Must not be {@code null} and not be blank.
* @param text The content to summarize. Must not be {@code null} and not be blank.
* @param maxWords The maximum number of words. Must be larger than {@code zero}.
* @return The summary or an {@code empty} String if no summary could be derived.
*/
String summarize(String article, int maxWords);
String summarize(String text, int maxWords);
}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -22,42 +22,49 @@

import opennlp.summarization.Sentence;

/**
* Represents a lexical chain.
*/
public class LexicalChain implements Comparable<LexicalChain> {
final List<Word> word;
final List<Sentence> sentences;

int start, last;
int score;
private final List<Word> words = new ArrayList<>();
private final List<Sentence> sentences = new ArrayList<>();
private int score;

int start;
int last;
int occurrences = 1;

public LexicalChain() {
word = new ArrayList<>();
sentences = new ArrayList<>();
}

public LexicalChain(int start) {
this.start = start;
}

public double score() {
return length(); //* homogeneity();
}

public int length() {
return word.size();
return words.size();
}

public float homogeneity() {
return (1.0f - (float) occurrences / (float) length());
}

public void addWord(Word w) {
word.add(w);
words.add(w);
}

public void addSentence(Sentence sent) {
if (!sentences.contains(sent))
sentences.add(sent);
}

public List<Word> getWord() {
return word;
public List<Word> getWords() {
return words;
}

public List<Sentence> getSentences() {
Expand Down
Loading

0 comments on commit b130395

Please sign in to comment.