Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OPENNLP-1679: Extend JavaDoc of SgmlParser #719

Merged
merged 1 commit into from
Dec 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ else if (startDocElement != endDocElement) {
}
}

if (docs.size() > 0) {
if (!docs.isEmpty()) {
return docs.remove(0);
}
else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,38 +26,61 @@
import opennlp.tools.util.StringUtil;

/**
* SAX style SGML parser.
* <p>
* Note:<br>
* The implementation is very limited, but good enough to
* parse the MUC corpora. It must very likely be extended/improved/fixed to parse
* different SGML corpora.
* A SAX style <a href="https://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html">SGML</a> parser.
*
* @implNote The implementation is very limited, but good enough to parse the
* <a href="https://catalog.ldc.upenn.edu/LDC2003T13">MUC corpora</a>.
* It must very likely be extended/improved/fixed to parse different SGML corpora.
*/
public class SgmlParser {

public static abstract class ContentHandler {

public void startElement(String name, Map<String, String> attributes) throws InvalidFormatException {
}
private static final char SYMBOL_CLOSE = '>';
private static final char SYMBOL_OPEN = '<';
private static final char SYMBOL_SLASH = '/';
private static final char SYMBOL_EQUALS = '=';
private static final char SYMBOL_QUOT = '"';

public void characters(CharSequence chars) throws InvalidFormatException{
}
/**
* Defines methods to handle content produced by a {@link SgmlParser}.
* A concrete implementation interprets the document specific details.
*/
public static abstract class ContentHandler {

public void endElement(String name) throws InvalidFormatException {
}
/**
* Handles an SGML start element.
*
* @param name The name of the element's start tag.
* @param attributes The attributes supplied with the start tag. It may be empty.
* @throws InvalidFormatException Thrown if parameters were invalid.
*/
public abstract void startElement(String name, Map<String, String> attributes)
throws InvalidFormatException;

/**
* Handles a set of characters between SGML start and end tag.
*
* @param chars The characters to process.
* @throws InvalidFormatException Thrown if parameters were invalid.
*/
public abstract void characters(CharSequence chars)
throws InvalidFormatException;

/**
* Handles an SGML end element.
* @param name The name of the element's end tag.
*/
public abstract void endElement(String name);
}

private static String extractTagName(CharSequence tagChars) throws InvalidFormatException {

int fromOffset = 1;

if (tagChars.length() > 1 && tagChars.charAt(1) == '/') {
if (tagChars.length() > 1 && tagChars.charAt(1) == SYMBOL_SLASH) {
fromOffset = 2;
}

for (int ci = 1; ci < tagChars.length(); ci++) {

if (tagChars.charAt(ci) == '>' || StringUtil.isWhitespace(tagChars.charAt(ci))) {
if (tagChars.charAt(ci) == SYMBOL_CLOSE || StringUtil.isWhitespace(tagChars.charAt(ci))) {
return tagChars.subSequence(fromOffset, ci).toString();
}
}
Expand Down Expand Up @@ -90,15 +113,16 @@ private static Map<String, String> getAttributes(CharSequence tagChars) {
extractKey = true;
}
// Equals sign or whitespace indicates end of key name
else if (extractKey && ('=' == tagChars.charAt(i) || StringUtil.isWhitespace(tagChars.charAt(i)))) {
else if (extractKey && (SYMBOL_EQUALS == tagChars.charAt(i) ||
StringUtil.isWhitespace(tagChars.charAt(i)))) {
extractKey = false;
}
// Inside key name, extract all chars
else if (extractKey) {
key.append(tagChars.charAt(i));
}
// " Indicates begin or end of value chars
else if ('"' == tagChars.charAt(i)) {
else if (SYMBOL_QUOT == tagChars.charAt(i)) {

if (extractValue) {
attributes.put(key.toString(), value.toString());
Expand All @@ -107,7 +131,6 @@ else if ('"' == tagChars.charAt(i)) {
key.setLength(0);
value.setLength(0);
}

extractValue = !extractValue;
}
// Inside value, extract all chars
Expand All @@ -119,6 +142,17 @@ else if (extractValue) {
return attributes;
}

/**
* Parses an SGML document provided by the given {@link Reader}.
* The specified {@link ContentHandler} is responsible for interpreting the
* document-specific details.
*
* @param in A {@link Reader} that provides the data of the SGML document.
* @param handler The {@link ContentHandler} to interpret the document with.
*
* @throws IOException Thrown if IO errors occurred.
* @throws InvalidFormatException Thrown if parameters were invalid.
*/
public void parse(Reader in, ContentHandler handler) throws IOException {

StringBuilder buffer = new StringBuilder();
Expand All @@ -130,45 +164,37 @@ public void parse(Reader in, ContentHandler handler) throws IOException {
int c;
while ((c = in.read()) != -1) {

if ('<' == c) {
if (SYMBOL_OPEN == c) {
if (isInsideTag) {
throw new InvalidFormatException("Did not expect < char!");
}

if (buffer.toString().trim().length() > 0) {
if (!buffer.toString().trim().isEmpty()) {
handler.characters(buffer.toString().trim());
}

buffer.setLength(0);

isInsideTag = true;
isStartTag = true;
}

buffer.appendCodePoint(c);

if ('/' == c && lastChar == '<') {
if (SYMBOL_SLASH == c && lastChar == SYMBOL_OPEN) {
isStartTag = false;
}

if ('>' == c) {
if (SYMBOL_CLOSE == c) {

if (!isInsideTag) {
throw new InvalidFormatException("Did not expect > char!");
}

if (isStartTag) {
handler.startElement(extractTagName(buffer), getAttributes(buffer));
}
else {
handler.endElement(extractTagName(buffer));
}

buffer.setLength(0);

isInsideTag = false;
}

lastChar = c;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Map;

import org.junit.jupiter.api.Test;

Expand All @@ -34,9 +35,21 @@ void testParse1() throws IOException {
try (Reader in = new InputStreamReader(getResourceStream("muc/parsertest1.sgml"),
StandardCharsets.UTF_8)) {
SgmlParser parser = new SgmlParser();
parser.parse(in, new SgmlParser.ContentHandler() {
});
parser.parse(in, new DummyContentHandler());
}
}

/**
 * A no-op {@link SgmlParser.ContentHandler} whose callbacks ignore all input.
 * It is handed to {@link SgmlParser#parse} by the test so that parsing can be
 * exercised without interpreting the document content.
 */
private static class DummyContentHandler extends SgmlParser.ContentHandler {
@Override
public void startElement(String name, Map<String, String> attributes) {
// Intentionally empty: start tags are ignored.
}

@Override
public void characters(CharSequence chars) {
// Intentionally empty: character data is ignored.
}

@Override
public void endElement(String name) {
// Intentionally empty: end tags are ignored.
}
}
}
Loading