Skip to content

Commit

Permalink
Switch to enum for PicaConstants
Browse files Browse the repository at this point in the history
Avoid reassignment to static fields, keep switch in parser

See #296
  • Loading branch information
fsteeg committed Jun 25, 2019
1 parent e3d9674 commit 9c28a07
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 74 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2016,2019 Christoph Böhme and hbz
/* Copyright 2016,2019 Christoph Böhme and others
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -13,40 +13,44 @@
* limitations under the License.
*/

package org.metafacture.biblio.pica;

/**
* Useful constants for PICA+.
* PICA+ comes with two possible serializations:
* a normalized one and a non-normalized one.
*
* @author Christoph Böhme
* @author Pascal Christoph (dr0i)
* @author Christoph Böhme (initial implementation)
* @author Pascal Christoph (dr0i) (add support for non-normalized pica+)
* @author Fabian Steeg (fsteeg) (switch to enum)
*
*/
enum PicaConstants {
// '\0' is a placeholder meaning "this serialization has no such marker"
RECORD_MARKER('\u001d', '\n'), //
FIELD_MARKER('\u001e', '\0'), //
SUBFIELD_MARKER('\u001f', '$'), //
FIELD_END_MARKER('\n', '\n'), //
NO_MARKER('\0', '\0');

package org.metafacture.biblio.pica;

final class PicaConstants{
public static char RECORD_MARKER = '\u001d';
public static char FIELD_MARKER = '\u001e';
public static char SUBFIELD_MARKER = '\u001f';
public static char FIELD_END_MARKER = '\n';

public static void setNormalizedSerialization() {
RECORD_MARKER = '\u001d';
FIELD_MARKER = '\u001e';
SUBFIELD_MARKER = '\u001f';
FIELD_END_MARKER = '\n';
}
// Marker character used by the normalized serialization ('\0' if there is none).
private final char normalized;
// Marker character used by the non-normalized serialization ('\0' if there is none).
private final char nonNormalized;

public static void setNonNormalizedSerialization() {
RECORD_MARKER = '\n';
FIELD_MARKER = '\n'; //this is a dummy
SUBFIELD_MARKER = '$';
FIELD_END_MARKER = '\n';
}
/**
 * Creates a marker constant from its character in each serialization.
 *
 * @param normalized the marker character in normalized pica+
 * @param nonNormalized the marker character in non-normalized pica+
 */
PicaConstants(char normalized, char nonNormalized) {
this.normalized = normalized;
this.nonNormalized = nonNormalized;
}

private PicaConstants() {
// No instances allowed
}
/**
 * Returns the character of this marker for the requested serialization.
 *
 * @param isNormalized if true, the normalized character is returned;
 *                     otherwise the non-normalized one
 * @return the marker character for the chosen serialization
 */
public char get(boolean isNormalized) {
    if (isNormalized) {
        return normalized;
    }
    return nonNormalized;
}

}
/**
 * Looks up the marker that a character represents in the given
 * serialization.
 *
 * Constants are checked in declaration order, so when several markers share
 * a character (such as '\n' in non-normalized pica+) the first declared one
 * is returned.
 *
 * @param isNormalized if true, match against the normalized marker
 *                     characters; otherwise against the non-normalized ones
 * @param ch the character to classify
 * @return the matching marker, or {@link #NO_MARKER} if {@code ch} is not a
 *         marker in that serialization
 */
public static PicaConstants from(boolean isNormalized, char ch) {
    if (ch == '\0') {
        // '\0' only stands for "no such marker"; a literal NUL in the input
        // must not match a placeholder entry and be treated as a delimiter.
        return NO_MARKER;
    }
    for (PicaConstants value : values()) {
        if (ch == (isNormalized ? value.normalized : value.nonNormalized)) {
            return value;
        }
    }
    return NO_MARKER;
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2016, 2019 Christoph Böhme and hbz
* Copyright 2016, 2019 Christoph Böhme and others
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -143,7 +143,8 @@
* support other pica encodings.
*
* @author Christoph Böhme
* @author Pascal Christoph (dr0i)
* @author Pascal Christoph (dr0i) (add support for non-normalized pica+)
* @author Fabian Steeg (fsteeg) (switch to enum)
*
*/
@Description("Parses pica+ records. The parser only parses single records. " +
Expand All @@ -167,37 +168,36 @@ public final class PicaDecoder
private int recordLen;

private boolean ignoreMissingIdn;
private boolean isNormalized;

public PicaDecoder() {
makeConstants();
this(true);
}

/**
 * Creates a decoder for the given pica+ serialization.
 *
 * @param normalized if true, the input is treated as normalized pica+;
 *                   if false, as non-normalized pica+
 */
public PicaDecoder(boolean normalized) {
    // setNormalizedSerialization() rebuilds the marker patterns itself, so
    // a further makeConstants() call here would only compile them twice.
    setNormalizedSerialization(normalized);
}

/**
* Controls wether the input is serialzed as normalized or non-normalized
* Controls whether the input is read as normalized or non-normalized
* pica+. As the default "normalized" is assumed.
*
* @param normalized if true, the input is treated as "normalized" pica+ ;
* if false, it's treated as non-normalized serialized.
* @param normalized if true, the input is treated as normalized pica+;
* if false, it is treated as non-normalized pica+.
*/
public void setNormalizedSerialization(boolean normalized) {
if (normalized)
PicaConstants.setNormalizedSerialization();
else
PicaConstants.setNonNormalizedSerialization();
this.isNormalized = normalized;
makeConstants();
}

private void makeConstants() {
START_MARKERS = "(?:^|" + PicaConstants.FIELD_MARKER + "|"
+ PicaConstants.FIELD_END_MARKER + "|"
+ PicaConstants.RECORD_MARKER + "|.*\n" + ")";
START_MARKERS = "(?:^|" + PicaConstants.FIELD_MARKER.get(isNormalized) + "|"
+ PicaConstants.FIELD_END_MARKER.get(isNormalized) + "|"
+ PicaConstants.RECORD_MARKER.get(isNormalized) + "|.*\n" + ")";
ID_FIELDS_PATTERN = Pattern
.compile(START_MARKERS + "(?:003@|203@(?:/..+)?|107F) "
+ " ?(\\" + PicaConstants.SUBFIELD_MARKER + "|"
+ PicaConstants.SUBFIELD_MARKER + ")0");
+ " ?(\\" + PicaConstants.SUBFIELD_MARKER.get(isNormalized) + "|"
+ PicaConstants.SUBFIELD_MARKER.get(isNormalized) + ")0");
idFieldMatcher = ID_FIELDS_PATTERN.matcher("");
}
/**
Expand Down Expand Up @@ -303,7 +303,7 @@ public void process(final String record) {

PicaParserState state = PicaParserState.FIELD_NAME;
for (int i = 0; i < recordLen; ++i) {
state = state.parseChar(buffer[i], parserContext);
state = state.parseChar(buffer[i], parserContext, isNormalized);
}
state.endOfInput(parserContext);

Expand Down Expand Up @@ -337,7 +337,7 @@ private String extractRecordId() {
idBuilder.setLength(0);
for (int i = idFromIndex; i < recordLen; ++i) {
final char ch = buffer[i];
if (isSubfieldDelimiter(ch)) {
if (isMarker(ch)) {
break;
}
idBuilder.append(ch);
Expand All @@ -353,11 +353,8 @@ private int findRecordId() {
return idFieldMatcher.end();
}

private static boolean isSubfieldDelimiter(final char ch) {
return ch == PicaConstants.RECORD_MARKER
|| ch == PicaConstants.FIELD_MARKER
|| ch == PicaConstants.FIELD_END_MARKER
|| ch == PicaConstants.SUBFIELD_MARKER;
/**
 * Tells whether a character is a marker in the serialization this decoder
 * is configured for.
 *
 * @param ch the character to test
 * @return true if {@code ch} maps to any marker other than
 *         {@code PicaConstants.NO_MARKER}
 */
private boolean isMarker(final char ch) {
    final PicaConstants marker = PicaConstants.from(isNormalized, ch);
    return marker != PicaConstants.NO_MARKER;
}

}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2016,2019 Christoph Böhme and hbz
* Copyright 2016,2019 Christoph Böhme and others
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -25,30 +25,33 @@
* The parser ignores spaces in field names. They are not included in the
* field name.
*
* Empty subfields are skipped. For instance, parsing the following normalized
* pica+ would NOT produce an empty literal: 003@ \u001f\u001e. The parser also
* Empty subfields are skipped. For instance, parsing the following input
* would NOT produce an empty literal: 003@ \u001f\u001e. The parser also
* skips unnamed fields without any subfields.
*
* @author Christoph Böhme
* @author Pascal Christoph (dr0i)
*
* @author Pascal Christoph (dr0i) (add support for non-normalized pica+)
* @author Fabian Steeg (fsteeg) (switch to enum)
*/
enum PicaParserState {

FIELD_NAME {
@Override
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx, boolean normalized) {
final PicaParserState next;
if(ch==PicaConstants.RECORD_MARKER ||
ch==PicaConstants.FIELD_MARKER ||
ch==PicaConstants.FIELD_END_MARKER){
switch (PicaConstants.from(normalized, ch)) {
case RECORD_MARKER:
case FIELD_MARKER:
case FIELD_END_MARKER:
ctx.emitStartEntity();
ctx.emitEndEntity();
next = FIELD_NAME;
}else if(ch==PicaConstants.SUBFIELD_MARKER){
break;
case SUBFIELD_MARKER:
ctx.emitStartEntity();
next = SUBFIELD_NAME;
}else{
break;
default:
ctx.appendText(ch);
next = this;
}
Expand All @@ -63,16 +66,19 @@ protected void endOfInput(final PicaParserContext ctx) {
},
SUBFIELD_NAME {
@Override
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx, boolean normalized) {
final PicaParserState next;
if(ch==PicaConstants.RECORD_MARKER ||
ch==PicaConstants.FIELD_MARKER ||
ch==PicaConstants.FIELD_END_MARKER){
switch (PicaConstants.from(normalized, ch)) {
case RECORD_MARKER:
case FIELD_MARKER:
case FIELD_END_MARKER:
ctx.emitEndEntity();
next = FIELD_NAME;
}else if(ch==PicaConstants.SUBFIELD_MARKER)
break;
case SUBFIELD_MARKER:
next = this;
else{
break;
default:
ctx.setSubfieldName(ch);
next = SUBFIELD_VALUE;
}
Expand All @@ -86,18 +92,21 @@ protected void endOfInput(final PicaParserContext ctx) {
},
SUBFIELD_VALUE {
@Override
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
protected PicaParserState parseChar(final char ch, final PicaParserContext ctx, boolean normalized) {
final PicaParserState next;
if(ch==PicaConstants.RECORD_MARKER ||
ch==PicaConstants.FIELD_MARKER ||
ch==PicaConstants.FIELD_END_MARKER){
switch (PicaConstants.from(normalized, ch)) {
case RECORD_MARKER:
case FIELD_MARKER:
case FIELD_END_MARKER:
ctx.emitLiteral();
ctx.emitEndEntity();
next = FIELD_NAME;
}else if(ch==PicaConstants.SUBFIELD_MARKER){
break;
case SUBFIELD_MARKER:
ctx.emitLiteral();
next = SUBFIELD_NAME;
}else{
break;
default:
ctx.appendText(ch);
next = this;
}
Expand All @@ -111,7 +120,8 @@ protected void endOfInput(final PicaParserContext ctx) {
}
};

protected abstract PicaParserState parseChar(final char ch, final PicaParserContext ctx);
protected abstract PicaParserState parseChar(final char ch, final PicaParserContext ctx, final boolean normalized);

protected abstract void endOfInput(final PicaParserContext ctx);

}

0 comments on commit 9c28a07

Please sign in to comment.