metafacture · dr0i · Jun 25, 2019 · Jun 17, 2019 · Jun 25, 2019
diff --git a/metafacture-biblio/src/main/java/org/metafacture/biblio/pica/PicaConstants.java b/metafacture-biblio/src/main/java/org/metafacture/biblio/pica/PicaConstants.java
@@ -1,5 +1,4 @@
-/*
- * Copyright 2016 Christoph Böhme
+/* Copyright 2016,2019 Christoph Böhme and others
  *
  * Licensed under the Apache License, Version 2.0 the "License";
  * you may not use this file except in compliance with the License.
@@ -13,23 +12,45 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.metafacture.biblio.pica;
 
 /**
- * Useful constants for PICA+
+ * Useful constants for PICA+.
+ * PICA+ comes in two possible serializations:
+ * a normalized one and a non-normalized one.
  *
- * @author Christoph Böhme
+ * @author Christoph Böhme (initial implementation)
+ * @author Pascal Christoph (dr0i) (add support for non-normalized pica+)
+ * @author Fabian Steeg (fsteeg) (switch to enum)
  *
  */
-final class PicaConstants {
+enum PicaConstants {
+	// Each marker is (normalized char, non-normalized char); '\0' means "marker absent"
+	RECORD_MARKER('\u001d', '\n'), //
+	FIELD_MARKER('\u001e', '\0'), //
+	SUBFIELD_MARKER('\u001f', '$'), //
+	FIELD_END_MARKER('\n', '\n'), //
+	NO_MARKER('\0', '\0');
+
+	final char normalized; // marker in normalized pica+, '\0' if none
+	final char nonNormalized; // marker in non-normalized pica+, '\0' if none
 
-    public static final char RECORD_MARKER = '\u001d';
-    public static final char FIELD_MARKER = '\u001e';
-    public static final char SUBFIELD_MARKER = '\u001f';
-    public static final char FIELD_END_MARKER = '\n';
+	PicaConstants(char normalized, char nonNormalized) {
+		this.normalized = normalized;
+		this.nonNormalized = nonNormalized;
+	}
 
-    private PicaConstants() {
-        // No instances allowed
-    }
+	public char get(boolean isNormalized) { // marker char for the chosen serialization
+		return isNormalized ? normalized : nonNormalized;
+	}
 
-}
+	public static PicaConstants from(boolean isNormalized, char ch) {
+		for (PicaConstants value : values()) { // first constant whose marker equals ch wins
+			if (ch != '\0' && ch == value.get(isNormalized)) { // '\0' is the "no marker" sentinel
+				return value;
+			}
+		}
+		return NO_MARKER;
+	}
+}
diff --git a/metafacture-biblio/src/main/java/org/metafacture/biblio/pica/PicaDecoder.java b/metafacture-biblio/src/main/java/org/metafacture/biblio/pica/PicaDecoder.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2016 Christoph Böhme
+ * Copyright 2016, 2019 Christoph Böhme and others
  *
  * Licensed under the Apache License, Version 2.0 the "License";
  * you may not use this file except in compliance with the License.
@@ -32,9 +32,11 @@
  * containing multiple records must be split into individual records before
  * passing it to {@code PicaDecoder}.
  * <p>
- * The parser is designed to accept any string as valid input and to parse pica
- * plain format as well as normalised pica. To achieve this, the parser behaves
- * as following:
+ * The parser is designed to accept any string as valid input and to parse
+ * pica+ in both of its serialization forms, non-normalized and normalized.
+ * To achieve this, the parser behaves as follows.
+ * <p>
+ * When parsing normalized pica+:
  * <ul>
  *   <li>The parser assumes that the input starts with a field name.
  *
@@ -56,6 +58,26 @@
  *   that field names, subfields, subfield names or subfield values can be
  *   empty.
  * </ul>
+ * When parsing non-normalized pica+, the parser behaves as follows:
+ * <ul>
+ *   <li>The parser assumes that the input starts with a field name.
+ *
+ *   <li>The field name and the first subfield are separated by a subfield
+ *   marker ($).
+ *
+ *   <li>Fields are separated by record markers (&#92;n) or field end
+ *   markers (&#92;n).
+ *
+ *   <li>Subfields are separated by subfield markers ($).
+ *
+ *   <li>The first character of a subfield is the name of the subfield
+ *
+ *   <li>The parser assumes that the end of the input marks the end of the
+ *   current field and the end of the record.
+ *
+ *   <li>As fields and subfields are never empty in non-normalized pica+,
+ *   they need no special treatment and are parsed like anything else.
+ * </ul>
  * Please note that the record marker is treated as a field delimiter and not
  * as a record delimiter. Records need to be separated prior to parsing them.
  * <p>
@@ -69,7 +91,8 @@
  *
  *   <li>Subfields which only have a name but no value are always parsed.
  *
- *   <li>Unnamed fields are only parsed if the contain not-ignored subfields.
+ *   <li>In normalized pica+, unnamed fields are only parsed if they contain
+ *   not-ignored subfields. In non-normalized pica+, unnamed fields don't exist.
  *
  *   <li>Named fields containing none or only ignored subfields are only parsed
  *   if {@link #setSkipEmptyFields(boolean)} is set to false otherwise they are
@@ -85,7 +108,7 @@
  * {@link #setTrimFieldNames(boolean)} to false.
  * <p>
  * The record id emitted with the <i>start-record</i> event is extracted from
- * one of the following pica fields:
+ * one of the following non-normalized pica+ fields:
  * <ul>
  *   <li><i>003&#64; $0</i>
  *   <li><i>107F $0</i>
@@ -97,7 +120,7 @@
  * found in the record a {@link MissingIdException} is thrown otherwise the
  * record identifier is an empty string.
  * <p>
- * For example, when run on the input
+ * For example, when run on this input in its normalized serialization form:
  * <pre>
  * 003&#64; &#92;u001f01234&#92;u001e
  * 028A &#92;u001faAndy&#92;u001fdWarhol&#92;u001e
@@ -120,6 +143,8 @@
  * support other pica encodings.
  *
  * @author Christoph Böhme
+ * @author Pascal Christoph (dr0i) (add support for non-normalized pica+)
+ * @author Fabian Steeg (fsteeg) (switch to enum)
  *
  */
 @Description("Parses pica+ records. The parser only parses single records. " +
@@ -131,22 +156,50 @@
 public final class PicaDecoder
         extends DefaultObjectPipe<String, StreamReceiver> {
 
-    private static final String START_MARKERS ="(?:^|" + PicaConstants.FIELD_MARKER +
-            "|" + PicaConstants.FIELD_END_MARKER + "|" + PicaConstants.RECORD_MARKER + ")";
-    private static final Pattern ID_FIELDS_PATTERN = Pattern.compile(
-            START_MARKERS + "(?:003@|203@(?:/..+)?|107F) " + PicaConstants.SUBFIELD_MARKER + "0");
-
+    private static String START_MARKERS;
+    private static Pattern ID_FIELDS_PATTERN;
     private static final int BUFFER_SIZE = 1024 * 1024;
 
-    private final Matcher idFieldMatcher = ID_FIELDS_PATTERN.matcher("");
+    private Matcher idFieldMatcher;
     private final StringBuilder idBuilder = new StringBuilder();
     private final PicaParserContext parserContext = new PicaParserContext();
 
     private char[] buffer = new char[BUFFER_SIZE];
     private int recordLen;
 
     private boolean ignoreMissingIdn;
+    private boolean isNormalized;
+
+    public PicaDecoder() {
+        this(true); // normalized pica+ is the default serialization
+    }
+
+    public PicaDecoder(boolean normalized) {
+        setNormalizedSerialization(normalized); // also builds the record-id matcher
+    }
 
+    /**
+     * Controls whether the input is read as normalized or non-normalized
+     * pica+. Normalized pica+ is assumed by default.
+     *
+     * @param normalized if true, the input is treated as normalized pica+;
+     *                   if false, it is treated as non-normalized pica+.
+     */
+    public void setNormalizedSerialization(boolean normalized) {
+        this.isNormalized = normalized;
+        makeConstants(); // the id-field matcher depends on the serialization's marker characters
+    }
+
+    private void makeConstants() { // rebuilds this decoder's record-id matcher for the active serialization
+        final String startMarkers = "(?:^|" + PicaConstants.FIELD_MARKER.get(isNormalized) + "|"
+                + PicaConstants.FIELD_END_MARKER.get(isNormalized) + "|"
+                + PicaConstants.RECORD_MARKER.get(isNormalized) + "|.*\n" + ")";
+        final Pattern idFieldsPattern = Pattern // kept local: per-instance state must not go through shared statics
+                .compile(startMarkers + "(?:003@|203@(?:/..+)?|107F) "
+                        + " ?(\\" + PicaConstants.SUBFIELD_MARKER.get(isNormalized) + "|"
+                        + PicaConstants.SUBFIELD_MARKER.get(isNormalized) + ")0");
+        idFieldMatcher = idFieldsPattern.matcher("");
+    }
     /**
      * Controls whether records having no record id are reported as faulty. By
      * default such records are reported by the {@code PicaDecoder} by throwing
@@ -250,7 +303,7 @@ public void process(final String record) {
 
         PicaParserState state = PicaParserState.FIELD_NAME;
         for (int i = 0; i < recordLen; ++i) {
-            state = state.parseChar(buffer[i], parserContext);
+            state = state.parseChar(buffer[i], parserContext, isNormalized);
         }
         state.endOfInput(parserContext);
 
@@ -284,7 +337,7 @@ private String extractRecordId() {
         idBuilder.setLength(0);
         for (int i = idFromIndex; i < recordLen; ++i) {
             final char ch = buffer[i];
-            if (isSubfieldDelimiter(ch)) {
+            if (isMarker(ch)) {
                 break;
             }
             idBuilder.append(ch);
@@ -300,11 +353,8 @@ private int findRecordId() {
         return idFieldMatcher.end();
     }
 
-    private static boolean isSubfieldDelimiter(final char ch) {
-        return ch == PicaConstants.RECORD_MARKER
-                || ch == PicaConstants.FIELD_MARKER
-                || ch == PicaConstants.FIELD_END_MARKER
-                || ch == PicaConstants.SUBFIELD_MARKER;
+    private boolean isMarker(final char ch) { // true if ch is any marker of the active serialization
+        return PicaConstants.from(isNormalized, ch) != PicaConstants.NO_MARKER;
     }
 
 }
diff --git a/metafacture-biblio/src/main/java/org/metafacture/biblio/pica/PicaParserState.java b/metafacture-biblio/src/main/java/org/metafacture/biblio/pica/PicaParserState.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2016 Christoph Böhme
+ * Copyright 2016,2019 Christoph Böhme and others
  *
  * Licensed under the Apache License, Version 2.0 the "License";
  * you may not use this file except in compliance with the License.
@@ -30,23 +30,24 @@
  * skips unnamed fields without any subfields.
  *
  * @author Christoph Böhme
- *
+ * @author Pascal Christoph (dr0i) (add support for non-normalized pica+)
+ * @author Fabian Steeg (fsteeg) (switch to enum)
  */
 enum PicaParserState {
 
     FIELD_NAME {
         @Override
-        protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
+        protected PicaParserState parseChar(final char ch, final PicaParserContext ctx, boolean normalized) {
             final PicaParserState next;
-            switch (ch) {
-            case PicaConstants.RECORD_MARKER:
-            case PicaConstants.FIELD_MARKER:
-            case PicaConstants.FIELD_END_MARKER:
+            switch (PicaConstants.from(normalized, ch)) {
+            case RECORD_MARKER:
+            case FIELD_MARKER:
+            case FIELD_END_MARKER:
                 ctx.emitStartEntity();
                 ctx.emitEndEntity();
                 next = FIELD_NAME;
                 break;
-            case PicaConstants.SUBFIELD_MARKER:
+            case SUBFIELD_MARKER:
                 ctx.emitStartEntity();
                 next = SUBFIELD_NAME;
                 break;
@@ -65,16 +66,16 @@ protected void endOfInput(final PicaParserContext ctx) {
     },
     SUBFIELD_NAME {
         @Override
-        protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
+        protected PicaParserState parseChar(final char ch, final PicaParserContext ctx, boolean normalized) {
             final PicaParserState next;
-            switch (ch) {
-            case PicaConstants.RECORD_MARKER:
-            case PicaConstants.FIELD_MARKER:
-            case PicaConstants.FIELD_END_MARKER:
+            switch (PicaConstants.from(normalized, ch)) {
+            case RECORD_MARKER:
+            case FIELD_MARKER:
+            case FIELD_END_MARKER:
                 ctx.emitEndEntity();
                 next = FIELD_NAME;
                 break;
-            case PicaConstants.SUBFIELD_MARKER:
+            case SUBFIELD_MARKER:
                 next = this;
                 break;
             default:
@@ -91,17 +92,17 @@ protected void endOfInput(final PicaParserContext ctx) {
     },
     SUBFIELD_VALUE {
         @Override
-        protected PicaParserState parseChar(final char ch, final PicaParserContext ctx) {
+        protected PicaParserState parseChar(final char ch, final PicaParserContext ctx, boolean normalized) {
             final PicaParserState next;
-            switch (ch) {
-            case PicaConstants.RECORD_MARKER:
-            case PicaConstants.FIELD_MARKER:
-            case PicaConstants.FIELD_END_MARKER:
+            switch (PicaConstants.from(normalized, ch)) {
+            case RECORD_MARKER:
+            case FIELD_MARKER:
+            case FIELD_END_MARKER:
                 ctx.emitLiteral();
                 ctx.emitEndEntity();
                 next = FIELD_NAME;
                 break;
-            case PicaConstants.SUBFIELD_MARKER:
+            case SUBFIELD_MARKER:
                 ctx.emitLiteral();
                 next = SUBFIELD_NAME;
                 break;
@@ -119,7 +120,7 @@ protected void endOfInput(final PicaParserContext ctx) {
         }
     };
 
-    protected abstract PicaParserState parseChar(final char ch, final PicaParserContext ctx);
+    protected abstract PicaParserState parseChar(final char ch, final PicaParserContext ctx, final boolean normalized);
 
     protected abstract void endOfInput(final PicaParserContext ctx);