Fixes GH-63: BufferUtils.ensureCapacity now clears the input buffer. …

…This also affects WordData methods that accept a reusable byte buffer -- it is now always cleared prior to being flipped and returned.
morfologik · Feb 17, 2016 · f9592c5 · f9592c5
1 parent b48255d
commit f9592c5
Show file tree

Hide file tree

Showing 13 changed files with 65 additions and 79 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -11,6 +11,12 @@ New Features
 
  * GH-73: Update Polish stemming dictionaries to polimorfologik 2.1. (Dawid Weiss)
 
+Other Changes
+
+ * GH-63: BufferUtils.ensureCapacity now clears the input buffer. This also
+   affects WordData methods that accept a reusable byte buffer -- it is now
+   always cleared prior to being flipped and returned. (Dawid Weiss)
+
 ======================= morfologik-stemming 2.0.2-SNAPSHOT =======================
 
 Bug Fixes

diff --git a/morfologik-polish/src/test/java/morfologik/stemming/polish/PolishMorfologikStemmerTest.java b/morfologik-polish/src/test/java/morfologik/stemming/polish/PolishMorfologikStemmerTest.java
@@ -89,7 +89,6 @@ public void testWordDataFields() throws IOException {
       // The buffer should be present in stems set.
       assertTrue(stem, stems.contains(stem));
       // Buffer large enough to hold the contents.
-      temp.clear();
       assertSame(temp, wd.getStemBytes(temp));
       // The copy and the clone should be identical.
       assertEquals(0, copy.compareTo(temp));

diff --git a/morfologik-speller/src/main/java/morfologik/speller/Speller.java b/morfologik-speller/src/main/java/morfologik/speller/Speller.java
@@ -205,8 +205,7 @@ private ByteBuffer charsToBytes(final CharBuffer chars, ByteBuffer bytes) {
 
   private ByteBuffer charSequenceToBytes(final CharSequence word) {
     // Encode word characters into bytes in the same encoding as the FSA's.
-    charBuffer.clear();
-    charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length());
+    charBuffer = BufferUtils.clearAndEnsureCapacity(charBuffer, word.length());
     for (int i = 0; i < word.length(); i++) {
       final char chr = word.charAt(i);
       charBuffer.put(chr);
@@ -416,10 +415,8 @@ public List<String> findReplacements(final String w) {
           candidate = new char[MAX_WORD_LENGTH];
           candLen = candidate.length;
           effectEditDistance = wordLen <= editDistance ? wordLen - 1 : editDistance;
-          charBuffer = BufferUtils.ensureCapacity(charBuffer, MAX_WORD_LENGTH);
-          byteBuffer = BufferUtils.ensureCapacity(byteBuffer, MAX_WORD_LENGTH);
-          charBuffer.clear();
-          byteBuffer.clear();
+          charBuffer = BufferUtils.clearAndEnsureCapacity(charBuffer, MAX_WORD_LENGTH);
+          byteBuffer = BufferUtils.clearAndEnsureCapacity(byteBuffer, MAX_WORD_LENGTH);
           final byte[] prevBytes = new byte[0];
           findRepl(0, fsa.getRootNode(), prevBytes, 0, 0);
         }
@@ -443,8 +440,7 @@ private void findRepl(final int depth, final int node, final byte[] prevBytes, f
     // char separatorChar = dictionaryMetadata.getSeparatorAsChar();
     int dist = 0;
     for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
-      byteBuffer = BufferUtils.ensureCapacity(byteBuffer, prevBytes.length + 1);
-      byteBuffer.clear();
+      byteBuffer = BufferUtils.clearAndEnsureCapacity(byteBuffer, prevBytes.length + 1);
       byteBuffer.put(prevBytes);
       byteBuffer.put(fsa.getArcLabel(arc));
       final int bufPos = byteBuffer.position();

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/BufferUtils.java b/morfologik-stemming/src/main/java/morfologik/stemming/BufferUtils.java
@@ -13,41 +13,45 @@ private BufferUtils() {
   }
 
   /**
-   * Ensure the byte buffer's capacity. If a new buffer is allocated, its
-   * content is empty (the old buffer's contents is not copied).
+   * Ensure the buffer's capacity is large enough to hold a given number
+   * of elements. If the input buffer is not large enough, a new buffer is allocated
+   * and returned.
    * 
-   * @param capacity The required capacity of the buffer.
+   * @param elements The required number of elements to be appended to the buffer.
    * 
    * @param buffer
    *          The buffer to check or <code>null</code> if a new buffer should be
    *          allocated.
    *
    * @return Returns the same buffer or a new buffer with the given capacity. 
    */
-  public static ByteBuffer ensureCapacity(ByteBuffer buffer, int capacity) {
-    // TODO: GH-63 
-    if (buffer == null || buffer.capacity() < capacity) {
-      buffer = ByteBuffer.allocate(capacity);
+  public static ByteBuffer clearAndEnsureCapacity(ByteBuffer buffer, int elements) {
+    if (buffer == null || buffer.capacity() < elements) {
+      buffer = ByteBuffer.allocate(elements);
+    } else {
+      buffer.clear();
     }
     return buffer;
   }
 
   /**
-   * Ensure the char buffer's capacity. If a new buffer is allocated, its
-   * content is empty (the old buffer's contents is not copied).
+   * Ensure the buffer's capacity is large enough to hold a given number
+   * of elements. If the input buffer is not large enough, a new buffer is allocated
+   * and returned.
    * 
-   * @param capacity The required capacity of the buffer.
+   * @param elements The required number of elements to be appended to the buffer.
    * 
    * @param buffer
    *          The buffer to check or <code>null</code> if a new buffer should be
    *          allocated.
    *
    * @return Returns the same buffer or a new buffer with the given capacity. 
    */
-  public static CharBuffer ensureCapacity(CharBuffer buffer, int capacity) {
-    // TODO: GH-63 
-    if (buffer == null || buffer.capacity() < capacity) {
-      buffer = CharBuffer.allocate(capacity);
+  public static CharBuffer clearAndEnsureCapacity(CharBuffer buffer, int elements) {
+    if (buffer == null || buffer.capacity() < elements) {
+      buffer = CharBuffer.allocate(elements);
+    } else {
+      buffer.clear();
     }
     return buffer;
   }

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java
@@ -54,16 +54,14 @@ public WordData next() {
       throw new RuntimeException("Invalid dictionary " + "entry format (missing separator).");
     }
 
-    inflectedBuffer.clear();
-    inflectedBuffer = BufferUtils.ensureCapacity(inflectedBuffer, sepPos);
+    inflectedBuffer = BufferUtils.clearAndEnsureCapacity(inflectedBuffer, sepPos);
     inflectedBuffer.put(ba, 0, sepPos);
     inflectedBuffer.flip();
 
     inflectedCharBuffer = bytesToChars(inflectedBuffer, inflectedCharBuffer);
     entry.update(inflectedBuffer, inflectedCharBuffer);
 
-    temp.clear();
-    temp = BufferUtils.ensureCapacity(temp, bbSize - sepPos);
+    temp = BufferUtils.clearAndEnsureCapacity(temp, bbSize - sepPos);
     sepPos++;
     temp.put(ba, sepPos, bbSize - sepPos);
     temp.flip();
@@ -88,8 +86,7 @@ public WordData next() {
                                             inflectedBuffer,
                                             ByteBuffer.wrap(ba, 0, sepPos));
     } else {
-      entry.stemBuffer = BufferUtils.ensureCapacity(entry.stemBuffer, sepPos);
-      entry.stemBuffer.clear();
+      entry.stemBuffer = BufferUtils.clearAndEnsureCapacity(entry.stemBuffer, sepPos);
       entry.stemBuffer.put(ba, 0, sepPos);
       entry.stemBuffer.flip();
     }
@@ -102,8 +99,7 @@ public WordData next() {
     /*
      * Decode the tag data.
      */
-    entry.tagBuffer = BufferUtils.ensureCapacity(entry.tagBuffer, bbSize - sepPos);
-    entry.tagBuffer.clear();
+    entry.tagBuffer = BufferUtils.clearAndEnsureCapacity(entry.tagBuffer, bbSize - sepPos);
     entry.tagBuffer.put(ba, sepPos, bbSize - sepPos);
     entry.tagBuffer.flip();
 

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java b/morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java
@@ -148,8 +148,7 @@ public List<WordData> lookup(CharSequence word) {
     formsList.wrap(forms, 0, 0);
 
     // Encode word characters into bytes in the same encoding as the FSA's.
-    charBuffer.clear();
-    charBuffer = BufferUtils.ensureCapacity(charBuffer, word.length());
+    charBuffer = BufferUtils.clearAndEnsureCapacity(charBuffer, word.length());
     for (int i = 0; i < word.length(); i++) {
       char chr = word.charAt(i);
       if (chr == separatorChar)
@@ -229,8 +228,7 @@ public List<WordData> lookup(CharSequence word) {
            */
           final int tagSize = bbSize - sepPos;
           if (tagSize > 0) {
-            wordData.tagBuffer = BufferUtils.ensureCapacity(wordData.tagBuffer, tagSize);
-            wordData.tagBuffer.clear();
+            wordData.tagBuffer = BufferUtils.clearAndEnsureCapacity(wordData.tagBuffer, tagSize);
             wordData.tagBuffer.put(ba, sepPos, tagSize);
             wordData.tagBuffer.flip();
           }
@@ -278,8 +276,7 @@ public static String applyReplacements(CharSequence word, LinkedHashMap<String,
    */
   private ByteBuffer charsToBytes(CharBuffer chars, ByteBuffer bytes) {
     bytes.clear();
-    final int maxCapacity = (int) (chars.remaining() * encoder
-        .maxBytesPerChar());
+    final int maxCapacity = (int) (chars.remaining() * encoder.maxBytesPerChar());
     if (bytes.capacity() <= maxCapacity) {
       bytes = ByteBuffer.allocate(maxCapacity);
     }

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/NoEncoder.java b/morfologik-stemming/src/main/java/morfologik/stemming/NoEncoder.java
@@ -8,8 +8,7 @@
 public class NoEncoder implements ISequenceEncoder {
   @Override
   public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target) {
-    reuse = BufferUtils.ensureCapacity(reuse, target.remaining());
-    reuse.clear();
+    reuse = BufferUtils.clearAndEnsureCapacity(reuse, target.remaining());
 
     target.mark();
     reuse.put(target)
@@ -21,8 +20,7 @@ public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
 
   @Override
   public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) {
-    reuse = BufferUtils.ensureCapacity(reuse, encoded.remaining());
-    reuse.clear();
+    reuse = BufferUtils.clearAndEnsureCapacity(reuse, encoded.remaining());
 
     encoded.mark();
     reuse.put(encoded)

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/TrimInfixAndSuffixEncoder.java b/morfologik-stemming/src/main/java/morfologik/stemming/TrimInfixAndSuffixEncoder.java
@@ -64,8 +64,7 @@ public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
         // Compute temporary src with the infix removed.
         // Concatenate in scratch space for simplicity.
         final int len2 = source.remaining() - (i + j);
-        scratch = BufferUtils.ensureCapacity(scratch, i + len2);
-        scratch.clear();
+        scratch = BufferUtils.clearAndEnsureCapacity(scratch, i + len2);
         scratch.put(source.array(), 0, i);
         scratch.put(source.array(), i + j, len2);
         scratch.flip();
@@ -99,8 +98,7 @@ public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
     }
 
     final int len1 = target.remaining() - maxSubsequenceLength;
-    reuse = BufferUtils.ensureCapacity(reuse, 3 + len1);
-    reuse.clear();
+    reuse = BufferUtils.clearAndEnsureCapacity(reuse, 3 + len1);
 
     reuse.put((byte) ((maxInfixIndex + 'A') & 0xFF));
     reuse.put((byte) ((maxInfixLength + 'A') & 0xFF));
@@ -128,8 +126,7 @@ public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded
 
     final int len1 = source.remaining() - (infixIndex + infixLength + truncateSuffixBytes);
     final int len2 = encoded.remaining() - 3;
-    reuse = BufferUtils.ensureCapacity(reuse, infixIndex + len1 + len2);
-    reuse.clear();
+    reuse = BufferUtils.clearAndEnsureCapacity(reuse, infixIndex + len1 + len2);
 
     assert encoded.hasArray() && 
            encoded.position() == 0 && 

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/TrimPrefixAndSuffixEncoder.java b/morfologik-stemming/src/main/java/morfologik/stemming/TrimPrefixAndSuffixEncoder.java
@@ -60,8 +60,7 @@ public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
     }
 
     final int len1 = target.remaining() - maxSubsequenceLength;
-    reuse = BufferUtils.ensureCapacity(reuse, 2 + len1);
-    reuse.clear();
+    reuse = BufferUtils.clearAndEnsureCapacity(reuse, 2 + len1);
 
     assert target.hasArray() && 
            target.position() == 0 && 
@@ -98,8 +97,7 @@ public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded
 
     final int len1 = source.remaining() - (truncateSuffixBytes + truncatePrefixBytes);
     final int len2 = encoded.remaining() - 2;
-    reuse = BufferUtils.ensureCapacity(reuse, len1 + len2);
-    reuse.clear();
+    reuse = BufferUtils.clearAndEnsureCapacity(reuse, len1 + len2);
 
     reuse.put(source.array(), truncatePrefixBytes, len1);
     reuse.put(encoded.array(), 2, len2);

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/TrimSuffixEncoder.java b/morfologik-stemming/src/main/java/morfologik/stemming/TrimSuffixEncoder.java
@@ -42,8 +42,7 @@ public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target)
       sharedPrefix = 0;
     }
 
-    reuse = BufferUtils.ensureCapacity(reuse, 1 + target.remaining() - sharedPrefix); 
-    reuse.clear();
+    reuse = BufferUtils.clearAndEnsureCapacity(reuse, 1 + target.remaining() - sharedPrefix); 
 
     assert target.hasArray() && 
            target.position() == 0 && 
@@ -69,8 +68,7 @@ public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded
     final int len1 = source.remaining() - truncateBytes;
     final int len2 = encoded.remaining() - 1;
 
-    reuse = BufferUtils.ensureCapacity(reuse, len1 + len2);
-    reuse.clear();
+    reuse = BufferUtils.clearAndEnsureCapacity(reuse, len1 + len2);
 
     assert source.hasArray() && 
            source.position() == 0 && 

diff --git a/morfologik-stemming/src/main/java/morfologik/stemming/WordData.java b/morfologik-stemming/src/main/java/morfologik/stemming/WordData.java
@@ -91,8 +91,11 @@ public final class WordData implements Cloneable {
 
 	/**
 	 * Copy the stem's binary data (no charset decoding) to a custom byte
-	 * buffer. If the buffer is null or not large enough to hold the result, a
-	 * new buffer is allocated.
+	 * buffer.
+	 *
+	 * The buffer is cleared prior to copying and flipped for reading
+	 * upon returning from this method. If the buffer is null or not large 
+	 * enough to hold the result, a new buffer is allocated.
 	 * 
 	 * @param target
 	 *            Target byte buffer to copy the stem buffer to or
@@ -101,8 +104,7 @@ public final class WordData implements Cloneable {
 	 * @return Returns <code>target</code> or the new reallocated buffer.
 	 */
 	public ByteBuffer getStemBytes(ByteBuffer target) {
-		target = BufferUtils.ensureCapacity(target, stemBuffer.remaining());
-    assert target.position() == 0;
+		target = BufferUtils.clearAndEnsureCapacity(target, stemBuffer.remaining());
 		stemBuffer.mark();
 		target.put(stemBuffer);
 		stemBuffer.reset();
@@ -112,8 +114,10 @@ public ByteBuffer getStemBytes(ByteBuffer target) {
 
 	/**
 	 * Copy the tag's binary data (no charset decoding) to a custom byte buffer.
-	 * If the buffer is null or not large enough to hold the result, a new
-	 * buffer is allocated.
+	 * 
+   * The buffer is cleared prior to copying and flipped for reading
+   * upon returning from this method. If the buffer is null or not large 
+   * enough to hold the result, a new buffer is allocated.
 	 * 
 	 * @param target
 	 *            Target byte buffer to copy the tag buffer to or
@@ -122,8 +126,7 @@ public ByteBuffer getStemBytes(ByteBuffer target) {
 	 * @return Returns <code>target</code> or the new reallocated buffer.
 	 */
 	public ByteBuffer getTagBytes(ByteBuffer target) {
-		target = BufferUtils.ensureCapacity(target, tagBuffer.remaining());
-    assert target.position() == 0;
+		target = BufferUtils.clearAndEnsureCapacity(target, tagBuffer.remaining());
 		tagBuffer.mark();
 		target.put(tagBuffer);
 		tagBuffer.reset();
@@ -133,8 +136,11 @@ public ByteBuffer getTagBytes(ByteBuffer target) {
 
 	/**
 	 * Copy the inflected word's binary data (no charset decoding) to a custom
-	 * byte buffer. If the buffer is null or not large enough to hold the
-	 * result, a new buffer is allocated.
+	 * byte buffer.
+	 * 
+   * The buffer is cleared prior to copying and flipped for reading
+   * upon returning from this method. If the buffer is null or not large 
+   * enough to hold the result, a new buffer is allocated. 
 	 * 
 	 * @param target
 	 *            Target byte buffer to copy the word buffer to or
@@ -143,8 +149,7 @@ public ByteBuffer getTagBytes(ByteBuffer target) {
 	 * @return Returns <code>target</code> or the new reallocated buffer.
 	 */
 	public ByteBuffer getWordBytes(ByteBuffer target) {
-		target = BufferUtils.ensureCapacity(target, wordBuffer.remaining());
-    assert target.position() == 0;
+		target = BufferUtils.clearAndEnsureCapacity(target, wordBuffer.remaining());
 		wordBuffer.mark();
 		target.put(wordBuffer);
 		wordBuffer.reset();

diff --git a/morfologik-tools/src/main/java/morfologik/tools/DictCompile.java b/morfologik-tools/src/main/java/morfologik/tools/DictCompile.java
@@ -148,29 +148,25 @@ public ExitStatus call() throws Exception {
         sep2 = row.length;
       }
 
-      source.clear();
-      source = BufferUtils.ensureCapacity(source, sep1);
+      source = BufferUtils.clearAndEnsureCapacity(source, sep1);
       source.put(row, 0, sep1);
       source.flip();
 
       final int len = sep2 - (sep1 + 1);
-      target.clear();
-      target = BufferUtils.ensureCapacity(target, len);
+      target = BufferUtils.clearAndEnsureCapacity(target, len);
       target.put(row, sep1 + 1, len);
       target.flip();
 
       final int len2 = row.length - (sep2 + 1);
-      tag.clear();
+      tag = BufferUtils.clearAndEnsureCapacity(tag, len2);
       if (len2 > 0) {
-        tag = BufferUtils.ensureCapacity(tag, len2);
         tag.put(row, sep2 + 1, len2);
       }
       tag.flip();
 
       encoded = sequenceEncoder.encode(encoded, target, source);
 
-      assembled.clear();
-      assembled = BufferUtils.ensureCapacity(assembled, 
+      assembled = BufferUtils.clearAndEnsureCapacity(assembled, 
           target.remaining() + 1 +
           encoded.remaining() + 1 + 
           tag.remaining());

diff --git a/morfologik-tools/src/main/java/morfologik/tools/DictDecompile.java b/morfologik-tools/src/main/java/morfologik/tools/DictDecompile.java
@@ -78,7 +78,6 @@ public ExitStatus call() throws Exception {
     try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(output))) {
       boolean hasTags = false;
       for (WordData wd : lookup) {
-        tag.clear();
         tag = wd.getTagBytes(tag);
         if (tag.hasRemaining()) {
           hasTags = true;
@@ -87,11 +86,8 @@ public ExitStatus call() throws Exception {
       }
 
       for (WordData wd : lookup) {
-        stem.clear();
         stem = wd.getStemBytes(stem);
-        word.clear();
         word = wd.getWordBytes(word);
-        tag.clear();
         tag = wd.getTagBytes(tag);
 
         write(os, stem);