Text extraction improvements and bug fixes

Word extraction will respect spaces if they are present in strings stored in the PDF, and will not try to split or recombine them inappropriately. Since most PDFs now have pretty logically-organized content streams, the old graphical reassembly is less important. Resulting word rectangles are now correct for more cases of odd PDFs. Rotated pages are not yet handled, but media boxes with non-zero offsets now work. Font character decoding works for Identity-H encoding, which is one form of multi-byte encoding. The decoding has been reorganized because, previously, measuring was based on the decoded characters themselves, not the code points that were originally parsed. This meant that some measurements would be off, e.g. in the case of ligatures (which return multiple characters, but are only a single code point). Some minor adjustments were made in CMap and CMapAware font handling so that this could be done.
LibrePDF · Apr 20, 2018 · 0c341c8 · 0c341c8
1 parent 4dc66b3
commit 0c341c8
Show file tree

Hide file tree

Showing 10 changed files with 697 additions and 256 deletions.
diff --git a/openpdf/src/main/java/com/lowagie/text/pdf/CMapAwareDocumentFont.java b/openpdf/src/main/java/com/lowagie/text/pdf/CMapAwareDocumentFont.java
@@ -200,7 +200,7 @@ public int getWidth(int char1) {
 	 *         font's encoding.
 	 */
 	private String decodeSingleCID(byte[] bytes, int offset, int len) {
-		if (toUnicodeCmap != null) {
+		if (hasUnicodeCMAP()) {
 			if (offset + len > bytes.length) {
 				throw new ArrayIndexOutOfBoundsException(
 						MessageLocalization.getComposedMessage(
@@ -216,32 +216,87 @@ private String decodeSingleCID(byte[] bytes, int offset, int len) {
 		throw new Error("Multi-byte glyphs not implemented yet");
 	}
 
-	/**
-	 * Decodes a string of bytes (encoded in the font's encoding) into a unicode
-	 * string This will use the ToUnicode map of the font, if available,
-	 * otherwise it uses the font's encoding
-	 * 
-	 * @param cidbytes
-	 *            the bytes that need to be decoded
-	 * @return the unicode String that results from decoding
-	 * @since 2.1.7
-	 */
-	public String decode(byte[] cidbytes, final int offset, final int len) {
-		StringBuffer sb = new StringBuffer(); // it's a shame we can't make this
-												// StringBuilder
-		for (int i = offset; i < offset + len; i++) {
-			String rslt = decodeSingleCID(cidbytes, i, 1);
-			if (rslt == null && i + 1 < offset + len) {
-				rslt = decodeSingleCID(cidbytes, i, 2);
-				i++;
-			}
-			if (rslt != null) {
-				sb.append(rslt);
-			}
-		}
+    /**
+     * @return true if this font has unicode information available.
+     */
+    public boolean hasUnicodeCMAP() {
+        return toUnicodeCmap != null;
+    }
 
-		return sb.toString();
-	}
+    /**
+     * Decodes a string of bytes (encoded in the font's encoding) into a unicode string. This will
+     * use the ToUnicode map of the font, if available, otherwise it uses the font's encoding
+     *
+     * @param cidbytes
+     *            the bytes that need to be decoded
+     * @return the unicode String that results from decoding
+     * @since 2.1.7
+     */
+    public String decode(byte[] cidbytes,
+                         final int offset,
+                         final int len) {
+        StringBuffer sb = new StringBuffer(); // it's a shame we can't make this
+                                             // StringBuilder
+        for (int i = offset; i < offset + len; i++ ) {
+            String rslt = decodeSingleCID(cidbytes, i, 1);
+            if (rslt == null && i + 1 < offset + len) {
+                rslt = decodeSingleCID(cidbytes, i, 2);
+                i++ ;
+            }
+            if (rslt != null) {
+                sb.append(rslt);
+            }
+        }
+
+        return sb.toString();
+    }
+
+    /**
+     * Decodes a string. This is a normal Java string, but if the range of character values
+     * exceeds the range of the encoding for the font, this will fail. Required since we need to
+     * process the characters of strings, and we can't determine the character boundaries in
+     * advance, especially because of Identity-H encoded fonts which have two-byte character
+     * indexes.
+     * 
+     * PdfString is used to hold character code points, even though the bytes may not map 1-1. It's
+     * not possible to change the encoding once a string is in place. 
+     * 
+     * @param chars
+     *            the Characters that need to be decoded
+     * @return the unicode String that results from decoding
+     * @since 2.1.
+     */
+    public String decode(String chars) {
+        StringBuffer sb = new StringBuffer(); // it's a shame we can't make this
+                                             // StringBuilder
+        for (char c : chars.toCharArray()) {
+            String result = decode(c);
+            if (result != null) {
+                sb.append(result);
+            }
+        }
+
+        return sb.toString();
+    }
+
+    /**
+     * Decode a single character whose value represents a code point in this font. Will fail if
+     * the characters do not have values that correspond to valid code points for the font.
+     * @param c character to decode
+     * @return Unicode character corresponding to the remapped code according to the font's current encoding.
+     * @throws Error if the character is out of range
+     */
+    public String decode(char c) throws Error {
+        String result;
+        if (hasUnicodeCMAP()) {
+            result = toUnicodeCmap.lookup(c);
+        } else if (c <= 0xff) {
+            result = new String(cidbyte2uni, 0xff & c, 1);
+        } else {
+            throw new Error("Multi-byte glyphs not implemented yet");
+        }
+        return result;
+    }
 
 	/**
 	 * Encodes bytes to a String.

diff --git a/openpdf/src/main/java/com/lowagie/text/pdf/PdfString.java b/openpdf/src/main/java/com/lowagie/text/pdf/PdfString.java
@@ -225,13 +225,38 @@ void decrypt(PdfReader reader) {
             value = PdfEncodings.convertToString(bytes, null);
         }
     }
-
+
+    /**
+     * @return The original bytes used to create this PDF string, or the bytes of our current value
+     *         if the original bytes are missing.
+     */
     public byte[] getOriginalBytes() {
         if (originalValue == null)
             return getBytes();
         return PdfEncodings.convertToBytes(originalValue, null);
     }
 
+    /**
+     * return the characters in our value without any translation. This allows
+     * a string to be built that holds 2-byte or one-byte character codes, as needed
+     * for processing by fonts when extracting text.
+     * 
+     * Intended for use when no encoding transformations are desired.
+     * @return The code points in this font as chars.
+     */
+    public char[] getOriginalChars() {
+        char[] chars;
+        if (encoding == null || encoding.length() == 0) {
+            byte [] bytes = getOriginalBytes();
+            chars = new char[bytes.length];
+            for (int i = 0; i<bytes.length; i++)
+                chars[i] = (char) (bytes[i]&0xff);
+        } else {
+            chars = new char[0];
+        }
+        return chars;
+    }
+
     public PdfString setHexWriting(boolean hexWriting) {
         this.hexWriting = hexWriting;
         return this;

diff --git a/openpdf/src/main/java/com/lowagie/text/pdf/fonts/cmaps/CMap.java b/openpdf/src/main/java/com/lowagie/text/pdf/fonts/cmaps/CMap.java
@@ -78,6 +78,26 @@ public boolean hasTwoByteMappings()
         return !doubleByteMappings.isEmpty();
     }
 
+    /**
+     * This will perform a lookup into the map.
+     * 
+     * Some characters (e.g. ligatures) decode to character sequences.
+     *
+     * @param code The code used to lookup.
+     * @return The string that matches the lookup.
+     */
+    public String lookup(char code)
+    {
+        String result = null;
+        if (hasTwoByteMappings()) {
+            result = (String) doubleByteMappings.get(new Integer(code));
+        }
+        if (result == null && code <= 0xff && hasOneByteMappings()) {
+            result = (String) singleByteMappings.get(new Integer(code & 0xff));
+        }
+        return result;
+    }
+
     /**
      * This will perform a lookup into the map.
      *

diff --git a/openpdf/src/main/java/com/lowagie/text/pdf/parser/MarkedUpTextAssembler.java b/openpdf/src/main/java/com/lowagie/text/pdf/parser/MarkedUpTextAssembler.java
@@ -126,6 +126,7 @@ public void process(Word completed, String contextName) {
 	 */
 	private void clearAccumulator() {
 		for (TextAssemblyBuffer partialWord : partialWords) {
+		    // Visit each partialWord, calling renderText 
 			partialWord.assemble(this);
 		}
 		partialWords.clear();
@@ -201,67 +202,66 @@ public void renderText(FinalText finalText) {
 		result.add(finalText);
 	}
 
-	/**
-	 * Captures text using a simplified algorithm for inserting hard returns and
-	 * spaces
-	 *
-	 * @see com.lowagie.text.pdf.parser.AbstractRenderListener#renderText(java.lang.String,
-	 *      com.lowagie.text.pdf.parser.GraphicsState,
-	 *      com.lowagie.text.pdf.parser.Matrix,
-	 *      com.lowagie.text.pdf.parser.Matrix)
-	 */
-	@Override
-	public void renderText(ParsedTextImpl partialWord) {
-		if (partialWord.getText().trim().isEmpty()) {
-			return;
-		}
-		boolean firstRender = _inProgress == null;
-		boolean hardReturn = false;
-		if (firstRender) {
-			_inProgress = partialWord;
-			return;
-		}
-		Vector start = partialWord.getStartPoint();
-		Vector lastStart = _inProgress.getStartPoint();
-		Vector lastEnd = _inProgress.getEndPoint();
+    /**
+     * Captures text using a simplified algorithm for inserting hard returns and
+     * spaces
+     *
+     * @see com.lowagie.text.pdf.parser.AbstractRenderListener#renderText(java.lang.String,
+     *      com.lowagie.text.pdf.parser.GraphicsState,
+     *      com.lowagie.text.pdf.parser.Matrix,
+     *      com.lowagie.text.pdf.parser.Matrix)
+     */
+    @Override
+    public void renderText(ParsedTextImpl partialWord) {
+        boolean firstRender = _inProgress == null;
+        boolean hardReturn = false;
+        if (firstRender) {
+            _inProgress = partialWord;
+            return;
+        }
+        Vector start = partialWord.getStartPoint();
+        Vector lastStart = _inProgress.getStartPoint();
+        Vector lastEnd = _inProgress.getEndPoint();
 
-		// see
-		// http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
-		float dist = lastEnd.subtract(lastStart)
-				.cross(lastStart.subtract(start)).lengthSquared()
-				/ lastEnd.subtract(lastStart).lengthSquared();
+        // see
+        // http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
+        float dist = _inProgress.getBaseline().subtract(lastStart)
+                .cross(lastStart.subtract(start)).lengthSquared()
+                / _inProgress.getBaseline().subtract(lastStart).lengthSquared();
 
-		float sameLineThreshold = partialWord.getAscent() * 0.25f;
-		// let's try using 25% of current leading for vertical slop.
-		if (dist > sameLineThreshold) {
-			hardReturn = true;
-		}
-		/*
-		 * Note: Technically, we should check both the start and end positions,
-		 * in case the angle of the text changed without any displacement but
-		 * this sort of thing probably doesn't happen much in reality, so we'll
-		 * leave it alone for now
-		 */
-		float spacing = lastEnd.subtract(start).length();
-		if (hardReturn || partialWord.getText().startsWith(" ")) {
-			result.add(_inProgress.getFinalText(_reader, _page, this));
-			result.add(new FinalText("\n"));
-			if (_usePdfMarkupElements) {
-				result.add(new FinalText("<br class='t-pdf' />"));
-			}
-			_inProgress = partialWord;
-			// System.out.println("<< Hard Return >>");
-		} else if (spacing < partialWord.getSingleSpaceWidth() / 2.5) {
-			_inProgress = new Word(_inProgress.getText().trim()
-					+ partialWord.getText().trim(), partialWord.getAscent(),
-					partialWord.getDescent(), lastStart,
-					partialWord.getEndPoint(),
-					partialWord.getSingleSpaceWidth());
-		} else {
-			result.add(_inProgress.getFinalText(_reader, _page, this));
-			_inProgress = partialWord;
-		}
-	}
+        float sameLineThreshold = partialWord.getAscent() * 0.5f;
+        // let's try using 50% of the current ascent for vertical slop.
+        if (dist > sameLineThreshold||Float.isNaN(dist)) {
+            hardReturn = true;
+        }
+        /*
+         * Note: Technically, we should check both the start and end positions,
+         * in case the angle of the text changed without any displacement but
+         * this sort of thing probably doesn't happen much in reality, so we'll
+         * leave it alone for now
+         */
+        float spacing = lastEnd.subtract(start).length();
+        if (hardReturn || partialWord.breakBefore()) {
+            result.add(_inProgress.getFinalText(_reader, _page, this));
+            if (hardReturn) {
+                result.add(new FinalText("\n"));
+                if (_usePdfMarkupElements) {
+                    result.add(new FinalText("<br class='t-pdf' />"));
+                }
+            }
+            _inProgress = partialWord;
+            // System.out.println("<< Hard Return >>");
+        } else if (spacing < partialWord.getSingleSpaceWidth() / 2.3 || _inProgress.shouldNotSplit()) {
+            _inProgress = new Word(_inProgress.getText()
+                                   + partialWord.getText().trim(), partialWord.getAscent(),
+                                   partialWord.getDescent(), lastStart,
+                                   partialWord.getEndPoint(),
+                                   _inProgress.getBaseline(), partialWord.getSingleSpaceWidth(), _inProgress.shouldNotSplit(), _inProgress.breakBefore());
+        } else {
+            result.add(_inProgress.getFinalText(_reader, _page, this));
+            _inProgress = partialWord;
+        }
+    }
 
 	/**
 	 * Getter.