Skip to content

Commit

Permalink
Text extraction improvements and bug fixes
Browse files Browse the repository at this point in the history
Word extraction will respect spaces if they are present in strings
stored in the PDF, and not try to split or recombine them
inappropriately. Since most PDFs now have pretty logically-organized
content streams, the old graphical reassembly is less important.

Resulting word rectangles are now correct for more cases of odd PDFs.
Rotated pages are not yet handled, but media boxes with non-zero offsets
now work.

Font character decoding works for Identity-H encoding which is one form
of multi-byte encoding. The decoding has been reorganized because, previously,
measuring was based on the characters themselves, not the code
points that were originally parsed. This meant that some measurements
would be off, e.g. in the case of ligatures (which return multiple
characters, but are only a single code point).

Some minor adjustments were made in CMap and CMapAware font handling so
that this could be done.
  • Loading branch information
daviddurand committed Apr 20, 2018
1 parent 4dc66b3 commit 0c341c8
Show file tree
Hide file tree
Showing 10 changed files with 697 additions and 256 deletions.
107 changes: 81 additions & 26 deletions openpdf/src/main/java/com/lowagie/text/pdf/CMapAwareDocumentFont.java
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ public int getWidth(int char1) {
* font's encoding.
*/
private String decodeSingleCID(byte[] bytes, int offset, int len) {
if (toUnicodeCmap != null) {
if (hasUnicodeCMAP()) {
if (offset + len > bytes.length) {
throw new ArrayIndexOutOfBoundsException(
MessageLocalization.getComposedMessage(
Expand All @@ -216,32 +216,87 @@ private String decodeSingleCID(byte[] bytes, int offset, int len) {
throw new Error("Multi-byte glyphs not implemented yet");
}

/**
* Decodes a string of bytes (encoded in the font's encoding) into a unicode
* string This will use the ToUnicode map of the font, if available,
* otherwise it uses the font's encoding
*
* @param cidbytes
* the bytes that need to be decoded
* @return the unicode String that results from decoding
* @since 2.1.7
*/
public String decode(byte[] cidbytes, final int offset, final int len) {
StringBuffer sb = new StringBuffer(); // it's a shame we can't make this
// StringBuilder
for (int i = offset; i < offset + len; i++) {
String rslt = decodeSingleCID(cidbytes, i, 1);
if (rslt == null && i + 1 < offset + len) {
rslt = decodeSingleCID(cidbytes, i, 2);
i++;
}
if (rslt != null) {
sb.append(rslt);
}
}
/**
 * Reports whether a ToUnicode map is attached to this font.
 *
 * @return {@code true} when {@code toUnicodeCmap} is non-null, {@code false} otherwise.
 */
public boolean hasUnicodeCMAP() {
    final boolean cmapPresent = toUnicodeCmap != null;
    return cmapPresent;
}

return sb.toString();
}
/**
 * Decodes a string of bytes (encoded in the font's encoding) into a unicode string. This will
 * use the ToUnicode map of the font, if available, otherwise it uses the font's encoding.
 *
 * Each position in the range is first decoded as a one-byte code. If that yields
 * {@code null} and at least one more byte remains in the range, the two bytes starting at
 * that position are decoded as a single code and the second byte is consumed, whether or
 * not the two-byte decoding produced a result. Codes that decode to {@code null} contribute
 * nothing to the output.
 *
 * @param cidbytes
 *            the bytes that need to be decoded
 * @param offset
 *            index in {@code cidbytes} of the first byte to decode
 * @param len
 *            number of bytes to decode, starting at {@code offset}
 * @return the unicode String that results from decoding
 * @since 2.1.7
 */
public String decode(byte[] cidbytes,
        final int offset,
        final int len) {
    // The accumulator never leaves this method, so the unsynchronized StringBuilder
    // is sufficient; StringBuffer would only add locking overhead.
    StringBuilder sb = new StringBuilder();
    final int end = offset + len;
    for (int i = offset; i < end; i++) {
        String rslt = decodeSingleCID(cidbytes, i, 1);
        if (rslt == null && i + 1 < end) {
            // No one-byte mapping: retry with the next byte included and step past it.
            rslt = decodeSingleCID(cidbytes, i, 2);
            i++;
        }
        if (rslt != null) {
            sb.append(rslt);
        }
    }

    return sb.toString();
}

/**
 * Decodes a string. This is a normal Java string, but if the range of character values
 * exceeds the range of the encoding for the font, this will fail. Required since we need to
 * process the characters of strings, and we can't determine the character boundaries in
 * advance, especially because of Identity-H encoded fonts which have two-byte character
 * indexes.
 *
 * PdfString is used to hold character code points, even though the bytes may not map 1-1.
 * It's not possible to change the encoding once a string is in place.
 *
 * Every character is passed to {@link #decode(char)}; characters that decode to
 * {@code null} contribute nothing to the output.
 *
 * @param chars
 *            the characters that need to be decoded; each char value is treated as one
 *            character code of this font
 * @return the unicode String that results from decoding
 * @throws Error
 *             propagated from {@link #decode(char)} when a character cannot be decoded
 */
public String decode(String chars) {
    // Method-local accumulator: StringBuilder avoids StringBuffer's needless locking.
    StringBuilder sb = new StringBuilder();
    // Index with charAt rather than toCharArray() so no copy of the input is made.
    for (int i = 0; i < chars.length(); i++) {
        String result = decode(chars.charAt(i));
        if (result != null) {
            sb.append(result);
        }
    }

    return sb.toString();
}

/**
 * Decodes one character whose value is a character code in this font.
 *
 * When {@link #hasUnicodeCMAP()} is true, the result of {@code toUnicodeCmap.lookup(c)} is
 * returned unchanged. Otherwise codes up to 0xff are translated through the
 * {@code cidbyte2uni} table, and larger codes are rejected.
 *
 * @param c character code to decode
 * @return the decoded text for {@code c} according to the font's current encoding
 * @throws Error if there is no ToUnicode map and {@code c} is greater than 0xff
 */
public String decode(char c) throws Error {
    if (hasUnicodeCMAP()) {
        return toUnicodeCmap.lookup(c);
    }
    if (c > 0xff) {
        throw new Error("Multi-byte glyphs not implemented yet");
    }
    // Single-byte code: take exactly one entry from the byte-to-unicode table.
    return new String(cidbyte2uni, 0xff & c, 1);
}

/**
* Encodes bytes to a String.
Expand Down
27 changes: 26 additions & 1 deletion openpdf/src/main/java/com/lowagie/text/pdf/PdfString.java
Original file line number Diff line number Diff line change
Expand Up @@ -225,13 +225,38 @@ void decrypt(PdfReader reader) {
value = PdfEncodings.convertToString(bytes, null);
}
}


/**
 * Returns the bytes of {@code originalValue} when one is present, and falls back to
 * {@link #getBytes()} when it is missing.
 *
 * @return the original bytes used to create this PDF string, or the bytes of the current
 *         value if the original value is {@code null}
 */
public byte[] getOriginalBytes() {
    return originalValue != null
            ? PdfEncodings.convertToBytes(originalValue, null)
            : getBytes();
}

/**
 * Returns the characters of this string without any translation, so that one-byte or
 * two-byte character codes can be handed to fonts when extracting text.
 *
 * Intended for use when no encoding transformations are desired. When an encoding is set
 * (non-null and non-empty) an empty array is returned; otherwise each byte from
 * {@link #getOriginalBytes()} is widened to a char as an unsigned value (0..255).
 *
 * @return the character codes of this string as chars, or an empty array when an encoding
 *         is set
 */
public char[] getOriginalChars() {
    final boolean hasEncoding = encoding != null && encoding.length() != 0;
    if (hasEncoding) {
        return new char[0];
    }
    final byte[] raw = getOriginalBytes();
    final char[] codes = new char[raw.length];
    for (int idx = 0; idx < raw.length; idx++) {
        // Mask to keep the byte unsigned when widening to char.
        codes[idx] = (char) (raw[idx] & 0xff);
    }
    return codes;
}

public PdfString setHexWriting(boolean hexWriting) {
this.hexWriting = hexWriting;
return this;
Expand Down
20 changes: 20 additions & 0 deletions openpdf/src/main/java/com/lowagie/text/pdf/fonts/cmaps/CMap.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,26 @@ public boolean hasTwoByteMappings()
return !doubleByteMappings.isEmpty();
}

/**
 * This will perform a lookup into the map.
 *
 * The two-byte mappings are consulted first (when any exist). If they give no result and
 * the code fits in one byte (at most 0xff), the one-byte mappings are consulted.
 *
 * Some characters (e.g. ligatures) decode to character sequences.
 *
 * @param code The code used to lookup.
 * @return The string that matches the lookup, or {@code null} if neither table has an
 *         entry for the code.
 */
public String lookup(char code)
{
    String result = null;
    if (hasTwoByteMappings()) {
        // Integer.valueOf replaces the deprecated boxing constructor and can reuse cached instances.
        result = (String) doubleByteMappings.get(Integer.valueOf(code));
    }
    if (result == null && code <= 0xff && hasOneByteMappings()) {
        result = (String) singleByteMappings.get(Integer.valueOf(code & 0xff));
    }
    return result;
}

/**
* This will perform a lookup into the map.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ public void process(Word completed, String contextName) {
*/
private void clearAccumulator() {
for (TextAssemblyBuffer partialWord : partialWords) {
// Visit each partialWord, calling renderText
partialWord.assemble(this);
}
partialWords.clear();
Expand Down Expand Up @@ -201,67 +202,66 @@ public void renderText(FinalText finalText) {
result.add(finalText);
}

/**
* Captures text using a simplified algorithm for inserting hard returns and
* spaces
*
* @see com.lowagie.text.pdf.parser.AbstractRenderListener#renderText(java.lang.String,
* com.lowagie.text.pdf.parser.GraphicsState,
* com.lowagie.text.pdf.parser.Matrix,
* com.lowagie.text.pdf.parser.Matrix)
*/
@Override
public void renderText(ParsedTextImpl partialWord) {
if (partialWord.getText().trim().isEmpty()) {
return;
}
boolean firstRender = _inProgress == null;
boolean hardReturn = false;
if (firstRender) {
_inProgress = partialWord;
return;
}
Vector start = partialWord.getStartPoint();
Vector lastStart = _inProgress.getStartPoint();
Vector lastEnd = _inProgress.getEndPoint();
/**
* Captures text using a simplified algorithm for inserting hard returns and
* spaces
*
* @see com.lowagie.text.pdf.parser.AbstractRenderListener#renderText(java.lang.String,
* com.lowagie.text.pdf.parser.GraphicsState,
* com.lowagie.text.pdf.parser.Matrix,
* com.lowagie.text.pdf.parser.Matrix)
*/
@Override
public void renderText(ParsedTextImpl partialWord) {
boolean firstRender = _inProgress == null;
boolean hardReturn = false;
if (firstRender) {
_inProgress = partialWord;
return;
}
Vector start = partialWord.getStartPoint();
Vector lastStart = _inProgress.getStartPoint();
Vector lastEnd = _inProgress.getEndPoint();

// see
// http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
float dist = lastEnd.subtract(lastStart)
.cross(lastStart.subtract(start)).lengthSquared()
/ lastEnd.subtract(lastStart).lengthSquared();
// see
// http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
float dist = _inProgress.getBaseline().subtract(lastStart)
.cross(lastStart.subtract(start)).lengthSquared()
/ _inProgress.getBaseline().subtract(lastStart).lengthSquared();

float sameLineThreshold = partialWord.getAscent() * 0.25f;
// let's try using 25% of current leading for vertical slop.
if (dist > sameLineThreshold) {
hardReturn = true;
}
/*
* Note: Technically, we should check both the start and end positions,
* in case the angle of the text changed without any displacement but
* this sort of thing probably doesn't happen much in reality, so we'll
* leave it alone for now
*/
float spacing = lastEnd.subtract(start).length();
if (hardReturn || partialWord.getText().startsWith(" ")) {
result.add(_inProgress.getFinalText(_reader, _page, this));
result.add(new FinalText("\n"));
if (_usePdfMarkupElements) {
result.add(new FinalText("<br class='t-pdf' />"));
}
_inProgress = partialWord;
// System.out.println("<< Hard Return >>");
} else if (spacing < partialWord.getSingleSpaceWidth() / 2.5) {
_inProgress = new Word(_inProgress.getText().trim()
+ partialWord.getText().trim(), partialWord.getAscent(),
partialWord.getDescent(), lastStart,
partialWord.getEndPoint(),
partialWord.getSingleSpaceWidth());
} else {
result.add(_inProgress.getFinalText(_reader, _page, this));
_inProgress = partialWord;
}
}
float sameLineThreshold = partialWord.getAscent() * 0.5f;
// let's try using 50% of the current ascent for vertical slop.
if (dist > sameLineThreshold||Float.isNaN(dist)) {
hardReturn = true;
}
/*
* Note: Technically, we should check both the start and end positions,
* in case the angle of the text changed without any displacement but
* this sort of thing probably doesn't happen much in reality, so we'll
* leave it alone for now
*/
float spacing = lastEnd.subtract(start).length();
if (hardReturn || partialWord.breakBefore()) {
result.add(_inProgress.getFinalText(_reader, _page, this));
if (hardReturn) {
result.add(new FinalText("\n"));
if (_usePdfMarkupElements) {
result.add(new FinalText("<br class='t-pdf' />"));
}
}
_inProgress = partialWord;
// System.out.println("<< Hard Return >>");
} else if (spacing < partialWord.getSingleSpaceWidth() / 2.3 || _inProgress.shouldNotSplit()) {
_inProgress = new Word(_inProgress.getText()
+ partialWord.getText().trim(), partialWord.getAscent(),
partialWord.getDescent(), lastStart,
partialWord.getEndPoint(),
_inProgress.getBaseline(), partialWord.getSingleSpaceWidth(), _inProgress.shouldNotSplit(), _inProgress.breakBefore());
} else {
result.add(_inProgress.getFinalText(_reader, _page, this));
_inProgress = partialWord;
}
}

/**
* Getter.
Expand Down
Loading

0 comments on commit 0c341c8

Please sign in to comment.