diff --git a/pdfplumber/utils/text.py b/pdfplumber/utils/text.py index d928c155..7a3163e1 100644 --- a/pdfplumber/utils/text.py +++ b/pdfplumber/utils/text.py @@ -145,6 +145,7 @@ def to_textmap( x_shift: T_num = 0, y_shift: T_num = 0, y_tolerance: T_num = DEFAULT_Y_TOLERANCE, + use_text_flow: bool = False, presorted: bool = False, expand_ligatures: bool = True, ) -> TextMap: @@ -213,17 +214,19 @@ def to_textmap( num_newlines = 0 - words_sorted = ( + words_sorted_doctop = ( self.tuples - if presorted - else sorted(self.tuples, key=lambda x: (x[0]["doctop"], x[0]["x0"])) + if presorted or use_text_flow + else sorted(self.tuples, key=lambda x: float(x[0]["doctop"])) ) - first_word = words_sorted[0][0] + first_word = words_sorted_doctop[0][0] doctop_start = first_word["doctop"] - first_word["top"] for i, ws in enumerate( - cluster_objects(words_sorted, lambda x: float(x[0]["doctop"]), y_tolerance) + cluster_objects( + words_sorted_doctop, lambda x: float(x[0]["doctop"]), y_tolerance + ) ): y_dist = ( (ws[0][0]["doctop"] - (doctop_start + y_shift)) / y_density @@ -245,7 +248,14 @@ def to_textmap( num_newlines += num_newlines_prepend line_len = 0 - for word, chars in sorted(ws, key=lambda x: float(x[0]["x0"])): + + line_words_sorted_x0 = ( + ws + if presorted or use_text_flow + else sorted(ws, key=lambda x: float(x[0]["x0"])) + ) + + for word, chars in line_words_sorted_x0: x_dist = (word["x0"] - x_shift) / x_density if layout else 0 num_spaces_prepend = max(min(1, line_len), round(x_dist) - line_len) _textmap += [(" ", None)] * num_spaces_prepend