diff --git a/CHANGELOG.md b/CHANGELOG.md index e35fb679..4869a170 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,11 +2,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/). -## [0.5.29] - [unreleased] +## [0.6.0] - [unreleased] ### Added - Add `utils.merge_bboxes(bboxes)`, which returns the smallest bounding box that contains all bounding boxes in the `bboxes` argument. ## Changed +- Upgrade `pdfminer.six` from `20200517` to `20211012`; see [that library's changelog](https://github.com/pdfminer/pdfminer.six/blob/develop/CHANGELOG.md) for details, but a key difference is an improvement in how it assigns `line`, `rect`, and `curve` objects. (Diagonal two-point lines, for instance, are now `line` objects instead of `curve` objects.) - Change behavior of horizontal `text_strategy`, so that it uses the top and bottom of *every* word, not just the top of every word and the bottom of the last. ([#467](https://github.com/jsvine/pdfplumber/pull/467) + [#466](https://github.com/jsvine/pdfplumber/issues/466) + [#265](https://github.com/jsvine/pdfplumber/issues/265)) [h/t @bobluda + @samkit-jain] ### Fixed diff --git a/requirements.txt b/requirements.txt index 20779c53..ff3250e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -pdfminer.six==20200517 +pdfminer.six==20211012 Pillow>=7.0.0 Wand diff --git a/tests/test_convert.py b/tests/test_convert.py index be09d759..8b64e6f4 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -55,7 +55,7 @@ def test_additional_attr_types(self): def test_csv(self): c = self.pdf.to_csv() - assert c.split("\r\n")[2] == ( + assert c.split("\r\n")[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," '18.0,12.996,,,,,,TimesNewRomanPSMT,,,,"(0, 0, 0)",,,18.0,,,,,Y,,1,' ) @@ -68,7 +68,7 @@ def test_csv(self): def test_csv_all_types(self): c = self.pdf.to_csv(types=None) - assert c.split("\r\n")[1].split(",")[0] == "curve" + assert c.split("\r\n")[1].split(",")[0] == "line" def test_cli(self): res = run( diff --git a/tests/test_issues.py b/tests/test_issues.py index e143c391..d04d8522 100644 --- a/tests/test_issues.py +++ b/tests/test_issues.py @@ -38,7 +38,7 @@ def filter_rects(rects): rects_found.append(rect) return rects_found - def determine_if_checked(checkbox, curve_list): + def determine_if_checked(checkbox, checklines): """ This figures out if the bounding box of (either) line used to make one half of the 'x' is the right size and overlaps with a rectangle. @@ -49,7 +49,7 @@ def determine_if_checked(checkbox, curve_list): But here we only test there's at least one. """ - for curve in curve_list: + for cl in checklines: if ( checkbox["height"] > (RECT_HEIGHT - RECT_TOLERANCE) @@ -61,13 +61,9 @@ def determine_if_checked(checkbox, curve_list): xmatch = False ymatch = False - if max(checkbox["x0"], curve["x0"]) <= min( - checkbox["x1"], curve["x1"] - ): + if max(checkbox["x0"], cl["x0"]) <= min(checkbox["x1"], cl["x1"]): xmatch = True - if max(checkbox["y0"], curve["y0"]) <= min( - checkbox["y1"], curve["y1"] - ): + if max(checkbox["y0"], cl["y0"]) <= min(checkbox["y1"], cl["y1"]): ymatch = True if xmatch and ymatch: return True @@ -75,10 +71,12 @@ def determine_if_checked(checkbox, curve_list): return False p0 = pdf.pages[0] - curves = p0.objects["curve"] + checklines = [ + line for line in p0.lines if line["height"] == line["width"] + ] # These are diagonals rects = filter_rects(p0.objects["rect"]) - n_checked = sum([determine_if_checked(rect, curves) for rect in rects]) + n_checked = sum([determine_if_checked(rect, checklines) for rect in rects]) assert n_checked == 5 pdf.close()