Skip to content

Commit

Permalink
Fix: Raise ValueError on crop w/ zero-overlap bbox
Browse files Browse the repository at this point in the history
h/t @samkit-jain for catching, per example in #245
  • Loading branch information
jsvine committed Aug 15, 2020
1 parent 3786b1a commit 81d167e
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 10 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ The `pdfplumber.Page` class is at the core of `pdfplumber`. Most things you'll d

| Method | Description |
|--------|-------------|
|`.crop(bounding_box, relative=False)`| Returns a version of the page cropped to the bounding box, which should be expressed as 4-tuple with the values `(x0, top, x1, bottom)`. Cropped pages retain objects that fall at least partly within the bounding box. If an object falls only partly within the box, its dimensions are sliced to fit the bounding box. If `relative=True`, the bounding box is calculated as an offset from the top-left of the page's bounding box, rather than an absolute positioning.|
|`.crop(bounding_box, relative=False)`| Returns a version of the page cropped to the bounding box, which should be expressed as 4-tuple with the values `(x0, top, x1, bottom)`. Cropped pages retain objects that fall at least partly within the bounding box. If an object falls only partly within the box, its dimensions are sliced to fit the bounding box. If `relative=True`, the bounding box is calculated as an offset from the top-left of the page's bounding box, rather than an absolute positioning. (See [Issue #245](https://github.com/jsvine/pdfplumber/issues/245) for a visual example and explanation.)|
|`.within_bbox(bounding_box, relative=False)`| Similar to `.crop`, but only retains objects that fall *entirely* within the bounding box.|
|`.filter(test_function)`| Returns a version of the page with only the `.objects` for which `test_function(obj)` returns `True`.|
|`.extract_text(x_tolerance=3, y_tolerance=3)`| Collates all of the page's character objects into a single string. Adds spaces where the difference between the `x1` of one character and the `x0` of the next is greater than `x_tolerance`. Adds newline characters where the difference between the `doctop` of one character and the `doctop` of the next is greater than `y_tolerance`.|
Expand Down
7 changes: 6 additions & 1 deletion pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,12 @@ def test_proposed_bbox(bbox, parent_bbox):
raise ValueError(f"Bounding box {bbox} has an area of zero.")

overlap = utils.get_bbox_overlap(bbox, parent_bbox)
if overlap is None:
raise ValueError(
f"Bounding box {bbox} is entirely outside "
f"parent page bounding box {parent_bbox}"
)

overlap_area = utils.calculate_area(overlap)
if overlap_area < bbox_area:
raise ValueError(
Expand All @@ -299,7 +305,6 @@ def test_proposed_bbox(bbox, parent_bbox):
class CroppedPage(DerivedPage):
def __init__(self, parent_page, bbox, crop_fn=utils.crop_to_bbox, relative=False):
if relative:
print("Parent page", parent_page.bbox)
o_x0, o_top, _, _ = parent_page.bbox
x0, top, x1, bottom = bbox
self.bbox = (x0 + o_x0, top + o_top, x1 + o_x0, bottom + o_top)
Expand Down
28 changes: 20 additions & 8 deletions tests/test_basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,31 +74,43 @@ def test(obj):
assert len(filtered.rects) == 0

def test_relative_crop(self):
original = self.pdf.pages[0]
cropped = original.crop((10, 10, 40, 40))
page = self.pdf.pages[0]
cropped = page.crop((10, 10, 40, 40))
recropped = cropped.crop((10, 15, 20, 25), relative=True)
target_bbox = pdfplumber.utils.decimalize((20, 25, 30, 35))
assert recropped.bbox == target_bbox

recropped_wi = cropped.within_bbox((10, 15, 20, 25), relative=True)
assert recropped_wi.bbox == target_bbox

# via issue #245, should not throw error when using `relative=True`
bottom = page.crop((0, 0.8 * float(page.height), page.width, page.height))
bottom_left = bottom.crop((0, 0, 0.5 * float(bottom.width), bottom.height), relative=True)
bottom_right = bottom.crop((0.5 * float(bottom.width), 0, bottom.width, bottom.height), relative=True)

def test_invalid_crops(self):
original = self.pdf.pages[0]
page = self.pdf.pages[0]
with pytest.raises(ValueError):
original.crop((0, 0, 0, 0))
page.crop((0, 0, 0, 0))

with pytest.raises(ValueError):
original.crop((0, 0, 10000, 10))
page.crop((0, 0, 10000, 10))

with pytest.raises(ValueError):
original.crop((-10, 0, 10, 10))
page.crop((-10, 0, 10, 10))

with pytest.raises(ValueError):
original.crop((100, 0, 0, 100))
page.crop((100, 0, 0, 100))

with pytest.raises(ValueError):
original.crop((0, 100, 100, 0))
page.crop((0, 100, 100, 0))

# via issue #245
bottom = page.crop((0, 0.8 * float(page.height), page.width, page.height))
with pytest.raises(ValueError):
bottom_left = bottom.crop((0, 0, 0.5 * float(bottom.width), bottom.height))
with pytest.raises(ValueError):
bottom_right = bottom.crop((0.5 * float(bottom.width), 0, bottom.width, bottom.height))

def test_rotation(self):
assert(self.pdf.pages[0].width == 1008)
Expand Down

0 comments on commit 81d167e

Please sign in to comment.