Skip to content

Commit

Permalink
Fix Page.get_textmap caching for extra_attrs=[...]
Browse files Browse the repository at this point in the history
... by preconverting list kwargs to tuples.
  • Loading branch information
jsvine committed Oct 27, 2023
1 parent 2e838d1 commit 0bfffc2
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 3 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/).

## [0.10.4] - [Unreleased]

### Fixed

- Fix `Page.get_textmap` caching to allow for `extra_attrs=[...]`, by preconverting list kwargs to tuples. ([#1030](https://github.com/jsvine/pdfplumber/issues/1030))

## [0.10.3] - 2023-10-26

### Added
Expand Down
23 changes: 20 additions & 3 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,23 @@ def paint_path(self, *args, **kwargs) -> None: # type: ignore
self.tag_cur_item()


def textmap_cacher(func: Callable[..., TextMap]) -> Callable[..., TextMap]:
    """
    Wrap `func` in a keyword-only, memoizing front end.

    Any keyword argument whose value is a list is converted to a tuple
    before the cached call, since lists are unhashable and `lru_cache`
    requires hashable arguments. `func` therefore receives tuples in place
    of lists.

    The `lru_cache` wrapper is created once, here, and shared by every call
    to the returned function. Creating it inside `new_func` would hand each
    call a brand-new, empty cache, so no result would ever be reused.
    """
    cached_func = lru_cache()(func)

    def new_func(**kwargs: Any) -> TextMap:
        # Normalize list values to tuples so the kwargs can form a cache key.
        return cached_func(
            **{
                key: (tuple(value) if isinstance(value, list) else value)
                for key, value in kwargs.items()
            }
        )

    return new_func


class Page(Container):
cached_properties: List[str] = Container.cached_properties + ["_layout"]
is_original: bool = True
Expand Down Expand Up @@ -211,8 +228,8 @@ def __init__(
)
)

# https://rednafi.github.io/reflections/dont-wrap-instance-methods-with-functoolslru_cache-decorator-in-python.html
self.get_textmap = lru_cache()(self._get_textmap)
# https://rednafi.com/python/lru_cache_on_methods/
self.get_textmap = textmap_cacher(self._get_textmap)

@property
def width(self) -> T_num:
Expand Down Expand Up @@ -569,7 +586,7 @@ def __init__(self, parent_page: Page):
self.page_obj = parent_page.page_obj
self.page_number = parent_page.page_number
self.flush_cache(Container.cached_properties)
self.get_textmap = lru_cache()(self._get_textmap)
self.get_textmap = textmap_cacher(self._get_textmap)


def test_proposed_bbox(bbox: T_bbox, parent_bbox: T_bbox) -> None:
Expand Down
Binary file added tests/pdfs/extra-attrs-example.pdf
Binary file not shown.
21 changes: 21 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,27 @@ def test_extract_words(self):
assert words_rtl[1]["text"] == "baaabaaA/AAA"
assert words_rtl[1]["direction"] == -1

def test_extra_attrs(self):
    """
    Check `Page.extract_text` with list-valued `extra_attrs`.

    Runs the extraction with no extra attributes, with each attribute on
    its own, and with both together, comparing each result against the
    expected string for the example PDF. Finishes with a layout/text-flow
    call that only needs to return something truthy without raising.
    """
    pdf_path = os.path.join(HERE, "pdfs/extra-attrs-example.pdf")
    with pdfplumber.open(pdf_path) as pdf:
        first_page = pdf.pages[0]

        plain_text = first_page.extract_text()
        assert plain_text == "BlackRedArial"

        by_color = first_page.extract_text(extra_attrs=["non_stroking_color"])
        assert by_color == "Black RedArial"

        by_font = first_page.extract_text(extra_attrs=["fontname"])
        assert by_font == "BlackRed Arial"

        by_color_and_font = first_page.extract_text(
            extra_attrs=["non_stroking_color", "fontname"]
        )
        assert by_color_and_font == "Black Red Arial"

        # Should not error
        layout_text = first_page.extract_text(
            layout=True,
            use_text_flow=True,
            extra_attrs=["non_stroking_color", "fontname"],
        )
        assert layout_text

def test_extract_words_punctuation(self):
path = os.path.join(HERE, "pdfs/test-punkt.pdf")
with pdfplumber.open(path) as pdf:
Expand Down

0 comments on commit 0bfffc2

Please sign in to comment.