diff --git a/CHANGELOG.md b/CHANGELOG.md
index 55fea4db..14d38067 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@
 
 All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
+## [0.10.4] - [Unreleased]
+
+### Fixed
+
+- Fix `Page.get_textmap` caching to allow for `extra_attrs=[...]`, by preconverting list kwargs to tuples. ([#1030](https://github.com/jsvine/pdfplumber/issues/1030))
+
 ## [0.10.3] - 2023-10-26
 
 ### Added
diff --git a/pdfplumber/page.py b/pdfplumber/page.py
index c86a363c..fabfcd62 100644
--- a/pdfplumber/page.py
+++ b/pdfplumber/page.py
@@ -167,6 +167,32 @@ def paint_path(self, *args, **kwargs) -> None:  # type: ignore
         self.tag_cur_item()
 
 
+def textmap_cacher(func: Callable[..., TextMap]) -> Callable[..., TextMap]:
+    """
+    Wraps `func` in a single `lru_cache`, converting list kwargs
+    (which would be unhashable) into tuples before the cache lookup.
+
+    The cache is built once, when the wrapper is created, so repeated calls
+    with equivalent kwargs reuse the stored TextMap instead of recomputing it.
+    """
+    # Build the cache once, outside `new_func`: calling `lru_cache()(func)`
+    # on every invocation would create a fresh, empty cache each time.
+    cached_func = lru_cache()(func)
+
+    def new_func(**kwargs: Any) -> TextMap:
+        return cached_func(
+            **{
+                key: (tuple(value) if isinstance(value, list) else value)
+                for key, value in kwargs.items()
+            }
+        )
+
+    # Keep the cache-management helpers that a bare `lru_cache` wrapper exposes.
+    new_func.cache_clear = cached_func.cache_clear  # type: ignore[attr-defined]
+    new_func.cache_info = cached_func.cache_info  # type: ignore[attr-defined]
+    return new_func
+
+
 class Page(Container):
     cached_properties: List[str] = Container.cached_properties + ["_layout"]
     is_original: bool = True
@@ -211,8 +237,8 @@ def __init__(
             )
         )
 
-        # https://rednafi.github.io/reflections/dont-wrap-instance-methods-with-functoolslru_cache-decorator-in-python.html
-        self.get_textmap = lru_cache()(self._get_textmap)
+        # https://rednafi.com/python/lru_cache_on_methods/
+        self.get_textmap = textmap_cacher(self._get_textmap)
 
     @property
     def width(self) -> T_num:
@@ -569,7 +595,7 @@ def __init__(self, parent_page: Page):
         self.page_obj = parent_page.page_obj
         self.page_number = parent_page.page_number
         self.flush_cache(Container.cached_properties)
-        self.get_textmap = lru_cache()(self._get_textmap)
+        self.get_textmap = textmap_cacher(self._get_textmap)
 
 
 def test_proposed_bbox(bbox: T_bbox, parent_bbox: T_bbox) -> None:
diff --git
a/tests/pdfs/extra-attrs-example.pdf b/tests/pdfs/extra-attrs-example.pdf
new file mode 100644
index 00000000..1bd5d02a
Binary files /dev/null and b/tests/pdfs/extra-attrs-example.pdf differ
diff --git a/tests/test_utils.py b/tests/test_utils.py
index cbd65a4b..3f64656c 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -88,6 +88,35 @@ def test_extract_words(self):
         assert words_rtl[1]["text"] == "baaabaaA/AAA"
         assert words_rtl[1]["direction"] == -1
 
+    def test_extra_attrs(self):
+        """
+        Check `Page.extract_text` against the expected strings for several
+        `extra_attrs` lists, then make sure the same kind of list can be
+        combined with `layout=True` and `use_text_flow=True`.
+        """
+        pdf_path = os.path.join(HERE, "pdfs/extra-attrs-example.pdf")
+        expected_by_kwargs = [
+            ({}, "BlackRedArial"),
+            ({"extra_attrs": ["non_stroking_color"]}, "Black RedArial"),
+            ({"extra_attrs": ["fontname"]}, "BlackRed Arial"),
+            (
+                {"extra_attrs": ["non_stroking_color", "fontname"]},
+                "Black Red Arial",
+            ),
+        ]
+        with pdfplumber.open(pdf_path) as pdf:
+            first_page = pdf.pages[0]
+            for kwargs, expected in expected_by_kwargs:
+                assert first_page.extract_text(**kwargs) == expected
+
+            # Should not error
+            laid_out_text = first_page.extract_text(
+                layout=True,
+                use_text_flow=True,
+                extra_attrs=["non_stroking_color", "fontname"],
+            )
+            assert laid_out_text
+
     def test_extract_words_punctuation(self):
         path = os.path.join(HERE, "pdfs/test-punkt.pdf")
         with pdfplumber.open(path) as pdf: