Skip to content

Commit

Permalink
adding test for color extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
kreuzberger committed Feb 19, 2024
1 parent c285d7d commit d0c00ec
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 1 deletion.
17 changes: 17 additions & 0 deletions libpdf/models/horizontal_box.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ class Char: # pylint: disable=too-few-public-methods # simplicity is good.
:vartype x1: float
:ivar y1: distance from the bottom of the page to the upper edge of the character (greater than y0)
:vartype y1: float
:ivar ncolor: non-stroking-color
:vartype ncolor: tuple of rgb or None
"""

def __init__(
Expand All @@ -26,13 +28,15 @@ def __init__(
y0: float = None,
x1: float = None,
y1: float = None,
ncolor: tuple = None,
):
"""Init the class with plain char of a character and its rectangular coordinates."""
self.x0 = x0
self.y0 = y0
self.x1 = x1
self.y1 = y1
self.text = text
self.ncolor = ncolor

def __repr__(self):
"""Make the text part of the repr for better debugging."""
Expand Down Expand Up @@ -63,13 +67,17 @@ def __init__(
self.x1 = x1
self.y1 = y1
self.chars = chars
self.ncolor = None
if self.chars:
# Obtain the rectangle coordinates from a list of libpdf text objects
self.x0 = min(text_obj.x0 for text_obj in self.chars)
self.y0 = min(text_obj.y0 for text_obj in self.chars)
self.x1 = max(text_obj.x1 for text_obj in self.chars)
self.y1 = max(text_obj.y1 for text_obj in self.chars)

if all(x.ncolor == self.chars[0].ncolor for x in self.chars):
self.ncolor = self.chars[0].ncolor

@property
def text(self):
"""Return plain text."""
Expand Down Expand Up @@ -157,6 +165,15 @@ def text(self):
"""Return plain text."""
return "\n".join([x.text for x in self.lines])

@property
def words(self):
    """Return the words of all lines as one flat list.

    The words are gathered line by line, in the order of ``self.lines``, by
    concatenating each line's ``words`` attribute.

    :return: flat list of the word objects of every line; empty when the
        box has no lines
    :rtype: list
    """
    # ``lines`` can be falsy (``__repr__`` guards against that as well); in
    # that case there are no words to report instead of raising a TypeError.
    return [word for line in self.lines or () for word in line.words]

def __repr__(self):
"""Make the text part of the repr for better debugging."""
if self.lines:
Expand Down
9 changes: 8 additions & 1 deletion libpdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,14 @@ def assemble_to_textlines(
for lt_obj in flatten_lt_objs:
if lt_obj.get_text() != " " and lt_obj.get_text() != "\n":
# instantiate Char
char = Char(lt_obj.get_text(), lt_obj.x0, lt_obj.y0, lt_obj.x1, lt_obj.y1)
char = Char(
lt_obj.get_text(),
lt_obj.x0,
lt_obj.y0,
lt_obj.x1,
lt_obj.y1,
lt_obj.graphicstate.ncolor if hasattr(lt_obj, "graphicstate") else None,
)
chars.append(char)

if lt_obj is flatten_lt_objs[-1]:
Expand Down
22 changes: 22 additions & 0 deletions tests/test_word_colors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Test catalog extraction."""

import logging

import libpdf
from tests.conftest import PDF_OUTLINE_NO_DEST


def test_word_color_chapter() -> None:
    """Test word colors in given chapter paragraph.

    Loads the PDF behind ``PDF_OUTLINE_NO_DEST``, looks up the chapter titled
    "Create Basic Shapes" and, for each of its paragraphs whose text contains
    "Diamond", checks that every word carries the non-stroking color
    ``(0, 0, 1)``.
    """
    objects = libpdf.load(PDF_OUTLINE_NO_DEST)
    assert objects is not None
    assert objects.flattened.chapters

    # Count the checked words so the test cannot pass without having
    # compared a single color (e.g. when the chapter title changes).
    checked_words = 0
    for chapter in objects.flattened.chapters:
        if chapter.title != "Create Basic Shapes":
            continue
        for content in chapter.content:
            if content.type == "paragraph" and "Diamond" in content.textbox.text:
                words = content.textbox.words
                # logging formats lazily via ``msg % args``, so the message
                # needs a placeholder for the argument.
                logging.debug("found words %s", words)
                for word in words:
                    assert word.ncolor == (0, 0, 1)
                    checked_words += 1

    assert checked_words, "no words found to check in chapter 'Create Basic Shapes'"

0 comments on commit d0c00ec

Please sign in to comment.