From 81ae77fe9dee2d91d30eee2e375b5cda220c8272 Mon Sep 17 00:00:00 2001
From: "Charles K. Neimog"
Date: Sun, 30 Jul 2023 16:14:54 -0300
Subject: [PATCH] Export annotation color as hex RGB in JSON (#78)

* add color when using json

* minor fixes

---------

Co-authored-by: Andrew Baumann
---
Review amendments (not part of the original commit; the commit id above and
the blob ids on the `index` lines below still describe the original change):
 - the /C entry is resolved with pdftypes.resolve1 and integer components
   such as [1 1 0] are accepted, not only floats;
 - RGB.ashex rounds each channel to the nearest byte instead of truncating.

 pdfannots/__init__.py     | 14 ++++++++++++--
 pdfannots/printer/json.py |  3 +++
 pdfannots/types.py        | 17 ++++++++++++++++-
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/pdfannots/__init__.py b/pdfannots/__init__.py
index da4d6f5..007cf5d 100644
--- a/pdfannots/__init__.py
+++ b/pdfannots/__init__.py
@@ -22,7 +22,7 @@
 import pdfminer.settings
 import pdfminer.utils
 
-from .types import Page, Outline, AnnotationType, Annotation, Document
+from .types import Page, Outline, AnnotationType, Annotation, Document, RGB
 from .utils import cleanup_text, decode_datetime
 
 pdfminer.settings.STRICT = False
@@ -71,6 +71,16 @@ def _mkannotation(
         # decode as string, normalise line endings, replace special characters
         contents = cleanup_text(pdfminer.utils.decode_text(contents))
 
+    # /C may be an indirect object and may hold ints (e.g. [1 1 0]), not only floats
+    rgb: typ.Optional[RGB] = None
+    color = pdftypes.resolve1(pa.get('C'))
+    if color is not None:
+        if (isinstance(color, list) and len(color) == 3
+                and all(isinstance(e, (int, float)) and 0 <= e <= 1 for e in color)):
+            rgb = RGB(*color)
+        else:
+            logger.warning("Invalid color %s in annotation on %s", color, page)
+
     # Rect defines the location of the annotation on the page
     rect = pdftypes.resolve1(pa.get('Rect'))
 
@@ -94,7 +104,7 @@ def _mkannotation(
         created = decode_datetime(createds)
 
     return Annotation(page, annot_type, quadpoints, rect,
-                      contents, author=author, created=created)
+                      contents, author=author, created=created, color=rgb)
 
 
 def _get_outlines(doc: PDFDocument) -> typ.Iterator[Outline]:
diff --git a/pdfannots/printer/json.py b/pdfannots/printer/json.py
index 13e128c..fc0b2c5 100644
--- a/pdfannots/printer/json.py
+++ b/pdfannots/printer/json.py
@@ -35,6 +35,9 @@ def annot_to_dict(
     if annot.created:
         result['created'] = annot.created.strftime('%Y-%m-%dT%H:%M:%S')
 
+    if annot.color:
+        result['color'] = annot.color.ashex()
+
     return result
 
 
diff --git a/pdfannots/types.py b/pdfannots/types.py
index 2f2dc82..b957a97 100644
--- a/pdfannots/types.py
+++ b/pdfannots/types.py
@@ -276,6 +276,7 @@ class Annotation(ObjectWithPos):
         text            Text in the order captured (use gettext() for a cleaner form)
         author          Author of the annotation
         created         Timestamp the annotation was created
+        color           RGB color of the annotation
         last_charseq    Sequence number of the most recent character in text
 
     Attributes updated only for StrikeOut annotations:
@@ -297,7 +298,8 @@ def __init__(
             rect: typ.Optional[BoxCoords] = None,
             contents: typ.Optional[str] = None,
             author: typ.Optional[str] = None,
-            created: typ.Optional[datetime.datetime] = None):
+            created: typ.Optional[datetime.datetime] = None,
+            color: typ.Optional[RGB] = None):
 
         # Construct boxes from quadpoints
         boxes = []
@@ -324,6 +326,7 @@ def __init__(
         self.author = author
         self.created = created
         self.text = []
+        self.color = color
         self.pre_context = None
         self.post_context = None
         self.boxes = boxes
@@ -479,3 +482,15 @@ def nearest_outline(
                 return page.outlines[idx - 1]
 
         return None
+
+
+class RGB(typ.NamedTuple):
+    red: float
+    green: float
+    blue: float
+
+    def ashex(self) -> str:
+        """Return the color as a '#rrggbb' string, scaling each channel to 0-255."""
+        # Round rather than truncate: PDFs store limited-precision decimals,
+        # so e.g. 0.596078 (152/255) must map back to 0x98, not 0x97.
+        return '#' + ''.join(format(round(c * 255), '02x') for c in self)