Skip to content

Commit cbc91c6

Browse files
committed
Add convert.py/.to_json/.to_csv & improve testcov
Moves most of the logic previously in cli.py to convert.py, for usage by other submodules. Adds Container.to_json and Container.to_csv. Makes adjustments/fixes to other parts of the library, based on edge-cases encountered (such as infinite recursion in annotations).
1 parent 3f4b4b2 commit cbc91c6

File tree

11 files changed

+296
-108
lines changed

11 files changed

+296
-108
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@ The output will be a CSV containing info about every character, line, and rectan
4141

4242
| Argument | Description |
4343
|----------|-------------|
44-
|`--format [format]`| `csv` or `json`. The `json` format returns slightly more information; it includes PDF-level metadata and height/width information about each page.|
44+
|`--format [format]`| `csv` or `json`. The `json` format returns more information; it includes PDF-level and page-level metadata, plus dictionary-nested attributes.|
4545
|`--pages [list of pages]`| A space-delimited, `1`-indexed list of pages or hyphenated page ranges. E.g., `1, 11-15`, which would return data for pages 1, 11, 12, 13, 14, and 15.|
46-
|`--types [list of object types to extract]`| Choices are `char`, `line`, `curve`, `rect`, `rect_edge`. Defaults to `char`, `line`, `curve`, `rect`.|
46+
|`--types [list of object types to extract]`| Choices are `char`, `rect`, `line`, `curve`, `image`, `annot`. Defaults to all.|
4747

4848
## Python library
4949

pdfplumber/cli.py

100755100644
Lines changed: 19 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,116 +1,50 @@
11
#!/usr/bin/env python
2-
import pdfplumber
2+
from . import convert
3+
from .pdf import PDF
34
import argparse
45
from itertools import chain
5-
6-
try:
7-
from cdecimal import Decimal, ROUND_HALF_UP
8-
except ImportError:
9-
from decimal import Decimal, ROUND_HALF_UP
10-
import unicodecsv
11-
import codecs
12-
import json
136
import sys
147

158

16-
class DecimalEncoder(json.JSONEncoder):
17-
def default(self, o):
18-
if isinstance(o, Decimal):
19-
return float(o.quantize(Decimal(".0001"), rounding=ROUND_HALF_UP))
20-
return super(DecimalEncoder, self).default(o)
21-
22-
239
def parse_page_spec(p_str):
2410
if "-" in p_str:
25-
return list(range(*map(int, p_str.split("-"))))
11+
start, end = map(int, p_str.split("-"))
12+
return range(start, end + 1)
2613
else:
2714
return [int(p_str)]
2815

2916

30-
def parse_args():
17+
def parse_args(args_raw):
3118
parser = argparse.ArgumentParser("pdfplumber")
3219

33-
stdin = sys.stdin.buffer if sys.version_info[0] >= 3 else sys.stdin
3420
parser.add_argument(
35-
"infile", nargs="?", type=argparse.FileType("rb"), default=stdin
21+
"infile", nargs="?", type=argparse.FileType("rb"), default=sys.stdin.buffer
3622
)
3723

3824
parser.add_argument("--format", choices=["csv", "json"], default="csv")
3925

40-
parser.add_argument("--encoding", default="utf-8")
41-
42-
TYPE_DEFAULTS = ["char", "anno", "line", "curve", "rect"]
4326
parser.add_argument(
4427
"--types",
4528
nargs="+",
46-
choices=TYPE_DEFAULTS + ["rect_edge"],
47-
default=TYPE_DEFAULTS,
29+
default=convert.DEFAULT_TYPES,
30+
choices=convert.DEFAULT_TYPES,
4831
)
4932

5033
parser.add_argument("--pages", nargs="+", type=parse_page_spec)
5134

52-
args = parser.parse_args()
35+
parser.add_argument(
36+
"--indent", type=int, help="Indent level for JSON pretty-printing."
37+
)
38+
39+
args = parser.parse_args(args_raw)
5340
if args.pages is not None:
5441
args.pages = list(chain(*args.pages))
5542
return args
5643

5744

58-
def to_csv(pdf, types, encoding):
59-
objs = []
60-
fields = set()
61-
for t in types:
62-
new_objs = getattr(pdf, t + "s")
63-
if len(new_objs):
64-
objs += new_objs
65-
fields = fields.union(set(new_objs[0].keys()))
66-
67-
first_columns = [
68-
"object_type",
69-
"page_number",
70-
"x0",
71-
"x1",
72-
"y0",
73-
"y1",
74-
"doctop",
75-
"top",
76-
"bottom",
77-
"width",
78-
"height",
79-
]
80-
81-
cols = first_columns + list(sorted(set(fields) - set(first_columns)))
82-
stdout = sys.stdout.buffer if sys.version_info[0] >= 3 else sys.stdout
83-
w = unicodecsv.DictWriter(stdout, fieldnames=cols, encoding=encoding)
84-
w.writeheader()
85-
w.writerows(objs)
86-
87-
88-
def to_json(pdf, types, encoding):
89-
data = {"metadata": pdf.metadata}
90-
91-
def get_page_data(page):
92-
d = dict((t + "s", getattr(page, t + "s")) for t in types)
93-
d["width"] = page.width
94-
d["height"] = page.height
95-
return d
96-
97-
data["pages"] = list(map(get_page_data, pdf.pages))
98-
99-
if hasattr(sys.stdout, "buffer"):
100-
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer, "strict")
101-
json.dump(data, sys.stdout, cls=DecimalEncoder)
102-
else:
103-
json.dump(data, sys.stdout, cls=DecimalEncoder, encoding=encoding)
104-
105-
106-
def main():
107-
args = parse_args()
108-
pdf = pdfplumber.open(args.infile, pages=args.pages)
109-
if args.format == "csv":
110-
to_csv(pdf, args.types, args.encoding)
111-
else:
112-
to_json(pdf, args.types, args.encoding)
113-
114-
115-
if __name__ == "__main__":
116-
main()
45+
def main(args_raw=None):
    """Run the command-line interface.

    Parses the arguments, opens the input PDF and writes the requested
    object types to ``sys.stdout`` as CSV or JSON.

    Args:
        args_raw: Argument strings to parse (without the program name).
            When ``None``, ``sys.argv[1:]`` is read at call time.
    """
    # Resolve the default at call time: a ``sys.argv[1:]`` default in the
    # signature would be frozen when the module is first imported.
    if args_raw is None:
        args_raw = sys.argv[1:]

    args = parse_args(args_raw)
    converter = {"csv": convert.to_csv, "json": convert.to_json}[args.format]
    # Only the JSON converter accepts an ``indent`` keyword.
    kwargs = {"csv": {}, "json": {"indent": args.indent}}[args.format]
    with PDF.open(args.infile, pages=args.pages) as pdf:
        converter(pdf, sys.stdout, args.types, **kwargs)

pdfplumber/container.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from itertools import chain
2-
from . import utils
2+
from . import utils, convert
33

44

55
class Container(object):
@@ -64,3 +64,7 @@ def test(x):
6464
return x["orientation"] == "v"
6565

6666
return list(filter(test, self.edges))
67+
68+
69+
Container.to_json = convert.to_json
70+
Container.to_csv = convert.to_csv

pdfplumber/convert.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
from .utils import decode_text
2+
from decimal import Decimal, ROUND_HALF_UP
3+
from pdfminer.pdftypes import PDFStream, PDFObjRef
4+
from pdfminer.psparser import PSLiteral
5+
import json
6+
import csv
7+
import base64
8+
from io import StringIO
9+
10+
# Object types exported when the caller does not pass ``types``.
DEFAULT_TYPES = ["char", "rect", "line", "curve", "image", "annot"]

# Columns placed first, in this order, in the CSV header written by to_csv;
# all remaining fields follow in sorted order.
COLS_TO_PREPEND = [
    "object_type", "page_number",
    "x0", "x1", "y0", "y1",
    "doctop", "top", "bottom",
    "width", "height",
]

# Encodings used by serialize() when converting ``bytes`` values to text.
ENCODINGS_TO_TRY = ["utf-8", "latin-1", "utf-16", "utf-16le"]
39+
40+
41+
def to_b64(data_bytes):
    """Return the Base64 encoding of *data_bytes* as an ASCII ``str``."""
    encoded = base64.b64encode(data_bytes)
    return str(encoded, "ascii")
43+
44+
45+
def serialize(obj):
    """Recursively convert *obj* into values that ``json``/``csv`` can write.

    Dispatch is on the exact type of *obj* (subclasses are not matched):

    * ``None``, ``int``, ``float``, ``str``, ``bool`` are returned unchanged.
    * ``Decimal`` becomes a ``float`` rounded half-up to four decimal places.
    * ``list``/``tuple`` are rebuilt with every item serialized.
    * ``dict`` values are serialized; keys are kept as they are.
    * ``bytes`` are decoded with the first encoding in ``ENCODINGS_TO_TRY``
      that succeeds.
    * ``PDFStream`` becomes ``{"rawdata": <base64 text>}``.
    * ``PSLiteral`` becomes its decoded name.
    * Anything else is converted with ``str()``.

    Raises:
        UnicodeDecodeError: If a ``bytes`` value cannot be decoded with any
            of the candidate encodings; the error is the one raised by the
            first encoding in the list.
    """
    t = type(obj)
    if obj is None or t in (int, float, str, bool):
        return obj
    if t is Decimal:
        return float(obj.quantize(Decimal(".0001"), rounding=ROUND_HALF_UP))
    if t in (list, tuple):
        # Bulk-convert, preserving the container type.
        return t(serialize(x) for x in obj)
    if t is dict:
        return {k: serialize(v) for k, v in obj.items()}
    if t is bytes:
        # The try must sit inside the loop: wrapped around the whole loop,
        # the first failure would abort it and no fallback would be tried.
        for encoding in ENCODINGS_TO_TRY:
            try:
                return obj.decode(encoding)
            except UnicodeDecodeError:
                continue
        # None of the decodings worked: surface the error produced by the
        # first (preferred) encoding.
        return obj.decode(ENCODINGS_TO_TRY[0])
    if t is PDFStream:
        return {"rawdata": to_b64(obj.rawdata)}
    if t is PSLiteral:
        return decode_text(obj.name)
    return str(obj)
73+
74+
75+
def to_json(container, stream=None, types=DEFAULT_TYPES, indent=None):
    """Export a PDF or a single page as JSON.

    Args:
        container: An object exposing ``pages`` and ``metadata``, or a
            single page-like object.
        stream: Writable text stream. When ``None`` the JSON text is
            returned instead of written.
        types: Object types to include; each is read from the page's
            ``<type>s`` attribute.
        indent: Passed through to ``json.dump``/``json.dumps``.

    Returns:
        The JSON string when *stream* is ``None``; otherwise the result of
        ``json.dump`` after writing to *stream*.
    """

    def describe_page(page):
        # Page-level metadata first, then one "<type>s" entry per type.
        info = {
            "page_number": page.page_number,
            "initial_doctop": page.initial_doctop,
            "rotation": page.rotation,
            "cropbox": page.cropbox,
            "mediabox": page.mediabox,
            "bbox": page.bbox,
            "width": page.width,
            "height": page.height,
        }
        info.update((kind + "s", getattr(page, kind + "s")) for kind in types)
        return info

    # Anything with a ``pages`` attribute is treated as a whole document.
    if hasattr(container, "pages"):
        payload = {
            "metadata": container.metadata,
            "pages": [describe_page(page) for page in container.pages],
        }
    else:
        payload = describe_page(container)

    prepared = serialize(payload)

    if stream is not None:
        return json.dump(prepared, stream, indent=indent)
    return json.dumps(prepared, indent=indent)
105+
106+
107+
def to_csv(container, stream=None, types=DEFAULT_TYPES):
    """Export the container's objects as CSV, one row per object.

    Args:
        container: Object exposing a ``<type>s`` attribute for each
            requested type.
        stream: Writable text stream. When ``None`` the CSV text is
            returned instead of written.
        types: Object types to include.

    Returns:
        The CSV text when *stream* is ``None``; otherwise ``None``.
    """
    return_text = stream is None
    if return_text:
        stream = StringIO()

    rows = []
    # Column names are taken from the first object of each type, leaving
    # out keys whose value is a dict.
    seen_fields = set()
    for kind in types:
        found = getattr(container, kind + "s")
        if not len(found):
            continue
        rows += found
        seen_fields |= {
            key for key, value in found[0].items() if type(value) is not dict
        }

    header = COLS_TO_PREPEND + sorted(seen_fields - set(COLS_TO_PREPEND))

    # Keys that are not in the header are dropped from each row.
    writer = csv.DictWriter(stream, fieldnames=header, extrasaction="ignore")
    writer.writeheader()
    writer.writerows(serialize(rows))
    if return_text:
        return stream.getvalue()

pdfplumber/page.py

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from .utils import resolve, resolve_all
33
from .table import TableFinder
44
from .container import Container
5-
65
import re
76

87
lt_pat = re.compile(r"^LT")
@@ -60,30 +59,45 @@ def layout(self):
6059
@property
6160
def annots(self):
6261
def parse(annot):
63-
data = resolve(annot.resolve())
64-
rect = self.decimalize(resolve_all(data["Rect"]))
62+
rect = self.decimalize(annot["Rect"])
63+
64+
a = annot.get("A", {})
65+
extras = {
66+
"uri": a.get("URI"),
67+
"title": annot.get("T"),
68+
"contents": annot.get("Contents"),
69+
}
70+
for k, v in extras.items():
71+
if v is not None:
72+
extras[k] = v.decode("utf-8")
73+
6574
parsed = {
6675
"page_number": self.page_number,
76+
"object_type": "annot",
77+
"x0": rect[0],
78+
"y0": rect[1],
79+
"x1": rect[2],
80+
"y1": rect[3],
6781
"doctop": self.initial_doctop + self.height - rect[3],
6882
"top": self.height - rect[3],
69-
"x0": rect[0],
7083
"bottom": self.height - rect[1],
71-
"x1": rect[2],
7284
"width": rect[2] - rect[0],
7385
"height": rect[3] - rect[1],
74-
"data": data,
7586
}
76-
uri = data.get("A", {}).get("URI")
77-
if uri is not None:
78-
parsed["URI"] = uri.decode("utf-8")
87+
parsed.update(extras)
88+
# Replace the indirect reference to the page dictionary
89+
# with a pointer to our actual page
90+
if "P" in annot:
91+
annot["P"] = self
92+
parsed["data"] = annot
7993
return parsed
8094

81-
raw = resolve(self.page_obj.annots) or []
95+
raw = resolve_all(self.page_obj.annots) or []
8296
return list(map(parse, raw))
8397

8498
@property
8599
def hyperlinks(self):
86-
return [a for a in self.annots if "URI" in a]
100+
return [a for a in self.annots if a["uri"] is not None]
87101

88102
@property
89103
def objects(self):
@@ -246,6 +260,9 @@ def to_image(self, **conversion_kwargs):
246260
kwargs["resolution"] = DEFAULT_RESOLUTION
247261
return PageImage(self, **kwargs)
248262

263+
def __repr__(self):
264+
return f"<Page:{self.page_number}>"
265+
249266

250267
class DerivedPage(Page):
251268
is_original = False

0 commit comments

Comments
 (0)