-
Notifications
You must be signed in to change notification settings - Fork 708
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add convert.py/.to_json/.to_csv & improve testcov
Moves most of the logic previously in cli.py to convert.py, for usage by other submodules. Adds Container.to_json and Container.to_csv. Makes adjustments/fixes to other parts of the library, based on edge cases encountered (such as infinite recursion in annotations).
- Loading branch information
Showing
11 changed files
with
296 additions
and
108 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,116 +1,50 @@ | ||
#!/usr/bin/env python | ||
import pdfplumber | ||
from . import convert | ||
from .pdf import PDF | ||
import argparse | ||
from itertools import chain | ||
|
||
try: | ||
from cdecimal import Decimal, ROUND_HALF_UP | ||
except ImportError: | ||
from decimal import Decimal, ROUND_HALF_UP | ||
import unicodecsv | ||
import codecs | ||
import json | ||
import sys | ||
|
||
|
||
class DecimalEncoder(json.JSONEncoder): | ||
def default(self, o): | ||
if isinstance(o, Decimal): | ||
return float(o.quantize(Decimal(".0001"), rounding=ROUND_HALF_UP)) | ||
return super(DecimalEncoder, self).default(o) | ||
|
||
|
||
def parse_page_spec(p_str):
    """Parse a CLI page spec into a list of page numbers.

    "3" -> [3]; "2-5" -> [2, 3, 4, 5] (range end is inclusive).
    Both branches return a list so callers get a consistent type.
    """
    if "-" in p_str:
        start, end = map(int, p_str.split("-"))
        return list(range(start, end + 1))
    else:
        return [int(p_str)]
|
||
|
||
def parse_args(args_raw):
    """Parse CLI arguments from *args_raw* (an argv-style list of strings).

    Returns the argparse Namespace; `pages` (a list of lists produced by
    parse_page_spec) is flattened into a single list of ints.
    """
    parser = argparse.ArgumentParser("pdfplumber")

    parser.add_argument(
        "infile", nargs="?", type=argparse.FileType("rb"), default=sys.stdin.buffer
    )

    parser.add_argument("--format", choices=["csv", "json"], default="csv")

    parser.add_argument("--encoding", default="utf-8")

    parser.add_argument(
        "--types",
        nargs="+",
        default=convert.DEFAULT_TYPES,
        choices=convert.DEFAULT_TYPES,
    )

    parser.add_argument("--pages", nargs="+", type=parse_page_spec)

    parser.add_argument(
        "--indent", type=int, help="Indent level for JSON pretty-printing."
    )

    args = parser.parse_args(args_raw)
    if args.pages is not None:
        # Each --pages token parses to a list; flatten them into one list.
        args.pages = list(chain(*args.pages))
    return args
|
||
|
||
def to_csv(pdf, types, encoding): | ||
objs = [] | ||
fields = set() | ||
for t in types: | ||
new_objs = getattr(pdf, t + "s") | ||
if len(new_objs): | ||
objs += new_objs | ||
fields = fields.union(set(new_objs[0].keys())) | ||
|
||
first_columns = [ | ||
"object_type", | ||
"page_number", | ||
"x0", | ||
"x1", | ||
"y0", | ||
"y1", | ||
"doctop", | ||
"top", | ||
"bottom", | ||
"width", | ||
"height", | ||
] | ||
|
||
cols = first_columns + list(sorted(set(fields) - set(first_columns))) | ||
stdout = sys.stdout.buffer if sys.version_info[0] >= 3 else sys.stdout | ||
w = unicodecsv.DictWriter(stdout, fieldnames=cols, encoding=encoding) | ||
w.writeheader() | ||
w.writerows(objs) | ||
|
||
|
||
def to_json(pdf, types, encoding): | ||
data = {"metadata": pdf.metadata} | ||
|
||
def get_page_data(page): | ||
d = dict((t + "s", getattr(page, t + "s")) for t in types) | ||
d["width"] = page.width | ||
d["height"] = page.height | ||
return d | ||
|
||
data["pages"] = list(map(get_page_data, pdf.pages)) | ||
|
||
if hasattr(sys.stdout, "buffer"): | ||
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer, "strict") | ||
json.dump(data, sys.stdout, cls=DecimalEncoder) | ||
else: | ||
json.dump(data, sys.stdout, cls=DecimalEncoder, encoding=encoding) | ||
|
||
|
||
def main(): | ||
args = parse_args() | ||
pdf = pdfplumber.open(args.infile, pages=args.pages) | ||
if args.format == "csv": | ||
to_csv(pdf, args.types, args.encoding) | ||
else: | ||
to_json(pdf, args.types, args.encoding) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() | ||
def main(args_raw=None):
    """CLI entry point: convert a PDF to CSV or JSON on stdout.

    args_raw: argv-style list of strings; defaults to sys.argv[1:] read at
    call time. (The previous default, `args_raw=sys.argv[1:]`, was evaluated
    once at import time, freezing whatever sys.argv held when the module was
    first imported — wrong for embedders that mutate sys.argv.)
    """
    if args_raw is None:
        args_raw = sys.argv[1:]
    args = parse_args(args_raw)
    # Dispatch on output format; only JSON takes extra keyword arguments.
    converter = {"csv": convert.to_csv, "json": convert.to_json}[args.format]
    kwargs = {"csv": {}, "json": {"indent": args.indent}}[args.format]
    with PDF.open(args.infile, pages=args.pages) as pdf:
        converter(pdf, sys.stdout, args.types, **kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
from .utils import decode_text | ||
from decimal import Decimal, ROUND_HALF_UP | ||
from pdfminer.pdftypes import PDFStream, PDFObjRef | ||
from pdfminer.psparser import PSLiteral | ||
import json | ||
import csv | ||
import base64 | ||
from io import StringIO | ||
|
||
# Object types emitted by default by to_json/to_csv (each page exposes a
# pluralized attribute per type, e.g. page.chars, page.annots).
DEFAULT_TYPES = [
    "char",
    "rect",
    "line",
    "curve",
    "image",
    "annot",
]

# Columns forced to the front of the CSV output, in this order; any
# remaining fields follow, sorted alphabetically.
COLS_TO_PREPEND = [
    "object_type",
    "page_number",
    "x0",
    "x1",
    "y0",
    "y1",
    "doctop",
    "top",
    "bottom",
    "width",
    "height",
]
|
||
# Candidate encodings for decoding raw bytes values, tried in order.
ENCODINGS_TO_TRY = [
    "utf-8",
    "latin-1",
    "utf-16",
    "utf-16le",
]


def to_b64(data_bytes):
    """Return *data_bytes* encoded as an ASCII base64 string."""
    return base64.b64encode(data_bytes).decode("ascii")


def serialize(obj):
    """Recursively convert *obj* into JSON-serializable primitives.

    Decimals are rounded to 4 places; lists/tuples/dicts are converted
    element-wise (preserving list vs. tuple); pdfminer objects (PDFStream,
    PSLiteral) are reduced to plain data; bytes are decoded by trying each
    entry of ENCODINGS_TO_TRY in turn. Anything unrecognized falls back
    to str(obj).
    """
    # Exact-type checks (not isinstance) so subclasses are not conflated.
    t = type(obj)
    if obj is None:
        return None
    elif t in (int, float, str, bool):
        return obj
    elif t is Decimal:
        # Round to 4 decimal places for compact, stable output.
        return float(obj.quantize(Decimal(".0001"), rounding=ROUND_HALF_UP))
    elif t in (list, tuple):
        # Bulk-convert, keeping the container type.
        return t(serialize(x) for x in obj)
    elif t is dict:
        return {k: serialize(v) for k, v in obj.items()}
    elif t is bytes:
        # BUGFIX: the try previously wrapped the whole loop, so the first
        # failing decode aborted before any later encoding was attempted.
        # Try each encoding independently instead.
        for e in ENCODINGS_TO_TRY:
            try:
                return obj.decode(e)
            except UnicodeDecodeError:
                continue
        # If none of the decodings work, raise whatever error decoding
        # with the first (utf-8) causes.
        return obj.decode(ENCODINGS_TO_TRY[0])  # pragma: no cover
    elif t is PDFStream:
        return {"rawdata": to_b64(obj.rawdata)}
    elif t is PSLiteral:
        return decode_text(obj.name)
    else:
        return str(obj)
|
||
|
||
def to_json(container, stream=None, types=DEFAULT_TYPES, indent=None):
    """Serialize *container* to JSON.

    container: either a document-like object with a `.pages` attribute, or
        a single page-like object (page_number, bbox, width, etc.).
    stream: writable text stream; when None, the JSON string is returned
        instead of being written.
    types: object types whose pluralized page attributes are included.
    indent: passed through to json.dump(s) for pretty-printing.
    """

    def page_to_dict(page):
        # Page geometry/metadata plus one list per requested object type.
        d = {
            "page_number": page.page_number,
            "initial_doctop": page.initial_doctop,
            "rotation": page.rotation,
            "cropbox": page.cropbox,
            "mediabox": page.mediabox,
            "bbox": page.bbox,
            "width": page.width,
            "height": page.height,
        }
        for t in types:
            d[t + "s"] = getattr(page, t + "s")
        return d

    if hasattr(container, "pages"):
        # Whole document: include metadata and every page.
        data = {
            "metadata": container.metadata,
            "pages": list(map(page_to_dict, container.pages)),
        }
    else:
        # Single page.
        data = page_to_dict(container)

    serialized = serialize(data)

    if stream is None:
        return json.dumps(serialized, indent=indent)
    else:
        return json.dump(serialized, stream, indent=indent)
|
||
|
||
def to_csv(container, stream=None, types=DEFAULT_TYPES):
    """Write the container's objects of the given *types* as CSV.

    stream: writable text stream; when None, the CSV text is returned as a
        string instead. Columns start with COLS_TO_PREPEND, then the
        remaining fields sorted alphabetically. Nested-dict fields are
        excluded from the column set.
    """
    if stream is None:
        stream = StringIO()
        to_string = True
    else:
        to_string = False

    objs = []

    # Determine the set of fields across all requested object types.
    # NOTE(review): field names are sampled from the FIRST object of each
    # type only; objects with extra keys are silently truncated by
    # extrasaction="ignore" below.
    fields = set()
    for t in types:
        new_objs = getattr(container, t + "s")
        if len(new_objs):
            objs += new_objs
            new_keys = [k for k, v in new_objs[0].items() if type(v) is not dict]
            fields = fields.union(set(new_keys))

    cols = COLS_TO_PREPEND + list(sorted(set(fields) - set(COLS_TO_PREPEND)))

    w = csv.DictWriter(stream, fieldnames=cols, extrasaction="ignore")
    w.writeheader()
    w.writerows(serialize(objs))
    if to_string:
        # getvalue() returns the accumulated text without disturbing the
        # stream position (previously: seek(0) + read()).
        return stream.getvalue()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.