Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Two minor changes #610

Merged
merged 9 commits into from
Aug 31, 2021
Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Fix issue where some Chinese characters could not be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593))
- Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535))
- Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))
- Raising `UnboundLocalError` when a bad `--output-type` is used ([#610](https://github.com/pdfminer/pdfminer.six/pull/610))
- `TypeError` when using `TagExtractor` with tag values that are neither strings nor bytes ([#610](https://github.com/pdfminer/pdfminer.six/pull/610))

## Removed
- Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522))
Expand Down
16 changes: 12 additions & 4 deletions pdfminer/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,25 +56,33 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
imagewriter = ImageWriter(output_dir)

rsrcmgr = PDFResourceManager(caching=not disable_caching)
device = None

if output_type != 'text' and outfp == sys.stdout:
outfp = sys.stdout.buffer

if output_type == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)

if outfp == sys.stdout:
outfp = sys.stdout.buffer

if output_type == 'xml':
elif output_type == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter,
stripcontrol=strip_control)

elif output_type == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter)

elif output_type == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec)

else:
msg = f"Output type can be text, html, xml or tag but is " \
f"{output_type}"
raise ValueError(msg)

interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(inf,
page_numbers,
Expand Down
20 changes: 12 additions & 8 deletions pdfminer/pdfdevice.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,40 +154,44 @@ def render_string(self, textstate, seq, ncs, graphicstate):
char = font.to_unichr(cid)
text += char
except PDFUnicodeNotDefined:
print(chars)
pass
self.outfp.write(utils.enc(text))
self._write(utils.enc(text))
return

def begin_page(self, page, ctm):
output = '<page id="%s" bbox="%s" rotate="%d">' %\
(self.pageno, utils.bbox2str(page.mediabox), page.rotate)
self.outfp.write(utils.make_compat_bytes(output))
self._write(output)
return

def end_page(self, page):
self.outfp.write(utils.make_compat_bytes('</page>\n'))
self._write('</page>\n')
self.pageno += 1
return

def begin_tag(self, tag, props=None):
s = ''
if isinstance(props, dict):
s = ''.join(' {}="{}"'.format(utils.enc(k), utils.enc(str(v)))
for (k, v) in sorted(props.items()))
s = ''.join([
' {}="{}"'.format(utils.enc(k), utils.make_compat_str(v))
for (k, v) in sorted(props.items())
])
out_s = '<{}{}>'.format(utils.enc(tag.name), s)
self.outfp.write(utils.make_compat_bytes(out_s))
self._write(out_s)
self._stack.append(tag)
return

def end_tag(self):
assert self._stack, str(self.pageno)
tag = self._stack.pop(-1)
out_s = '</%s>' % utils.enc(tag.name)
self.outfp.write(utils.make_compat_bytes(out_s))
self._write(out_s)
return

def do_tag(self, tag, props=None):
    """Emit ``tag`` via ``begin_tag`` and take it straight off the stack.

    ``begin_tag`` writes the opening markup and appends ``tag`` to
    ``self._stack``; this method removes that entry again immediately,
    so the stack is left exactly as it was before the call.

    :param tag: the tag object handed on to ``begin_tag``.
    :param props: optional properties handed on to ``begin_tag``.
    """
    self.begin_tag(tag, props)
    # Discard the entry begin_tag has just pushed.
    self._stack.pop()

def _write(self, s: str):
    """Encode ``s`` with ``self.codec`` and write the bytes to ``self.outfp``.

    :param s: the text to emit.
    """
    payload = s.encode(self.codec)
    self.outfp.write(payload)
14 changes: 7 additions & 7 deletions pdfminer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,13 @@ def make_compat_bytes(in_str):
return in_str.encode()


def make_compat_str(in_str):
"""Converts to string, guessing encoding."""
assert isinstance(in_str, (bytes, str)), str(type(in_str))
if isinstance(in_str, bytes):
enc = chardet.detect(in_str)
in_str = in_str.decode(enc['encoding'])
return in_str
def make_compat_str(o):
    """Converts everything to string, if bytes guessing the encoding.

    Non-bytes values are passed through ``str()``. For ``bytes`` the
    encoding is guessed with ``chardet``; if no encoding can be guessed,
    or the guessed encoding cannot decode the data, the bytes are decoded
    as UTF-8 with undecodable bytes replaced, so this function always
    returns text instead of raising.

    :param o: any object; ``bytes`` get special treatment.
    :return: a ``str`` representation of ``o``.
    """
    if not isinstance(o, bytes):
        return str(o)

    # chardet reports ``None`` as the encoding when it cannot make a guess
    # (e.g. for empty input); ``bytes.decode(None)`` would be a TypeError.
    encoding = chardet.detect(o)['encoding']
    if encoding is not None:
        try:
            return o.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong guess, or a codec this Python build does not provide:
            # fall through to the lossy but always-successful decode below.
            pass
    return o.decode('utf-8', errors='replace')


def shorten_str(s, size):
Expand Down