Skip to content

Commit

Permalink
Fix converting path to multiple rectangles
Browse files Browse the repository at this point in the history
For a path that consists of a series of rectangles
(shape is 'mlllhmlllh...'), call paint_path again with each group of
5 points. The result is multiple rects instead of a single curve.

fixes #369
  • Loading branch information
cheungpat committed Feb 6, 2020
1 parent bab6d15 commit 80dbb3b
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 0 deletions.
9 changes: 9 additions & 0 deletions pdfminer/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@


class PDFLayoutAnalyzer(PDFTextDevice):

RECTS = re.compile('^(mlllh)+$')

def __init__(self, rsrcmgr, pageno=1, laparams=None):
PDFTextDevice.__init__(self, rsrcmgr)
self.pageno = pageno
Expand Down Expand Up @@ -100,6 +103,12 @@ def paint_path(self, gstate, stroke, fill, evenodd, path):
stroke, fill, evenodd, gstate.scolor,
gstate.ncolor))
return
if self.RECTS.match(shape):
for path_group in zip(*(iter(path),) * 5):
self.paint_path(gstate, stroke, fill, evenodd,
list(path_group))
return

# other shapes
pts = []
for p in path:
Expand Down
Binary file added samples/contrib/excel.pdf
Binary file not shown.
3 changes: 3 additions & 0 deletions tests/test_tools_pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ def test_contrib_hash_two_complement(self):
"""
run('contrib/issue-00352-hash-twos-complement.pdf')

def test_contrib_excel(self):
# Test added together with the fix for issue #369 (paths whose shape is a
# repeated 'mlllh' group). Passes the contributed sample PDF to run() with
# the '-t xml' argument string.
# NOTE(review): '-t xml' presumably selects XML output and run() is defined
# outside this hunk -- confirm against its definition.
run('contrib/excel.pdf', '-t xml')


class TestDumpImages:

Expand Down

0 comments on commit 80dbb3b

Please sign in to comment.