Skip to content

Commit

Permalink
Sort tables extracted on a page by their top position
Browse files Browse the repository at this point in the history
Fixes #336
h/t @gqh1995 for reporting
  • Loading branch information
samkit-jain committed Jan 19, 2021
1 parent de767c3 commit d3c6034
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 0 deletions.
5 changes: 5 additions & 0 deletions pdfplumber/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,11 @@ def __init__(self, page, settings={}):
)
self.cells = intersections_to_cells(self.intersections)
self.tables = [Table(self.page, t) for t in cells_to_tables(self.cells)]
# If multiple tables are found, sort them by their position on the page:
# top to bottom first, then left to right for tables with the same top.
self.tables = sorted(
self.tables, key=lambda table: (table.bbox[1], table.bbox[0])
)

def get_edges(self):
settings = self.settings
Expand Down
Binary file added tests/pdfs/issue-336-example.pdf
Binary file not shown.
12 changes: 12 additions & 0 deletions tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,15 @@ def test_explicit_desc_decimalization(self):
def test_text_without_words(self):
    """
    Both word-to-edge helpers must return an empty list for no words.
    """
    for words_to_edges in (table.words_to_edges_h, table.words_to_edges_v):
        assert words_to_edges([]) == []

def test_order(self):
    """
    Check the order in which tables are extracted from a page.

    Regression test for issue #336: the first page of the example PDF
    must yield exactly three tables, returned in a fixed order that is
    identified here by their lengths (8, then 11, then 2).
    """
    pdf_path = os.path.join(HERE, "pdfs/issue-336-example.pdf")
    expected_lengths = [8, 11, 2]
    with pdfplumber.open(pdf_path) as pdf:
        first_page = pdf.pages[0]
        extracted = first_page.extract_tables()
        # Check the table count first so a wrong count fails before any
        # per-table length is inspected.
        assert len(extracted) == 3
        for extracted_table, expected in zip(extracted, expected_lengths):
            assert len(extracted_table) == expected

0 comments on commit d3c6034

Please sign in to comment.