foarsitter · matt-dies-tenet3 · Jul 11, 2023 · Jul 11, 2023 · Jul 11, 2023 · Jul 11, 2023
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -71,6 +71,7 @@ jobs:
       - name: Install ghostscript
         if: matrix.os == 'ubuntu-latest'
         run: |
+          sudo apt update
           sudo apt install ghostscript
 
       - name: Compute pre-commit cache key

diff --git a/camelot/core.py b/camelot/core.py
@@ -1,9 +1,12 @@
+from __future__ import annotations
+
 import os
 import sqlite3
 import tempfile
 import zipfile
 from itertools import chain
 from operator import itemgetter
+from typing import Iterable, Iterator, Sequence
 
 import numpy as np
 import pandas as pd
@@ -604,14 +607,11 @@ def to_excel(self, path, **kwargs):
             Output filepath.
 
         """
-        kw = {
-            "sheet_name": f"page-{self.page}-table-{self.order}",
-            "encoding": "utf-8",
-        }
+        # No "encoding" default: pandas 2.0 removed that to_excel argument.
+        kw = {"sheet_name": f"page-{self.page}-table-{self.order}"}
         kw.update(kwargs)
-        writer = pd.ExcelWriter(path)
-        self.df.to_excel(writer, **kw)
-        writer.save()
+        with pd.ExcelWriter(path) as writer:  # closing the writer saves the file
+            self.df.to_excel(writer, **kw)
 
     def to_html(self, path, **kwargs):
         """Writes Table to an HTML file.
@@ -674,27 +674,33 @@ class TableList:
 
     """
 
-    def __init__(self, tables):
-        self._tables = tables
+    def __init__(self, tables: Sequence[Table]) -> None:
+        self._tables: Sequence[Table] = tables  # len() and indexing are used
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return f"<{self.__class__.__name__} n={self.n}>"
 
-    def __len__(self):
+    def __len__(self) -> int:
         return len(self._tables)
 
-    def __getitem__(self, idx):
+    def __getitem__(self, idx) -> Table:
         return self._tables[idx]
 
+    def __iter__(self) -> Iterator[Table]:
+        return iter(self._tables)
+
+    def __next__(self) -> Table:
+        raise TypeError(f"{type(self).__name__} is not an iterator; use iter()")
+
     @staticmethod
     def _format_func(table, f):
         return getattr(table, f"to_{f}")
 
     @property
-    def n(self):
+    def n(self) -> int:
         return len(self)
 
-    def _write_file(self, f=None, **kwargs):
+    def _write_file(self, f=None, **kwargs) -> None:
         dirname = kwargs.get("dirname")
         root = kwargs.get("root")
         ext = kwargs.get("ext")
@@ -704,7 +710,7 @@ def _write_file(self, f=None, **kwargs):
             to_format = self._format_func(table, f)
             to_format(filepath)
 
-    def _compress_dir(self, **kwargs):
+    def _compress_dir(self, **kwargs) -> None:
         path = kwargs.get("path")
         dirname = kwargs.get("dirname")
         root = kwargs.get("root")

diff --git a/camelot/handlers.py b/camelot/handlers.py
@@ -3,19 +3,19 @@
 from pathlib import Path
 from typing import Union
 
-from pypdf import PdfReader
-from pypdf import PdfWriter
+from pypdf import PdfReader, PdfWriter
 from pypdf._utils import StrByteType
 
 from .core import TableList
-from .parsers import Lattice
-from .parsers import Stream
-from .utils import TemporaryDirectory
-from .utils import download_url
-from .utils import get_page_layout
-from .utils import get_rotation
-from .utils import get_text_objects
-from .utils import is_url
+from .parsers import Lattice, Stream
+from .utils import (
+    TemporaryDirectory,
+    download_url,
+    get_page_layout,
+    get_rotation,
+    get_text_objects,
+    is_url,
+)
 
 
 class PDFHandler:

diff --git a/camelot/io.py b/camelot/io.py
@@ -5,8 +5,7 @@
 from pypdf._utils import StrByteType
 
 from .handlers import PDFHandler
-from .utils import remove_extra
-from .utils import validate_input
+from .utils import remove_extra, validate_input
 
 
 def read_pdf(

diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
@@ -10,21 +10,24 @@
 
 from ..backends.image_conversion import BACKENDS
 from ..core import Table
-from ..image_processing import adaptive_threshold
-from ..image_processing import find_contours
-from ..image_processing import find_joints
-from ..image_processing import find_lines
-from ..utils import compute_accuracy
-from ..utils import compute_whitespace
-from ..utils import get_table_index
-from ..utils import merge_close_lines
-from ..utils import scale_image
-from ..utils import scale_pdf
-from ..utils import segments_in_bbox
-from ..utils import text_in_bbox
+from ..image_processing import (
+    adaptive_threshold,
+    find_contours,
+    find_joints,
+    find_lines,
+)
+from ..utils import (
+    compute_accuracy,
+    compute_whitespace,
+    get_table_index,
+    merge_close_lines,
+    scale_image,
+    scale_pdf,
+    segments_in_bbox,
+    text_in_bbox,
+)
 from .base import BaseParser
 
-
 logger = logging.getLogger("camelot")
 
 
@@ -361,7 +364,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
                     flag_size=self.flag_size,
                     strip_text=self.strip_text,
                 )
-                if indices[0][:2] != (-1, -1):
+                if len(indices) < 1 or indices[0][:2] != (-1, -1):
                     pos_errors.append(error)
                     indices = Lattice._reduce_index(
                         table, indices, shift_text=self.shift_text

diff --git a/camelot/utils.py b/camelot/utils.py
@@ -8,28 +8,24 @@
 from itertools import groupby
 from operator import itemgetter
 from urllib.parse import urlparse as parse_url
-from urllib.parse import uses_netloc
-from urllib.parse import uses_params
-from urllib.parse import uses_relative
-from urllib.request import Request
-from urllib.request import urlopen
+from urllib.parse import uses_netloc, uses_params, uses_relative
+from urllib.request import Request, urlopen
 
 import numpy as np
 from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams
-from pdfminer.layout import LTAnno
-from pdfminer.layout import LTChar
-from pdfminer.layout import LTImage
-from pdfminer.layout import LTTextLineHorizontal
-from pdfminer.layout import LTTextLineVertical
+from pdfminer.layout import (
+    LAParams,
+    LTAnno,
+    LTChar,
+    LTImage,
+    LTTextLineHorizontal,
+    LTTextLineVertical,
+)
 from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfinterp import PDFPageInterpreter
-from pdfminer.pdfinterp import PDFResourceManager
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfpage import PDFTextExtractionNotAllowed
+from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
 from pdfminer.pdfparser import PDFParser
 
-
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard("")
 
@@ -622,7 +618,8 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
                         else:
                             # TODO: add test
                             if cut == x_cuts[-1]:
-                                cut_text.append((r, cut[0] + 1, obj))
+                                new_idx = min(cut[0] + 1, len(table.cols) - 1)
+                                cut_text.append((r, new_idx, obj))
                     elif isinstance(obj, LTAnno):
                         cut_text.append((r, cut[0], obj))
         elif direction == "vertical" and not textline.is_empty():
@@ -655,7 +652,8 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
                         else:
                             # TODO: add test
                             if cut == y_cuts[-1]:
-                                cut_text.append((cut[0] - 1, c, obj))
+                                new_idx = max(cut[0] - 1, 0)
+                                cut_text.append((new_idx, c, obj))
                     elif isinstance(obj, LTAnno):
                         cut_text.append((cut[0], c, obj))
     except IndexError: