Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfixes in utils.split_textline, core.Table.to_excel + various linting changes #72

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ jobs:
- name: Install ghostscript
if: matrix.os == 'ubuntu-latest'
run: |
sudo apt update
sudo apt install ghostscript

- name: Compute pre-commit cache key
Expand Down
34 changes: 20 additions & 14 deletions camelot/core.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from __future__ import annotations

import os
import sqlite3
import tempfile
import zipfile
from itertools import chain
from operator import itemgetter
from typing import Iterator, Iterable

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -604,14 +607,11 @@ def to_excel(self, path, **kwargs):
Output filepath.

"""
kw = {
"sheet_name": f"page-{self.page}-table-{self.order}",
"encoding": "utf-8",
}
kw = {"encoding": "utf-8"}
sheet_name = f"page-{self.page}-table-{self.order}"
kw.update(kwargs)
writer = pd.ExcelWriter(path)
self.df.to_excel(writer, **kw)
writer.save()
self.df.to_excel(writer, sheet_name=sheet_name, **kw)

def to_html(self, path, **kwargs):
"""Writes Table to an HTML file.
Expand Down Expand Up @@ -674,27 +674,33 @@ class TableList:

"""

def __init__(self, tables):
self._tables = tables
def __init__(self, tables: Iterable[Table]) -> None:
self._tables: Iterable[Table] = tables

def __repr__(self):
def __repr__(self) -> str:
return f"<{self.__class__.__name__} n={self.n}>"

def __len__(self):
def __len__(self) -> int:
return len(self._tables)

def __getitem__(self, idx):
def __getitem__(self, idx) -> Table:
return self._tables[idx]

def __iter__(self) -> Iterator[Table]:
    """Return a fresh iterator over the tables held in ``self._tables``.

    Each call builds a new, independent iterator, so the collection can
    be looped over any number of times.
    """
    tables = self._tables
    return iter(tables)

def __next__(self) -> Table:
    """Return the next table, advancing a cursor stored on the instance.

    The cursor is created lazily from ``self._tables`` on the first call
    and kept in ``self._cursor``; once it is exhausted every further call
    raises ``StopIteration``, as the iterator protocol requires.

    Raises
    ------
    StopIteration
        When no tables are left.
    """
    # The previous body, ``next(self)``, dispatched straight back to this
    # method and recursed until RecursionError instead of yielding a table.
    cursor = getattr(self, "_cursor", None)
    if cursor is None:
        cursor = self._cursor = iter(self._tables)
    return next(cursor)

@staticmethod
def _format_func(table, f):
    """Look up the ``to_<f>`` attribute on ``table`` and return it.

    ``f`` is the format suffix (so ``"csv"`` resolves ``table.to_csv``).
    The attribute is returned as-is, not called; ``AttributeError``
    propagates if ``table`` has no such attribute.
    """
    exporter_name = f"to_{f}"
    return getattr(table, exporter_name)

@property
def n(self):
def n(self) -> int:
return len(self)

def _write_file(self, f=None, **kwargs):
def _write_file(self, f=None, **kwargs) -> None:
dirname = kwargs.get("dirname")
root = kwargs.get("root")
ext = kwargs.get("ext")
Expand All @@ -704,7 +710,7 @@ def _write_file(self, f=None, **kwargs):
to_format = self._format_func(table, f)
to_format(filepath)

def _compress_dir(self, **kwargs):
def _compress_dir(self, **kwargs) -> None:
path = kwargs.get("path")
dirname = kwargs.get("dirname")
root = kwargs.get("root")
Expand Down
20 changes: 10 additions & 10 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@
from pathlib import Path
from typing import Union

from pypdf import PdfReader
from pypdf import PdfWriter
from pypdf import PdfReader, PdfWriter
from pypdf._utils import StrByteType

from .core import TableList
from .parsers import Lattice
from .parsers import Stream
from .utils import TemporaryDirectory
from .utils import download_url
from .utils import get_page_layout
from .utils import get_rotation
from .utils import get_text_objects
from .utils import is_url
from .parsers import Lattice, Stream
from .utils import (
TemporaryDirectory,
download_url,
get_page_layout,
get_rotation,
get_text_objects,
is_url,
)


class PDFHandler:
Expand Down
3 changes: 1 addition & 2 deletions camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
from pypdf._utils import StrByteType

from .handlers import PDFHandler
from .utils import remove_extra
from .utils import validate_input
from .utils import remove_extra, validate_input


def read_pdf(
Expand Down
31 changes: 17 additions & 14 deletions camelot/parsers/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,24 @@

from ..backends.image_conversion import BACKENDS
from ..core import Table
from ..image_processing import adaptive_threshold
from ..image_processing import find_contours
from ..image_processing import find_joints
from ..image_processing import find_lines
from ..utils import compute_accuracy
from ..utils import compute_whitespace
from ..utils import get_table_index
from ..utils import merge_close_lines
from ..utils import scale_image
from ..utils import scale_pdf
from ..utils import segments_in_bbox
from ..utils import text_in_bbox
from ..image_processing import (
adaptive_threshold,
find_contours,
find_joints,
find_lines,
)
from ..utils import (
compute_accuracy,
compute_whitespace,
get_table_index,
merge_close_lines,
scale_image,
scale_pdf,
segments_in_bbox,
text_in_bbox,
)
from .base import BaseParser


logger = logging.getLogger("camelot")


Expand Down Expand Up @@ -361,7 +364,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
flag_size=self.flag_size,
strip_text=self.strip_text,
)
if indices[0][:2] != (-1, -1):
if len(indices) < 1 or indices[0][:2] != (-1, -1):
pos_errors.append(error)
indices = Lattice._reduce_index(
table, indices, shift_text=self.shift_text
Expand Down
34 changes: 16 additions & 18 deletions camelot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,28 +8,24 @@
from itertools import groupby
from operator import itemgetter
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_netloc
from urllib.parse import uses_params
from urllib.parse import uses_relative
from urllib.request import Request
from urllib.request import urlopen
from urllib.parse import uses_netloc, uses_params, uses_relative
from urllib.request import Request, urlopen

import numpy as np
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.layout import LTAnno
from pdfminer.layout import LTChar
from pdfminer.layout import LTImage
from pdfminer.layout import LTTextLineHorizontal
from pdfminer.layout import LTTextLineVertical
from pdfminer.layout import (
LAParams,
LTAnno,
LTChar,
LTImage,
LTTextLineHorizontal,
LTTextLineVertical,
)
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser


_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")

Expand Down Expand Up @@ -622,7 +618,8 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
else:
# TODO: add test
if cut == x_cuts[-1]:
cut_text.append((r, cut[0] + 1, obj))
new_idx = min(cut[0] + 1, len(table.cols) - 1)
cut_text.append((r, new_idx, obj))
elif isinstance(obj, LTAnno):
cut_text.append((r, cut[0], obj))
elif direction == "vertical" and not textline.is_empty():
Expand Down Expand Up @@ -655,7 +652,8 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
else:
# TODO: add test
if cut == y_cuts[-1]:
cut_text.append((cut[0] - 1, c, obj))
new_idx = max(cut[0] - 1, 0)
cut_text.append((new_idx, c, obj))
elif isinstance(obj, LTAnno):
cut_text.append((cut[0], c, obj))
except IndexError:
Expand Down