From a5a79ece7f366ad835fdd226564209f2c8017e78 Mon Sep 17 00:00:00 2001 From: Parth P Panchal Date: Sat, 20 Oct 2018 00:49:55 +0530 Subject: [PATCH 1/3] Renames camelot.cli to camelot.__main__ Closes #154 --- camelot/{cli.py => __main__.py} | 4 + setup.py | 2 +- tests/test_cli.py | 186 ++++++++++++++++---------------- 3 files changed, 98 insertions(+), 94 deletions(-) rename camelot/{cli.py => __main__.py} (99%) diff --git a/camelot/cli.py b/camelot/__main__.py similarity index 99% rename from camelot/cli.py rename to camelot/__main__.py index e4002048..7eaa32b1 100644 --- a/camelot/cli.py +++ b/camelot/__main__.py @@ -159,3 +159,7 @@ def stream(c, *args, **kwargs): if f is None: raise click.UsageError('Please specify output file format using --format') tables.export(output, f=f, compress=compress) + + +if __name__ == "__main__": + cli() diff --git a/setup.py b/setup.py index e727706e..c35b9eea 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ def setup_package(): }, entry_points={ 'console_scripts': [ - 'camelot = camelot.cli:cli', + 'camelot = camelot.__main__:cli', ], }, classifiers=[ diff --git a/tests/test_cli.py b/tests/test_cli.py index 4797eaef..a31526a8 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,93 +1,93 @@ -# -*- coding: utf-8 -*- - -import os - -from click.testing import CliRunner - -from camelot.cli import cli -from camelot.utils import TemporaryDirectory - - -testdir = os.path.dirname(os.path.abspath(__file__)) -testdir = os.path.join(testdir, 'files') - - -def test_cli_lattice(): - with TemporaryDirectory() as tempdir: - infile = os.path.join(testdir, 'foo.pdf') - outfile = os.path.join(tempdir, 'foo.csv') - runner = CliRunner() - result = runner.invoke(cli, ['--format', 'csv', '--output', outfile, - 'lattice', infile]) - assert result.exit_code == 0 - assert result.output == 'Found 1 tables\n' - - result = runner.invoke(cli, ['--format', 'csv', - 'lattice', infile]) - output_error = 'Error: Please specify output file path using --output' - assert output_error in result.output - - result = runner.invoke(cli, ['--output', outfile, - 'lattice', infile]) - format_error = 'Please specify output file format using --format' - assert format_error in result.output - - -def test_cli_stream(): - with TemporaryDirectory() as tempdir: - infile = os.path.join(testdir, 'budget.pdf') - outfile = os.path.join(tempdir, 'budget.csv') - runner = CliRunner() - result = runner.invoke(cli, ['--format', 'csv', '--output', outfile, - 'stream', infile]) - assert result.exit_code == 0 - assert result.output == 'Found 1 tables\n' - - result = runner.invoke(cli, ['--format', 'csv', 'stream', infile]) - output_error = 'Error: Please specify output file path using --output' - assert output_error in result.output - - result = runner.invoke(cli, ['--output', outfile, 'stream', infile]) - format_error = 'Please specify output file format using --format' - assert format_error in result.output - - -def test_cli_output_format(): - with TemporaryDirectory() as tempdir: - infile = os.path.join(testdir, 'health.pdf') - outfile = os.path.join(tempdir, 'health.{}') - runner = CliRunner() - - # json - result = runner.invoke(cli, ['--format', 'json', '--output', outfile.format('json'), - 'stream', infile]) - assert result.exit_code == 0 - - # excel - result = runner.invoke(cli, ['--format', 'excel', '--output', outfile.format('xlsx'), - 'stream', infile]) - assert result.exit_code == 0 - - # html - result = runner.invoke(cli, ['--format', 'html', '--output', outfile.format('html'), - 'stream', infile]) - assert result.exit_code == 0 - - # zip - result = runner.invoke(cli, ['--zip', '--format', 'csv', '--output', outfile.format('csv'), - 'stream', infile]) - assert result.exit_code == 0 - -def test_cli_quiet_flag(): - with TemporaryDirectory() as tempdir: - infile = os.path.join(testdir, 'blank.pdf') - outfile = os.path.join(tempdir, 'blank.csv') - runner = CliRunner() - - result = runner.invoke(cli, ['--format', 'csv', '--output', outfile, - 'stream', infile]) - assert 'No tables found on page-1' in result.output - - result = runner.invoke(cli, ['--quiet', '--format', 'csv', - '--output', outfile, 'stream', infile]) - assert 'No tables found on page-1' not in result.output +# -*- coding: utf-8 -*- + +import os + +from click.testing import CliRunner + +from camelot.__main__ import cli +from camelot.utils import TemporaryDirectory + + +testdir = os.path.dirname(os.path.abspath(__file__)) +testdir = os.path.join(testdir, 'files') + + +def test_cli_lattice(): + with TemporaryDirectory() as tempdir: + infile = os.path.join(testdir, 'foo.pdf') + outfile = os.path.join(tempdir, 'foo.csv') + runner = CliRunner() + result = runner.invoke(cli, ['--format', 'csv', '--output', outfile, + 'lattice', infile]) + assert result.exit_code == 0 + assert result.output == 'Found 1 tables\n' + + result = runner.invoke(cli, ['--format', 'csv', + 'lattice', infile]) + output_error = 'Error: Please specify output file path using --output' + assert output_error in result.output + + result = runner.invoke(cli, ['--output', outfile, + 'lattice', infile]) + format_error = 'Please specify output file format using --format' + assert format_error in result.output + + +def test_cli_stream(): + with TemporaryDirectory() as tempdir: + infile = os.path.join(testdir, 'budget.pdf') + outfile = os.path.join(tempdir, 'budget.csv') + runner = CliRunner() + result = runner.invoke(cli, ['--format', 'csv', '--output', outfile, + 'stream', infile]) + assert result.exit_code == 0 + assert result.output == 'Found 1 tables\n' + + result = runner.invoke(cli, ['--format', 'csv', 'stream', infile]) + output_error = 'Error: Please specify output file path using --output' + assert output_error in result.output + + result = runner.invoke(cli, ['--output', outfile, 'stream', infile]) + format_error = 'Please specify output file format using --format' + assert format_error in result.output + + +def test_cli_output_format(): + with TemporaryDirectory() as tempdir: + infile = os.path.join(testdir, 'health.pdf') + outfile = os.path.join(tempdir, 'health.{}') + runner = CliRunner() + + # json + result = runner.invoke(cli, ['--format', 'json', '--output', outfile.format('json'), + 'stream', infile]) + assert result.exit_code == 0 + + # excel + result = runner.invoke(cli, ['--format', 'excel', '--output', outfile.format('xlsx'), + 'stream', infile]) + assert result.exit_code == 0 + + # html + result = runner.invoke(cli, ['--format', 'html', '--output', outfile.format('html'), + 'stream', infile]) + assert result.exit_code == 0 + + # zip + result = runner.invoke(cli, ['--zip', '--format', 'csv', '--output', outfile.format('csv'), + 'stream', infile]) + assert result.exit_code == 0 + +def test_cli_quiet_flag(): + with TemporaryDirectory() as tempdir: + infile = os.path.join(testdir, 'blank.pdf') + outfile = os.path.join(tempdir, 'blank.csv') + runner = CliRunner() + + result = runner.invoke(cli, ['--format', 'csv', '--output', outfile, + 'stream', infile]) + assert 'No tables found on page-1' in result.output + + result = runner.invoke(cli, ['--quiet', '--format', 'csv', + '--output', outfile, 'stream', infile]) + assert 'No tables found on page-1' not in result.output From 873dfb3d286ed97d5c41cfe2ff1a39b6c7e51157 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 23 Oct 2018 05:08:57 +0530 Subject: [PATCH 2/3] Keep __main__ and cli separate --- camelot/__main__.py | 161 ++------------------------------------------ camelot/cli.py | 161 ++++++++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- tests/test_cli.py | 2 +- 4 files changed, 169 insertions(+), 157 deletions(-) mode change 100644 => 100755 camelot/__main__.py create mode 100644 camelot/cli.py diff --git a/camelot/__main__.py b/camelot/__main__.py old mode 100644 new mode 100755 index 7eaa32b1..c945051b --- a/camelot/__main__.py +++ b/camelot/__main__.py @@ -1,165 +1,16 @@ # -*- coding: utf-8 -*- -import logging +from __future__ import absolute_import -import click -from . import __version__ -from .io import read_pdf +__all__ = ('main',) -logger = logging.getLogger('camelot') -logger.setLevel(logging.INFO) +def main(): + from camelot.cli import cli - -class Config(object): - def __init__(self): - self.config = {} - - def set_config(self, key, value): - self.config[key] = value - - -pass_config = click.make_pass_decorator(Config) - - -@click.group() -@click.version_option(version=__version__) -@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.' - ' Example: 1,3,4 or 1,4-end.') -@click.option('-o', '--output', help='Output file path.') -@click.option('-f', '--format', - type=click.Choice(['csv', 'json', 'excel', 'html']), - help='Output file format.') -@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.') -@click.option('-split', '--split_text', is_flag=True, - help='Split text that spans across multiple cells.') -@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on' - ' font size. Useful to detect super/subscripts.') -@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1), - help='PDFMiner char_margin, line_margin and word_margin.') -@click.option('-q', '--quiet', is_flag=True, help='Suppress warnings.') -@click.pass_context -def cli(ctx, *args, **kwargs): - """Camelot: PDF Table Extraction for Humans""" - ctx.obj = Config() - for key, value in kwargs.items(): - ctx.obj.set_config(key, value) - - -@cli.command('lattice') -@click.option('-T', '--table_area', default=[], multiple=True, - help='Table areas to process. Example: x1,y1,x2,y2' - ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') -@click.option('-back', '--process_background', is_flag=True, - help='Process background lines.') -@click.option('-scale', '--line_size_scaling', default=15, - help='Line size scaling factor. The larger the value,' - ' the smaller the detected lines.') -@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']), - multiple=True, help='Direction in which text in a spanning cell' - ' will be copied over.') -@click.option('-shift', '--shift_text', default=['l', 't'], - type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True, - help='Direction in which text in a spanning cell will flow.') -@click.option('-l', '--line_close_tol', default=2, - help='Tolerance parameter used to merge close vertical' - ' and horizontal lines.') -@click.option('-j', '--joint_close_tol', default=2, - help='Tolerance parameter used to decide whether' - ' the detected lines and points lie close to each other.') -@click.option('-block', '--threshold_blocksize', default=15, - help='For adaptive thresholding, size of a pixel' - ' neighborhood that is used to calculate a threshold value for' - ' the pixel. Example: 3, 5, 7, and so on.') -@click.option('-const', '--threshold_constant', default=-2, - help='For adaptive thresholding, constant subtracted' - ' from the mean or weighted mean. Normally, it is positive but' - ' may be zero or negative as well.') -@click.option('-I', '--iterations', default=0, - help='Number of times for erosion/dilation will be applied.') -@click.option('-plot', '--plot_type', - type=click.Choice(['text', 'table', 'contour', 'joint', 'line']), - help='Plot geometry found on PDF page, for debugging.') -@click.argument('filepath', type=click.Path(exists=True)) -@pass_config -def lattice(c, *args, **kwargs): - """Use lines between text to parse the table.""" - conf = c.config - pages = conf.pop('pages') - output = conf.pop('output') - f = conf.pop('format') - compress = conf.pop('zip') - suppress_warnings = conf.pop('quiet') - plot_type = kwargs.pop('plot_type') - filepath = kwargs.pop('filepath') - kwargs.update(conf) - - table_area = list(kwargs['table_area']) - kwargs['table_area'] = None if not table_area else table_area - copy_text = list(kwargs['copy_text']) - kwargs['copy_text'] = None if not copy_text else copy_text - kwargs['shift_text'] = list(kwargs['shift_text']) - - tables = read_pdf(filepath, pages=pages, flavor='lattice', - suppress_warnings=suppress_warnings, **kwargs) - click.echo('Found {} tables'.format(tables.n)) - if plot_type is not None: - for table in tables: - table.plot(plot_type) - else: - if output is None: - raise click.UsageError('Please specify output file path using --output') - if f is None: - raise click.UsageError('Please specify output file format using --format') - tables.export(output, f=f, compress=compress) - - -@cli.command('stream') -@click.option('-T', '--table_area', default=[], multiple=True, - help='Table areas to process. Example: x1,y1,x2,y2' - ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') -@click.option('-C', '--columns', default=[], multiple=True, - help='X coordinates of column separators.') -@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter' - ' used to combine text vertically, to generate rows.') -@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter' - ' used to combine text horizontally, to generate columns.') -@click.option('-plot', '--plot_type', - type=click.Choice(['text', 'table']), - help='Plot geometry found on PDF page for debugging.') -@click.argument('filepath', type=click.Path(exists=True)) -@pass_config -def stream(c, *args, **kwargs): - """Use spaces between text to parse the table.""" - conf = c.config - pages = conf.pop('pages') - output = conf.pop('output') - f = conf.pop('format') - compress = conf.pop('zip') - suppress_warnings = conf.pop('quiet') - plot_type = kwargs.pop('plot_type') - filepath = kwargs.pop('filepath') - kwargs.update(conf) - - table_area = list(kwargs['table_area']) - kwargs['table_area'] = None if not table_area else table_area - columns = list(kwargs['columns']) - kwargs['columns'] = None if not columns else columns - - tables = read_pdf(filepath, pages=pages, flavor='stream', - suppress_warnings=suppress_warnings, **kwargs) - click.echo('Found {} tables'.format(tables.n)) - if plot_type is not None: - for table in tables: - table.plot(plot_type) - else: - if output is None: - raise click.UsageError('Please specify output file path using --output') - if f is None: - raise click.UsageError('Please specify output file format using --format') - tables.export(output, f=f, compress=compress) + cli() if __name__ == "__main__": - cli() + main() diff --git a/camelot/cli.py b/camelot/cli.py new file mode 100644 index 00000000..e4002048 --- /dev/null +++ b/camelot/cli.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- + +import logging + +import click + +from . import __version__ +from .io import read_pdf + + +logger = logging.getLogger('camelot') +logger.setLevel(logging.INFO) + + +class Config(object): + def __init__(self): + self.config = {} + + def set_config(self, key, value): + self.config[key] = value + + +pass_config = click.make_pass_decorator(Config) + + +@click.group() +@click.version_option(version=__version__) +@click.option('-p', '--pages', default='1', help='Comma-separated page numbers.' + ' Example: 1,3,4 or 1,4-end.') +@click.option('-o', '--output', help='Output file path.') +@click.option('-f', '--format', + type=click.Choice(['csv', 'json', 'excel', 'html']), + help='Output file format.') +@click.option('-z', '--zip', is_flag=True, help='Create ZIP archive.') +@click.option('-split', '--split_text', is_flag=True, + help='Split text that spans across multiple cells.') +@click.option('-flag', '--flag_size', is_flag=True, help='Flag text based on' + ' font size. Useful to detect super/subscripts.') +@click.option('-M', '--margins', nargs=3, default=(1.0, 0.5, 0.1), + help='PDFMiner char_margin, line_margin and word_margin.') +@click.option('-q', '--quiet', is_flag=True, help='Suppress warnings.') +@click.pass_context +def cli(ctx, *args, **kwargs): + """Camelot: PDF Table Extraction for Humans""" + ctx.obj = Config() + for key, value in kwargs.items(): + ctx.obj.set_config(key, value) + + +@cli.command('lattice') +@click.option('-T', '--table_area', default=[], multiple=True, + help='Table areas to process. Example: x1,y1,x2,y2' + ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') +@click.option('-back', '--process_background', is_flag=True, + help='Process background lines.') +@click.option('-scale', '--line_size_scaling', default=15, + help='Line size scaling factor. The larger the value,' + ' the smaller the detected lines.') +@click.option('-copy', '--copy_text', default=[], type=click.Choice(['h', 'v']), + multiple=True, help='Direction in which text in a spanning cell' + ' will be copied over.') +@click.option('-shift', '--shift_text', default=['l', 't'], + type=click.Choice(['', 'l', 'r', 't', 'b']), multiple=True, + help='Direction in which text in a spanning cell will flow.') +@click.option('-l', '--line_close_tol', default=2, + help='Tolerance parameter used to merge close vertical' + ' and horizontal lines.') +@click.option('-j', '--joint_close_tol', default=2, + help='Tolerance parameter used to decide whether' + ' the detected lines and points lie close to each other.') +@click.option('-block', '--threshold_blocksize', default=15, + help='For adaptive thresholding, size of a pixel' + ' neighborhood that is used to calculate a threshold value for' + ' the pixel. Example: 3, 5, 7, and so on.') +@click.option('-const', '--threshold_constant', default=-2, + help='For adaptive thresholding, constant subtracted' + ' from the mean or weighted mean. Normally, it is positive but' + ' may be zero or negative as well.') +@click.option('-I', '--iterations', default=0, + help='Number of times for erosion/dilation will be applied.') +@click.option('-plot', '--plot_type', + type=click.Choice(['text', 'table', 'contour', 'joint', 'line']), + help='Plot geometry found on PDF page, for debugging.') +@click.argument('filepath', type=click.Path(exists=True)) +@pass_config +def lattice(c, *args, **kwargs): + """Use lines between text to parse the table.""" + conf = c.config + pages = conf.pop('pages') + output = conf.pop('output') + f = conf.pop('format') + compress = conf.pop('zip') + suppress_warnings = conf.pop('quiet') + plot_type = kwargs.pop('plot_type') + filepath = kwargs.pop('filepath') + kwargs.update(conf) + + table_area = list(kwargs['table_area']) + kwargs['table_area'] = None if not table_area else table_area + copy_text = list(kwargs['copy_text']) + kwargs['copy_text'] = None if not copy_text else copy_text + kwargs['shift_text'] = list(kwargs['shift_text']) + + tables = read_pdf(filepath, pages=pages, flavor='lattice', + suppress_warnings=suppress_warnings, **kwargs) + click.echo('Found {} tables'.format(tables.n)) + if plot_type is not None: + for table in tables: + table.plot(plot_type) + else: + if output is None: + raise click.UsageError('Please specify output file path using --output') + if f is None: + raise click.UsageError('Please specify output file format using --format') + tables.export(output, f=f, compress=compress) + + +@cli.command('stream') +@click.option('-T', '--table_area', default=[], multiple=True, + help='Table areas to process. Example: x1,y1,x2,y2' + ' where x1, y1 -> left-top and x2, y2 -> right-bottom.') +@click.option('-C', '--columns', default=[], multiple=True, + help='X coordinates of column separators.') +@click.option('-r', '--row_close_tol', default=2, help='Tolerance parameter' + ' used to combine text vertically, to generate rows.') +@click.option('-c', '--col_close_tol', default=0, help='Tolerance parameter' + ' used to combine text horizontally, to generate columns.') +@click.option('-plot', '--plot_type', + type=click.Choice(['text', 'table']), + help='Plot geometry found on PDF page for debugging.') +@click.argument('filepath', type=click.Path(exists=True)) +@pass_config +def stream(c, *args, **kwargs): + """Use spaces between text to parse the table.""" + conf = c.config + pages = conf.pop('pages') + output = conf.pop('output') + f = conf.pop('format') + compress = conf.pop('zip') + suppress_warnings = conf.pop('quiet') + plot_type = kwargs.pop('plot_type') + filepath = kwargs.pop('filepath') + kwargs.update(conf) + + table_area = list(kwargs['table_area']) + kwargs['table_area'] = None if not table_area else table_area + columns = list(kwargs['columns']) + kwargs['columns'] = None if not columns else columns + + tables = read_pdf(filepath, pages=pages, flavor='stream', + suppress_warnings=suppress_warnings, **kwargs) + click.echo('Found {} tables'.format(tables.n)) + if plot_type is not None: + for table in tables: + table.plot(plot_type) + else: + if output is None: + raise click.UsageError('Please specify output file path using --output') + if f is None: + raise click.UsageError('Please specify output file format using --format') + tables.export(output, f=f, compress=compress) diff --git a/setup.py b/setup.py index c35b9eea..e727706e 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ def setup_package(): }, entry_points={ 'console_scripts': [ - 'camelot = camelot.__main__:cli', + 'camelot = camelot.cli:cli', ], }, classifiers=[ diff --git a/tests/test_cli.py b/tests/test_cli.py index a31526a8..3f51f8f2 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4,7 +4,7 @@ from click.testing import CliRunner -from camelot.__main__ import cli +from camelot.cli import cli from camelot.utils import TemporaryDirectory From d7a3a290a6f9f893180f2dd61f4046422798c7e7 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Tue, 23 Oct 2018 05:47:48 +0530 Subject: [PATCH 3/3] Monkey patch click HelpFormatter --- camelot/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/camelot/__init__.py b/camelot/__init__.py index d8ff6a54..364cd725 100644 --- a/camelot/__init__.py +++ b/camelot/__init__.py @@ -2,10 +2,20 @@ import logging +from click import HelpFormatter + from .__version__ import __version__ from .io import read_pdf +def _write_usage(self, prog, args='', prefix='Usage: '): + return self._write_usage('camelot', args, prefix=prefix) + + +# monkey patch click.HelpFormatter +HelpFormatter._write_usage = HelpFormatter.write_usage +HelpFormatter.write_usage = _write_usage + # set up logging logger = logging.getLogger('camelot')