From 184e528278852ebc49391ab530909018b8deac22 Mon Sep 17 00:00:00 2001 From: Enrique Date: Tue, 15 Oct 2024 12:13:00 +0200 Subject: [PATCH 1/4] Add rm funcionality --- pdfly/cat.py | 17 +++- pdfly/cli.py | 24 ++++++ tests/test_rm.py | 215 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 253 insertions(+), 3 deletions(-) create mode 100644 tests/test_rm.py diff --git a/pdfly/cat.py b/pdfly/cat.py index 3481b71..8e894ea 100644 --- a/pdfly/cat.py +++ b/pdfly/cat.py @@ -53,7 +53,11 @@ def main( - filename: Path, fn_pgrgs: List[str], output: Path, verbose: bool + filename: Path, + fn_pgrgs: List[str], + output: Path, + verbose: bool, + use_complements: bool = False, ) -> None: filename_page_ranges = parse_filepaths_and_pagerange_args( filename, fn_pgrgs @@ -87,8 +91,15 @@ def main( f"WARNING: Page range {page_range} is out of bounds", file=sys.stderr, ) - for page_num in range(*page_range.indices(len(reader.pages))): - writer.add_page(reader.pages[page_num]) + if not use_complements: + for page_num in range(*page_range.indices(len(reader.pages))): + writer.add_page(reader.pages[page_num]) + else: + all_page_nums = set(range(len(reader.pages))) + page_nums = set(range(*page_range.indices(len(reader.pages)))) + compl_page_nums = all_page_nums - page_nums + for page_num in compl_page_nums: + writer.add_page(reader.pages[page_num]) writer.write(output_fh) except Exception: print(traceback.format_exc(), file=sys.stderr) diff --git a/pdfly/cli.py b/pdfly/cli.py index 4b377f4..0282efd 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -103,6 +103,30 @@ def cat( pdfly.cat.main(filename, fn_pgrgs, output, verbose) +@entry_point.command(name="rm") # type: ignore[misc] +def rm( + filename: Annotated[ + Path, + typer.Argument( + exists=True, + file_okay=True, + dir_okay=False, + writable=False, + readable=True, + resolve_path=True, + ), + ], + output: Path = typer.Option(..., "-o", "--output"), # noqa + fn_pgrgs: List[str] = typer.Argument( # noqa + ..., 
help="filenames and/or page ranges" + ), + verbose: bool = typer.Option( + False, help="show page ranges as they are being read" + ), +) -> None: + pdfly.cat.main(filename, fn_pgrgs, output, verbose, use_complements=True) + + @entry_point.command(name="meta") # type: ignore[misc] def metadata( pdf: Annotated[ diff --git a/tests/test_rm.py b/tests/test_rm.py new file mode 100644 index 0000000..3be1e5c --- /dev/null +++ b/tests/test_rm.py @@ -0,0 +1,215 @@ +import pytest +from pypdf import PdfReader + +from .conftest import RESOURCES_ROOT, chdir, run_cli + + +def test_rm_incorrect_number_of_args(capsys, tmp_path): + with chdir(tmp_path): + exit_code = run_cli(["rm", str(RESOURCES_ROOT / "box.pdf")]) + assert exit_code == 2 + captured = capsys.readouterr() + assert "Missing argument" in captured.err + + +def test_rm_subset_ok(capsys, tmp_path): + with chdir(tmp_path): + exit_code = run_cli( + [ + "rm", + str(RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"), + "13:15", + "--output", + "./out.pdf", + ] + ) + captured = capsys.readouterr() + assert exit_code == 0, captured + assert not captured.err + inp_reader = PdfReader( + RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf" + ) + out_reader = PdfReader(tmp_path / "out.pdf") + assert len(out_reader.pages) == len(inp_reader.pages) - 2 + + +@pytest.mark.parametrize( + "page_range", + ["a", "-", "1-", "1-1-1", "1:1:1:1"], +) +def test_rm_subset_invalid_args(capsys, tmp_path, page_range): + with chdir(tmp_path): + exit_code = run_cli( + [ + "rm", + str(RESOURCES_ROOT / "jpeg.pdf"), + page_range, + "--output", + "./out.pdf", + ] + ) + captured = capsys.readouterr() + assert exit_code == 2, captured + assert "Invalid file path or page range provided" in captured.err + + +@pytest.mark.skip(reason="This check is not implemented yet") +def test_rm_subset_warn_on_missing_pages(capsys, tmp_path): + with chdir(tmp_path): + exit_code = run_cli( + [ + "rm", + str(RESOURCES_ROOT / "jpeg.pdf"), + "2", + "--output", + 
"./out.pdf", + ] + ) + captured = capsys.readouterr() + assert exit_code == 0, captured + assert "WARN" in captured.out + + +@pytest.mark.xfail() # There is currently a bug there +def test_rm_subset_ensure_reduced_size(tmp_path, two_pages_pdf_filepath): + exit_code = run_cli( + [ + "rm", + str(two_pages_pdf_filepath), + "0", + "--output", + str(tmp_path / "page1.pdf"), + ] + ) + assert exit_code == 0 + # The extracted PDF should only contain ONE image: + embedded_images = extract_embedded_images(tmp_path / "page1.pdf") + assert len(embedded_images) == 1 + + exit_code = run_cli( + [ + "rm", + str(two_pages_pdf_filepath), + "1", + "--output", + str(tmp_path / "page2.pdf"), + ] + ) + assert exit_code == 0 + # The extracted PDF should only contain ONE image: + embedded_images = extract_embedded_images(tmp_path / "page2.pdf") + assert len(embedded_images) == 1 + + +def extract_embedded_images(pdf_filepath): + images = [] + reader = PdfReader(pdf_filepath) + for page in reader.pages: + images.extend(page.images) + return images + + +def test_rm_combine_files(pdf_file_100, pdf_file_abc, tmp_path, capsys): + with chdir(tmp_path): + output_pdf_path = tmp_path / "out.pdf" + + # Run pdfly rm command + exit_code = run_cli( + [ + "rm", + str(pdf_file_100), + "1:10:2", + str(pdf_file_abc), + "::2", + str(pdf_file_abc), + "1::2", + "--output", + str(output_pdf_path), + ] + ) + captured = capsys.readouterr() + + # Check if the command was successful + assert exit_code == 0, captured.out + + # Extract text from the original and modified PDFs + extracted_pages = [] + reader = PdfReader(output_pdf_path) + for page in reader.pages: + extracted_pages.append(page.extract_text()) + + # Compare the extracted text + l1 = [str(el) for el in list(range(0, 10, 2)) + list(range(10, 100))] + assert extracted_pages == l1 + [ + "b", + "d", + "f", + "h", + "j", + "l", + "n", + "p", + "r", + "t", + "v", + "x", + "z", + "a", + "c", + "e", + "g", + "i", + "k", + "m", + "o", + "q", + "s", + "u", + 
"w", + "y", + ] + + +@pytest.mark.parametrize( + ("page_range", "expected"), + [ + ("22", [str(el) for el in range(100) if el != 22]), + ("0:3", [str(el) for el in range(3, 100)]), + (":3", [str(el) for el in range(3, 100)]), + (":", []), + ("5:", ["0", "1", "2", "3", "4"]), + ("::2", [str(el) for el in list(range(100))[1::2]]), + ( + "1:10:2", + [str(el) for el in list(range(0, 10, 2)) + list(range(10, 100))], + ), + ("::1", []), + ("::-1", []), + ], +) +def test_rm_commands(pdf_file_100, capsys, tmp_path, page_range, expected): + with chdir(tmp_path): + output_pdf_path = tmp_path / "out.pdf" + + # Run pdfly rm command + exit_code = run_cli( + [ + "rm", + str(pdf_file_100), + page_range, + "--output", + str(output_pdf_path), + ] + ) + + # Check if the command was successful + assert exit_code == 0 + + # Extract text from the original and modified PDFs + extracted_pages = [] + reader = PdfReader(output_pdf_path) + for page in reader.pages: + extracted_pages.append(page.extract_text()) + + # Compare the extracted text + assert extracted_pages == expected From 645041788bdac29da2b66327caa12b1caed0c757 Mon Sep 17 00:00:00 2001 From: Enrique Date: Fri, 18 Oct 2024 12:33:21 +0200 Subject: [PATCH 2/4] Resolve PR review comments --- pdfly/cat.py | 14 +++++++------- pdfly/cli.py | 4 +++- tests/test_rm.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pdfly/cat.py b/pdfly/cat.py index 8e894ea..eacb2c0 100644 --- a/pdfly/cat.py +++ b/pdfly/cat.py @@ -57,7 +57,7 @@ def main( fn_pgrgs: List[str], output: Path, verbose: bool, - use_complements: bool = False, + inverted_page_selection: bool = False, ) -> None: filename_page_ranges = parse_filepaths_and_pagerange_args( filename, fn_pgrgs @@ -91,14 +91,14 @@ def main( f"WARNING: Page range {page_range} is out of bounds", file=sys.stderr, ) - if not use_complements: - for page_num in range(*page_range.indices(len(reader.pages))): - writer.add_page(reader.pages[page_num]) - else: + if inverted_page_selection: 
all_page_nums = set(range(len(reader.pages))) page_nums = set(range(*page_range.indices(len(reader.pages)))) - compl_page_nums = all_page_nums - page_nums - for page_num in compl_page_nums: + inverted_page_nums = all_page_nums - page_nums + for page_num in inverted_page_nums: + writer.add_page(reader.pages[page_num]) + else: + for page_num in range(*page_range.indices(len(reader.pages))): writer.add_page(reader.pages[page_num]) writer.write(output_fh) except Exception: diff --git a/pdfly/cli.py b/pdfly/cli.py index 0282efd..5fea112 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -124,7 +124,9 @@ def rm( False, help="show page ranges as they are being read" ), ) -> None: - pdfly.cat.main(filename, fn_pgrgs, output, verbose, use_complements=True) + pdfly.cat.main( + filename, fn_pgrgs, output, verbose, inverted_page_selection=True + ) @entry_point.command(name="meta") # type: ignore[misc] diff --git a/tests/test_rm.py b/tests/test_rm.py index 3be1e5c..ad5cdc5 100644 --- a/tests/test_rm.py +++ b/tests/test_rm.py @@ -70,7 +70,7 @@ def test_rm_subset_warn_on_missing_pages(capsys, tmp_path): assert "WARN" in captured.out -@pytest.mark.xfail() # There is currently a bug there +@pytest.mark.xfail() def test_rm_subset_ensure_reduced_size(tmp_path, two_pages_pdf_filepath): exit_code = run_cli( [ From 5ecfb983ca5043a8f9467809fc749bccac5c34b7 Mon Sep 17 00:00:00 2001 From: Enrique Date: Mon, 21 Oct 2024 13:05:46 +0200 Subject: [PATCH 3/4] Fix bugs in test rm file --- tests/test_rm.py | 58 ++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/tests/test_rm.py b/tests/test_rm.py index ad5cdc5..b5cb38c 100644 --- a/tests/test_rm.py +++ b/tests/test_rm.py @@ -1,10 +1,19 @@ +"""Tests for the `rm` command.""" + +from pathlib import Path +from typing import List + import pytest +from _pytest.capture import CaptureFixture from pypdf import PdfReader from .conftest import RESOURCES_ROOT, chdir, run_cli +from .test_cat import 
extract_embedded_images -def test_rm_incorrect_number_of_args(capsys, tmp_path): +def test_rm_incorrect_number_of_args( + capsys: CaptureFixture, tmp_path: Path +) -> None: with chdir(tmp_path): exit_code = run_cli(["rm", str(RESOURCES_ROOT / "box.pdf")]) assert exit_code == 2 @@ -12,7 +21,7 @@ def test_rm_incorrect_number_of_args(capsys, tmp_path): assert "Missing argument" in captured.err -def test_rm_subset_ok(capsys, tmp_path): +def test_rm_subset_ok(capsys: CaptureFixture, tmp_path: Path) -> None: with chdir(tmp_path): exit_code = run_cli( [ @@ -37,7 +46,9 @@ def test_rm_subset_ok(capsys, tmp_path): "page_range", ["a", "-", "1-", "1-1-1", "1:1:1:1"], ) -def test_rm_subset_invalid_args(capsys, tmp_path, page_range): +def test_rm_subset_invalid_args( + capsys: CaptureFixture, tmp_path: Path, page_range: str +) -> None: with chdir(tmp_path): exit_code = run_cli( [ @@ -53,8 +64,9 @@ def test_rm_subset_invalid_args(capsys, tmp_path, page_range): assert "Invalid file path or page range provided" in captured.err -@pytest.mark.skip(reason="This check is not implemented yet") -def test_rm_subset_warn_on_missing_pages(capsys, tmp_path): +def test_rm_subset_warn_on_missing_pages( + capsys: CaptureFixture, tmp_path: Path +) -> None: with chdir(tmp_path): exit_code = run_cli( [ @@ -67,11 +79,12 @@ def test_rm_subset_warn_on_missing_pages(capsys, tmp_path): ) captured = capsys.readouterr() assert exit_code == 0, captured - assert "WARN" in captured.out + assert "WARN" in captured.err -@pytest.mark.xfail() -def test_rm_subset_ensure_reduced_size(tmp_path, two_pages_pdf_filepath): +def test_rm_subset_ensure_reduced_size( + tmp_path: Path, two_pages_pdf_filepath: Path +) -> None: exit_code = run_cli( [ "rm", @@ -101,15 +114,12 @@ def test_rm_subset_ensure_reduced_size(tmp_path, two_pages_pdf_filepath): assert len(embedded_images) == 1 -def extract_embedded_images(pdf_filepath): - images = [] - reader = PdfReader(pdf_filepath) - for page in reader.pages: - 
images.extend(page.images) - return images - - -def test_rm_combine_files(pdf_file_100, pdf_file_abc, tmp_path, capsys): +def test_rm_combine_files( + pdf_file_100: Path, + pdf_file_abc: Path, + tmp_path: Path, + capsys: CaptureFixture, +) -> None: with chdir(tmp_path): output_pdf_path = tmp_path / "out.pdf" @@ -135,8 +145,7 @@ def test_rm_combine_files(pdf_file_100, pdf_file_abc, tmp_path, capsys): # Extract text from the original and modified PDFs extracted_pages = [] reader = PdfReader(output_pdf_path) - for page in reader.pages: - extracted_pages.append(page.extract_text()) + extracted_pages = [page.extract_text() for page in reader.pages] # Compare the extracted text l1 = [str(el) for el in list(range(0, 10, 2)) + list(range(10, 100))] @@ -187,7 +196,13 @@ def test_rm_combine_files(pdf_file_100, pdf_file_abc, tmp_path, capsys): ("::-1", []), ], ) -def test_rm_commands(pdf_file_100, capsys, tmp_path, page_range, expected): +def test_rm_commands( + pdf_file_100: Path, + capsys: CaptureFixture, + tmp_path: Path, + page_range: str, + expected: List[str], +) -> None: with chdir(tmp_path): output_pdf_path = tmp_path / "out.pdf" @@ -208,8 +223,7 @@ def test_rm_commands(pdf_file_100, capsys, tmp_path, page_range, expected): # Extract text from the original and modified PDFs extracted_pages = [] reader = PdfReader(output_pdf_path) - for page in reader.pages: - extracted_pages.append(page.extract_text()) + extracted_pages = [page.extract_text() for page in reader.pages] # Compare the extracted text assert extracted_pages == expected From 944696951cb04e4aec9d6a0010f7587a29b3177f Mon Sep 17 00:00:00 2001 From: Enrique Date: Tue, 22 Oct 2024 09:20:02 +0200 Subject: [PATCH 4/4] Add rm module --- pdfly/cli.py | 7 +++---- pdfly/rm.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 pdfly/rm.py diff --git a/pdfly/cli.py b/pdfly/cli.py index b45d763..2353d8d 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py 
@@ -15,6 +15,7 @@ import pdfly.extract_images import pdfly.metadata import pdfly.pagemeta +import pdfly.rm import pdfly.up2 import pdfly.x2pdf @@ -104,7 +105,7 @@ def cat( pdfly.cat.main(filename, fn_pgrgs, output, verbose) -@entry_point.command(name="rm") # type: ignore[misc] +@entry_point.command(name="rm", help=pdfly.rm.__doc__) def rm( filename: Annotated[ Path, @@ -125,9 +126,7 @@ def rm( False, help="show page ranges as they are being read" ), ) -> None: - pdfly.cat.main( - filename, fn_pgrgs, output, verbose, inverted_page_selection=True - ) + pdfly.rm.main(filename, fn_pgrgs, output, verbose) @entry_point.command(name="meta", help=pdfly.metadata.__doc__) # type: ignore[misc] diff --git a/pdfly/rm.py b/pdfly/rm.py new file mode 100644 index 0000000..fc859e6 --- /dev/null +++ b/pdfly/rm.py @@ -0,0 +1,50 @@ +""" +Remove pages from PDF files. + +Page ranges refer to the previously-named file. +A file not followed by a page range means all the pages of the file. + +PAGE RANGES are like Python slices. + + Remember, page indices start with zero. + + Page range expression examples: + + : all pages. -1 last page. + 22 just the 23rd page. :-1 all but the last page. + 0:3 the first three pages. -2 second-to-last page. + :3 the first three pages. -2: last two pages. + 5: from the sixth page onward. -3:-1 third & second to last. + + The third, "stride" or "step" number is also recognized. + + ::2 0 2 4 ... to the end. 3:0:-1 3 2 1 but not 0. + 1:10:2 1 3 5 7 9 2::-1 2 1 0. + ::-1 all pages in reverse order. + +Examples + pdfly rm -o output.pdf document.pdf 2:5 + + Remove pages 2 to 4 from document.pdf, producing output.pdf. + + pdfly rm -o output.pdf document.pdf :-1 + + Remove all pages except the last one from document.pdf, producing output.pdf. + + pdfly rm -o output.pdf report.pdf :6 7: + + Each page range is inverted on its own: write pages 6 onward (all but :6) + followed by pages 0 to 6 (all but 7:) of report.pdf to output.pdf.
+ +""" + +from pathlib import Path +from typing import List + +from pdfly.cat import main as cat_main + + +def main( + filename: Path, fn_pgrgs: List[str], output: Path, verbose: bool +) -> None: + cat_main(filename, fn_pgrgs, output, verbose, inverted_page_selection=True)