Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rm functionality #59

Merged
merged 5 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions pdfly/cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,11 @@


def main(
filename: Path, fn_pgrgs: List[str], output: Path, verbose: bool
filename: Path,
fn_pgrgs: List[str],
output: Path,
verbose: bool,
inverted_page_selection: bool = False,
) -> None:
filename_page_ranges = parse_filepaths_and_pagerange_args(
filename, fn_pgrgs
Expand Down Expand Up @@ -87,8 +91,15 @@ def main(
f"WARNING: Page range {page_range} is out of bounds",
file=sys.stderr,
)
for page_num in range(*page_range.indices(len(reader.pages))):
writer.add_page(reader.pages[page_num])
if inverted_page_selection:
all_page_nums = set(range(len(reader.pages)))
page_nums = set(range(*page_range.indices(len(reader.pages))))
inverted_page_nums = all_page_nums - page_nums
for page_num in inverted_page_nums:
writer.add_page(reader.pages[page_num])
else:
for page_num in range(*page_range.indices(len(reader.pages))):
writer.add_page(reader.pages[page_num])
writer.write(output_fh)
except Exception:
print(traceback.format_exc(), file=sys.stderr)
Expand Down
25 changes: 25 additions & 0 deletions pdfly/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import pdfly.extract_images
import pdfly.metadata
import pdfly.pagemeta
import pdfly.rm
import pdfly.up2
import pdfly.x2pdf

Expand Down Expand Up @@ -104,6 +105,30 @@ def cat(
pdfly.cat.main(filename, fn_pgrgs, output, verbose)


@entry_point.command(name="rm", help=pdfly.rm.__doc__)
def rm(
    filename: Annotated[
        Path,
        typer.Argument(
            exists=True,
            file_okay=True,
            dir_okay=False,
            writable=False,
            readable=True,
            resolve_path=True,
        ),
    ],
    output: Path = typer.Option(..., "-o", "--output"),  # noqa
    fn_pgrgs: List[str] = typer.Argument(  # noqa
        ..., help="filenames and/or page ranges"
    ),
    verbose: bool = typer.Option(
        False, help="show page ranges as they are being read"
    ),
) -> None:
    # CLI entry point for `pdfly rm`: the help text shown to the user is the
    # module docstring of pdfly.rm, and all the work is delegated to
    # pdfly.rm.main. `filename` must be an existing, readable file (not a
    # directory); `--output` / `-o` is a required option.
    pdfly.rm.main(
        filename=filename,
        fn_pgrgs=fn_pgrgs,
        output=output,
        verbose=verbose,
    )


@entry_point.command(name="meta", help=pdfly.metadata.__doc__) # type: ignore[misc]
def metadata(
pdf: Annotated[
Expand Down
50 changes: 50 additions & 0 deletions pdfly/rm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
Remove pages from PDF files.

Page ranges refer to the previously-named file.
A file not followed by a page range means all the pages of the file.

PAGE RANGES are like Python slices.

Remember, page indices start with zero.

Page range expression examples:

: all pages. -1 last page.
22 just the 23rd page. :-1 all but the last page.
0:3 the first three pages. -2 second-to-last page.
:3 the first three pages. -2: last two pages.
5: from the sixth page onward. -3:-1 third & second to last.

The third, "stride" or "step" number is also recognized.

::2 0 2 4 ... to the end. 3:0:-1 3 2 1 but not 0.
1:10:2 1 3 5 7 9 2::-1 2 1 0.
::-1 all pages in reverse order.

Examples
pdfly rm -o output.pdf document.pdf 2:5

Remove pages 2 to 4 from document.pdf, producing output.pdf.

pdfly rm -o output.pdf document.pdf :-1

Remove all pages except the last one from document.pdf, producing output.pdf.

pdfly rm -o output.pdf report.pdf 6

Remove only the seventh page from report.pdf,
producing output.pdf.

"""

from pathlib import Path
from typing import List

from pdfly.cat import main as cat_main


def main(
    filename: Path, fn_pgrgs: List[str], output: Path, verbose: bool
) -> None:
    """Run the ``rm`` command by delegating to the ``cat`` implementation.

    All arguments are forwarded unchanged to ``pdfly.cat.main``; the only
    difference from ``cat`` is that ``inverted_page_selection`` is switched
    on, so the pages matched by a page range are the ones left out.
    """
    cat_main(
        filename=filename,
        fn_pgrgs=fn_pgrgs,
        output=output,
        verbose=verbose,
        inverted_page_selection=True,
    )
229 changes: 229 additions & 0 deletions tests/test_rm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
"""Tests for the `rm` command."""

from pathlib import Path
from typing import List

import pytest
from _pytest.capture import CaptureFixture
from pypdf import PdfReader

from .conftest import RESOURCES_ROOT, chdir, run_cli
from .test_cat import extract_embedded_images


def test_rm_incorrect_number_of_args(
    capsys: CaptureFixture, tmp_path: Path
) -> None:
    """`rm` given only an input file exits with 2 and reports a missing argument."""
    cli_args = ["rm", str(RESOURCES_ROOT / "box.pdf")]
    with chdir(tmp_path):
        exit_code = run_cli(cli_args)
    assert exit_code == 2
    stderr = capsys.readouterr().err
    assert "Missing argument" in stderr


def test_rm_subset_ok(capsys: CaptureFixture, tmp_path: Path) -> None:
    """Removing the range 13:15 yields an output two pages shorter than the input."""
    source_pdf = RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"
    cli_args = ["rm", str(source_pdf), "13:15", "--output", "./out.pdf"]
    with chdir(tmp_path):
        exit_code = run_cli(cli_args)
    captured = capsys.readouterr()
    assert exit_code == 0, captured
    assert not captured.err
    # The relative "./out.pdf" was resolved while tmp_path was the cwd.
    original_page_count = len(PdfReader(source_pdf).pages)
    remaining_page_count = len(PdfReader(tmp_path / "out.pdf").pages)
    assert remaining_page_count == original_page_count - 2


@pytest.mark.parametrize(
    "page_range",
    ["a", "-", "1-", "1-1-1", "1:1:1:1"],
)
def test_rm_subset_invalid_args(
    capsys: CaptureFixture, tmp_path: Path, page_range: str
) -> None:
    """A malformed page range makes `rm` exit with 2 and a descriptive error."""
    cli_args = [
        "rm",
        str(RESOURCES_ROOT / "jpeg.pdf"),
        page_range,
        "--output",
        "./out.pdf",
    ]
    with chdir(tmp_path):
        exit_code = run_cli(cli_args)
    captured = capsys.readouterr()
    assert exit_code == 2, captured
    assert "Invalid file path or page range provided" in captured.err


def test_rm_subset_warn_on_missing_pages(
    capsys: CaptureFixture, tmp_path: Path
) -> None:
    """Asking to remove page index 2 of jpeg.pdf succeeds but warns on stderr.

    Presumably jpeg.pdf has fewer than three pages, so the index is out of
    bounds -- confirm against the resource file.
    """
    cli_args = [
        "rm",
        str(RESOURCES_ROOT / "jpeg.pdf"),
        "2",
        "--output",
        "./out.pdf",
    ]
    with chdir(tmp_path):
        exit_code = run_cli(cli_args)
    captured = capsys.readouterr()
    assert exit_code == 0, captured
    assert "WARN" in captured.err


def test_rm_subset_ensure_reduced_size(
    tmp_path: Path, two_pages_pdf_filepath: Path
) -> None:
    """Dropping either page of the two-page PDF leaves exactly one embedded image."""
    # (page index to remove, name of the resulting single-page file)
    scenarios = (("0", "page1.pdf"), ("1", "page2.pdf"))
    for removed_page, output_name in scenarios:
        output_path = tmp_path / output_name
        exit_code = run_cli(
            [
                "rm",
                str(two_pages_pdf_filepath),
                removed_page,
                "--output",
                str(output_path),
            ]
        )
        assert exit_code == 0
        # The resulting PDF should only contain ONE image:
        embedded_images = extract_embedded_images(output_path)
        assert len(embedded_images) == 1


def test_rm_combine_files(
    pdf_file_100: Path,
    pdf_file_abc: Path,
    tmp_path: Path,
    capsys: CaptureFixture,
) -> None:
    """Each (file, range) pair is inverted on its own and the results are concatenated."""
    output_pdf_path = tmp_path / "out.pdf"
    cli_args = [
        "rm",
        str(pdf_file_100),
        "1:10:2",
        str(pdf_file_abc),
        "::2",
        str(pdf_file_abc),
        "1::2",
        "--output",
        str(output_pdf_path),
    ]

    with chdir(tmp_path):
        # Run pdfly rm command
        exit_code = run_cli(cli_args)
        captured = capsys.readouterr()

    # Check if the command was successful
    assert exit_code == 0, captured.out

    # Read back the text of every page of the produced PDF
    extracted_pages = [
        page.extract_text() for page in PdfReader(output_pdf_path).pages
    ]

    # 100-page file without 1, 3, 5, 7, 9; then the letters file without its
    # even indices; then the letters file without its odd indices.
    kept_numbers = [str(n) for n in (*range(0, 10, 2), *range(10, 100))]
    kept_odd_letters = list("bdfhjlnprtvxz")
    kept_even_letters = list("acegikmoqsuwy")
    assert extracted_pages == kept_numbers + kept_odd_letters + kept_even_letters


@pytest.mark.parametrize(
    ("page_range", "expected"),
    [
        ("22", [str(n) for n in range(100) if n != 22]),
        ("0:3", [str(n) for n in range(3, 100)]),
        (":3", [str(n) for n in range(3, 100)]),
        (":", []),
        ("5:", [str(n) for n in range(5)]),
        ("::2", [str(n) for n in range(1, 100, 2)]),
        (
            "1:10:2",
            [str(n) for n in (*range(0, 10, 2), *range(10, 100))],
        ),
        ("::1", []),
        ("::-1", []),
    ],
)
def test_rm_commands(
    pdf_file_100: Path,
    capsys: CaptureFixture,
    tmp_path: Path,
    page_range: str,
    expected: List[str],
) -> None:
    """`rm` with a single page range keeps exactly the pages outside that range."""
    output_pdf_path = tmp_path / "out.pdf"
    cli_args = [
        "rm",
        str(pdf_file_100),
        page_range,
        "--output",
        str(output_pdf_path),
    ]

    with chdir(tmp_path):
        # Run pdfly rm command
        exit_code = run_cli(cli_args)

    # Check if the command was successful
    assert exit_code == 0

    # Read back the text of every page of the produced PDF
    remaining_pages = [
        page.extract_text() for page in PdfReader(output_pdf_path).pages
    ]

    # Compare the extracted text
    assert remaining_pages == expected
Loading