diff --git a/changes.txt b/changes.txt index 1e3b79228..41c4f21f3 100644 --- a/changes.txt +++ b/changes.txt @@ -2,6 +2,18 @@ Change Log ========== +**Changes in version 1.23.23 (2024-02-14)** + +* Fixed issues: + + * **Fixed** `3150 <https://github.com/pymupdf/PyMuPDF/issues/3150>`_: doc.select() hangs on this doc. + + +* Other: + + * Replaced major code portions previously supporting `Document.select()` with MuPDF function `pdf_rearrange_pages()`, which is faster and performs that task more thoroughly. + + **Changes in version 1.23.22 (2024-02-12)** * Fixed issues: diff --git a/src/__init__.py b/src/__init__.py index c40d77665..dd6d24a7e 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -5385,16 +5385,21 @@ def select(self, pyliste): raise ValueError("is no PDF") if not hasattr(pyliste, "__getitem__"): raise ValueError("sequence required") - if len(pyliste) == 0 or min(pyliste) not in range(len(self)) or max(pyliste) not in range(len(self)): + + valid_range = range(len(self)) + if (len(pyliste) == 0 + or min(pyliste) not in valid_range + or max(pyliste) not in valid_range + ): raise ValueError("bad page number(s)") - # preparatory stuff: - # (1) get underlying pdf document, - # (2) transform Python list into integer array + + # get underlying pdf document, pdf = _as_pdf_document(self) - # call retainpages (code copy of fz_clean_file.c) - retainpages(pdf, pyliste) - if pdf.m_internal.rev_page_map: - mupdf.ll_pdf_drop_page_tree(pdf.m_internal) + + # create page sub-pdf via extra.rearrange_pages2 + extra.rearrange_pages2(pdf, tuple(pyliste)) + + # remove any existing pages with their kids self._reset_page_refs() def set_language(self, language=None): @@ -20862,116 +20867,6 @@ def repair_mono_font(page: "Page", font: "Font") -> None: log("Cannot set width for '%s' in xref %i" % (font.name, xref)) -def retainpage(doc, parent, kids, page): - ''' - Recreate page tree to only retain specified pages. 
- ''' - pageref = mupdf.pdf_lookup_page_obj(doc, page) - mupdf.pdf_flatten_inheritable_page_items(pageref) - mupdf.pdf_dict_put(pageref, PDF_NAME('Parent'), parent) - # Store page object in new kids array - mupdf.pdf_array_push(kids, pageref) - - -def retainpages(doc, liste): - ''' - This is called by PyMuPDF: - liste = page numbers to retain - ''' - argc = len(liste) - pagecount = mupdf.pdf_count_pages(doc) - - # Keep only pages/type and (reduced) dest entries to avoid - # references to dropped pages - oldroot = mupdf.pdf_dict_get(mupdf.pdf_trailer(doc), PDF_NAME('Root')) - pages = mupdf.pdf_dict_get(oldroot, PDF_NAME('Pages')) - olddests = mupdf.pdf_load_name_tree(doc, PDF_NAME('Dests')) - outlines = mupdf.pdf_dict_get(oldroot, PDF_NAME('Outlines')) - ocproperties = mupdf.pdf_dict_get(oldroot, PDF_NAME('OCProperties')) - names_list = None - - root = mupdf.pdf_new_dict(doc, 3) - mupdf.pdf_dict_put(root, PDF_NAME('Type'), mupdf.pdf_dict_get(oldroot, PDF_NAME('Type'))) - mupdf.pdf_dict_put(root, PDF_NAME('Pages'), mupdf.pdf_dict_get(oldroot, PDF_NAME('Pages'))) - if outlines.m_internal: - mupdf.pdf_dict_put(root, PDF_NAME('Outlines'), outlines) - if ocproperties.m_internal: - mupdf.pdf_dict_put(root, PDF_NAME('OCProperties'), ocproperties) - - mupdf.pdf_update_object(doc, mupdf.pdf_to_num(oldroot), root) - - # Create a new kids array with only the pages we want to keep - kids = mupdf.pdf_new_array(doc, 1) - - # Retain pages specified - for page in range(argc): - i = liste[page] - if i < 0 or i >= pagecount: - RAISEPY(MSG_BAD_PAGENO, PyExc_ValueError) - retainpage(doc, pages, kids, i) - - # Update page count and kids array - countobj = mupdf.pdf_new_int(mupdf.pdf_array_len(kids)) - mupdf.pdf_dict_put(pages, PDF_NAME('Count'), countobj) - mupdf.pdf_dict_put(pages, PDF_NAME('Kids'), kids) - - pagecount = mupdf.pdf_count_pages(doc) - page_object_nums = [] - for i in range(pagecount): - pageref = mupdf.pdf_lookup_page_obj(doc, i) - 
page_object_nums.append(mupdf.pdf_to_num(pageref)) - - # If we had an old Dests tree (now reformed as an olddests dictionary), - # keep any entries in there that point to valid pages. - # This may mean we keep more than we need, but it is safe at least. - if olddests: - names = mupdf.pdf_new_dict(doc, 1) - dests = mupdf.pdf_new_dict(doc, 1) - len_ = mupdf.pdf_dict_len(olddests) - - names_list = mupdf.pdf_new_array(doc, 32) - - for i in range(len_): - key = mupdf.pdf_dict_get_key(olddests, i) - val = mupdf.pdf_dict_get_val(olddests, i) - dest = mupdf.pdf_dict_get(val, PDF_NAME('D')) - - dest = mupdf.pdf_array_get(dest if dest.m_internal else val, 0) - # fixme: need dest_is_valid_page. - if dest_is_valid_page(dest, page_object_nums, pagecount): - key_str = mupdf.pdf_new_string(mupdf.pdf_to_name(key), len(mupdf.pdf_to_name(key))) - mupdf.pdf_array_push(names_list, key_str) - mupdf.pdf_array_push(names_list, val) - - mupdf.pdf_dict_put(dests, PDF_NAME('Names'), names_list) - mupdf.pdf_dict_put(names, PDF_NAME('Dests'), dests) - mupdf.pdf_dict_put(root, PDF_NAME('Names'), names) - - # Edit each pages /Annot list to remove any links pointing to nowhere. - for i in range(pagecount): - pageref = mupdf.pdf_lookup_page_obj(doc, i) - annots = mupdf.pdf_dict_get(pageref, PDF_NAME('Annots')) - len_ = mupdf.pdf_array_len(annots) - j = 0 - while 1: - if j >= len_: - break - o = mupdf.pdf_array_get(annots, j) - - if not mupdf.pdf_name_eq(mupdf.pdf_dict_get(o, PDF_NAME('Subtype')), PDF_NAME('Link')): - continue - - if not dest_is_valid(o, pagecount, page_object_nums, names_list): - # Remove this annotation - mupdf.pdf_array_delete(annots, j) - len_ -= 1 - j -= 1 - j += 1 - - if strip_outlines( doc, outlines, pagecount, page_object_nums, names_list) == 0: - mupdf.pdf_dict_del(root, PDF_NAME('Outlines')) - - def sRGB_to_pdf(srgb: int) -> tuple: """Convert sRGB color code to a PDF color triple. 
diff --git a/src/extra.i b/src/extra.i index ef39f0ad0..dd3dc73c9 100644 --- a/src/extra.i +++ b/src/extra.i @@ -166,6 +166,21 @@ PyObject* JM_EscapeStrFromBuffer(fz_buffer* buff) return val; } +void rearrange_pages2( + mupdf::PdfDocument& doc, + PyObject *new_pages + ) +{ + int len = (int) PyTuple_Size(new_pages); + int *pages = (int *) malloc((int) len * sizeof(int)); + int i; + for (i = 0; i < len; i++) { + pages[i] = (int) PyLong_AsLong(PyTuple_GET_ITEM(new_pages, (Py_ssize_t) i)); + } + mupdf::pdf_rearrange_pages(doc, len, pages); + free(pages); +} + //---------------------------------------------------------------------------- // Deep-copies a source page to the target. @@ -4515,3 +4530,5 @@ fz_image* fz_new_image_from_compressed_buffer( fz_compressed_buffer *buffer, fz_image *mask ); + +void rearrange_pages2( mupdf::PdfDocument& doc, PyObject *new_pages); diff --git a/src_classic/fitz_old.i b/src_classic/fitz_old.i index 4da74e788..2afbe62dd 100644 --- a/src_classic/fitz_old.i +++ b/src_classic/fitz_old.i @@ -2297,7 +2297,8 @@ if not self.is_pdf: if not hasattr(pyliste, "__getitem__"): raise ValueError("sequence required") if len(pyliste) == 0 or min(pyliste) not in range(len(self)) or max(pyliste) not in range(len(self)): - raise ValueError("bad page number(s)")%} + raise ValueError("bad page number(s)") +pyliste = tuple(pyliste)%} %pythonappend select %{self._reset_page_refs()%} PyObject *select(PyObject *pyliste) { @@ -2306,17 +2307,23 @@ if len(pyliste) == 0 or min(pyliste) not in range(len(self)) or max(pyliste) not // (2) transform Python list into integer array pdf_document *pdf = pdf_specifics(gctx, (fz_document *) $self); + int *pages = NULL; fz_try(gctx) { // call retainpages (code copy of fz_clean_file.c) - globals glo = {0}; - glo.ctx = gctx; - glo.doc = pdf; - retainpages(gctx, &glo, pyliste); + int i, len = (int) PyTuple_Size(pyliste); + pages = fz_realloc_array(gctx, pages, len, int); + for (i = 0; i < len; i++) { + pages[i] = (int) 
PyLong_AsLong(PyTuple_GET_ITEM(pyliste, (Py_ssize_t) i)); + } + pdf_rearrange_pages(gctx, pdf, len, pages); if (pdf->rev_page_map) { pdf_drop_page_tree(gctx, pdf); } } + fz_always(gctx) { + fz_free(gctx, pages); + } fz_catch(gctx) { return NULL; } diff --git a/src_classic/helper-select.i b/src_classic/helper-select.i index 33400dfb2..2a547649e 100644 --- a/src_classic/helper-select.i +++ b/src_classic/helper-select.i @@ -9,329 +9,6 @@ # maintained and developed by Artifex Software, Inc. https://artifex.com. # ------------------------------------------------------------------------ */ -//---------------------------------------------------------------------------- -// Helpers for document page selection - main logic was imported -// from pdf_clean_file.c. But instead of analyzing a string-based spec of -// selected pages, we accept a Python sequence. -//---------------------------------------------------------------------------- -typedef struct globals_s -{ - pdf_document *doc; - fz_context *ctx; -} globals; - -int string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list) -{ - int n = pdf_array_len(ctx, names_list); - int i; - const char *str = pdf_to_text_string(ctx, p); - - for (i = 0; i < n ; i += 2) - { - if (!strcmp(pdf_to_text_string(ctx, pdf_array_get(ctx, names_list, i)), str)) - return 1; - } - return 0; -} - -//---------------------------------------------------------------------------- -// Recreate page tree to only retain specified pages. 
-//---------------------------------------------------------------------------- -void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_obj *kids, int page) -{ - pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, page); - - pdf_flatten_inheritable_page_items(ctx, pageref); - - pdf_dict_put(ctx, pageref, PDF_NAME(Parent), parent); - - /* Store page object in new kids array */ - pdf_array_push(ctx, kids, pageref); -} - -int dest_is_valid_page(fz_context *ctx, pdf_obj *obj, int *page_object_nums, int pagecount) -{ - int i; - int num = pdf_to_num(ctx, obj); - - if (num == 0) - return 0; - for (i = 0; i < pagecount; i++) - { - if (page_object_nums[i] == num) - return 1; - } - return 0; -} - -int dest_is_valid(fz_context *ctx, pdf_obj *o, int page_count, int *page_object_nums, pdf_obj *names_list) -{ - pdf_obj *p; - - p = pdf_dict_get(ctx, o, PDF_NAME(A)); - if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME(S)), PDF_NAME(GoTo)) && - !string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME(D)), names_list)) - return 0; - - p = pdf_dict_get(ctx, o, PDF_NAME(Dest)); - if (p == NULL) - {} - else if (pdf_is_string(ctx, p)) - { - return string_in_names_list(ctx, p, names_list); - } - else if (!dest_is_valid_page(ctx, pdf_array_get(ctx, p, 0), page_object_nums, page_count)) - return 0; - - return 1; -} - -int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list); - -int strip_outline(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_obj **pfirst, pdf_obj **plast) -{ - pdf_obj *prev = NULL; - pdf_obj *first = NULL; - pdf_obj *current; - int count = 0; - - for (current = outlines; current != NULL; ) - { - int nc; - - /*********************************************************************/ - // Strip any children to start with. This takes care of - // First / Last / Count for us. 
- /*********************************************************************/ - nc = strip_outlines(ctx, doc, current, page_count, page_object_nums, names_list); - - if (!dest_is_valid(ctx, current, page_count, page_object_nums, names_list)) - { - if (nc == 0) - { - /*************************************************************/ - // Outline with invalid dest and no children. Drop it by - // pulling the next one in here. - /*************************************************************/ - pdf_obj *next = pdf_dict_get(ctx, current, PDF_NAME(Next)); - if (next == NULL) - { - // There is no next one to pull in - if (prev != NULL) - pdf_dict_del(ctx, prev, PDF_NAME(Next)); - } - else if (prev != NULL) - { - pdf_dict_put(ctx, prev, PDF_NAME(Next), next); - pdf_dict_put(ctx, next, PDF_NAME(Prev), prev); - } - else - { - pdf_dict_del(ctx, next, PDF_NAME(Prev)); - } - current = next; - } - else - { - // Outline with invalid dest, but children. Just drop the dest. - pdf_dict_del(ctx, current, PDF_NAME(Dest)); - pdf_dict_del(ctx, current, PDF_NAME(A)); - current = pdf_dict_get(ctx, current, PDF_NAME(Next)); - } - } - else - { - // Keep this one - if (first == NULL) - first = current; - prev = current; - current = pdf_dict_get(ctx, current, PDF_NAME(Next)); - count++; - } - } - - *pfirst = first; - *plast = prev; - - return count; -} - -int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list) -{ - int nc; - pdf_obj *first; - pdf_obj *last; - - if (outlines == NULL) - return 0; - - first = pdf_dict_get(ctx, outlines, PDF_NAME(First)); - if (first == NULL) - nc = 0; - else - nc = strip_outline(ctx, doc, first, page_count, page_object_nums, - names_list, &first, &last); - - if (nc == 0) - { - pdf_dict_del(ctx, outlines, PDF_NAME(First)); - pdf_dict_del(ctx, outlines, PDF_NAME(Last)); - pdf_dict_del(ctx, outlines, PDF_NAME(Count)); - } - else - { - int old_count = pdf_to_int(ctx, pdf_dict_get(ctx, outlines, 
PDF_NAME(Count))); - pdf_dict_put(ctx, outlines, PDF_NAME(First), first); - pdf_dict_put(ctx, outlines, PDF_NAME(Last), last); - pdf_dict_put_drop(ctx, outlines, PDF_NAME(Count), pdf_new_int(ctx, old_count > 0 ? nc : -nc)); - } - return nc; -} - -//---------------------------------------------------------------------------- -// This is called by PyMuPDF: -// liste = page numbers to retain -//---------------------------------------------------------------------------- -void retainpages(fz_context *ctx, globals *glo, PyObject *liste) -{ - pdf_obj *oldroot, *root, *pages, *kids, *countobj, *olddests; - Py_ssize_t argc = PySequence_Size(liste); - pdf_document *doc = glo->doc; - pdf_obj *names_list = NULL; - pdf_obj *outlines; - pdf_obj *ocproperties; - int pagecount = pdf_count_pages(ctx, doc); - - int i; - int *page_object_nums; - -/******************************************************************************/ -// Keep only pages/type and (reduced) dest entries to avoid -// references to dropped pages -/******************************************************************************/ - oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)); - pages = pdf_dict_get(ctx, oldroot, PDF_NAME(Pages)); - olddests = pdf_load_name_tree(ctx, doc, PDF_NAME(Dests)); - outlines = pdf_dict_get(ctx, oldroot, PDF_NAME(Outlines)); - ocproperties = pdf_dict_get(ctx, oldroot, PDF_NAME(OCProperties)); - - root = pdf_new_dict(ctx, doc, 3); - pdf_dict_put(ctx, root, PDF_NAME(Type), pdf_dict_get(ctx, oldroot, PDF_NAME(Type))); - pdf_dict_put(ctx, root, PDF_NAME(Pages), pdf_dict_get(ctx, oldroot, PDF_NAME(Pages))); - if (outlines) - pdf_dict_put(ctx, root, PDF_NAME(Outlines), outlines); - if (ocproperties) - pdf_dict_put(ctx, root, PDF_NAME(OCProperties), ocproperties); - - pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root); - - // Create a new kids array with only the pages we want to keep - kids = pdf_new_array(ctx, doc, 1); - - // Retain pages specified - Py_ssize_t 
page; - fz_try(ctx) { - for (page = 0; page < argc; page++) { - i = (int) PyInt_AsLong(PySequence_ITEM(liste, page)); - if (i < 0 || i >= pagecount) { - RAISEPY(ctx, MSG_BAD_PAGENO, PyExc_ValueError); - } - retainpage(ctx, doc, pages, kids, i); - } - } - fz_catch(ctx) { - fz_rethrow(ctx); - } - - // Update page count and kids array - countobj = pdf_new_int(ctx, pdf_array_len(ctx, kids)); - pdf_dict_put_drop(ctx, pages, PDF_NAME(Count), countobj); - pdf_dict_put_drop(ctx, pages, PDF_NAME(Kids), kids); - - pagecount = pdf_count_pages(ctx, doc); - page_object_nums = fz_calloc(ctx, pagecount, sizeof(*page_object_nums)); - for (i = 0; i < pagecount; i++) - { - pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); - page_object_nums[i] = pdf_to_num(ctx, pageref); - } - -/******************************************************************************/ -// If we had an old Dests tree (now reformed as an olddests dictionary), -// keep any entries in there that point to valid pages. -// This may mean we keep more than we need, but it is safe at least. -/******************************************************************************/ - if (olddests) - { - pdf_obj *names = pdf_new_dict(ctx, doc, 1); - pdf_obj *dests = pdf_new_dict(ctx, doc, 1); - int len = pdf_dict_len(ctx, olddests); - - names_list = pdf_new_array(ctx, doc, 32); - - for (i = 0; i < len; i++) - { - pdf_obj *key = pdf_dict_get_key(ctx, olddests, i); - pdf_obj *val = pdf_dict_get_val(ctx, olddests, i); - pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME(D)); - - dest = pdf_array_get(ctx, dest ? 
dest : val, 0); - if (dest_is_valid_page(ctx, dest, page_object_nums, pagecount)) - { - pdf_obj *key_str = pdf_new_string(ctx, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key))); - pdf_array_push_drop(ctx, names_list, key_str); - pdf_array_push(ctx, names_list, val); - } - } - - pdf_dict_put(ctx, dests, PDF_NAME(Names), names_list); - pdf_dict_put(ctx, names, PDF_NAME(Dests), dests); - pdf_dict_put(ctx, root, PDF_NAME(Names), names); - - pdf_drop_obj(ctx, names); - pdf_drop_obj(ctx, dests); - pdf_drop_obj(ctx, olddests); - } - -/*****************************************************************************/ -// Edit each pages /Annot list to remove any links pointing to nowhere. -/*****************************************************************************/ - for (i = 0; i < pagecount; i++) - { - pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i); - - pdf_obj *annots = pdf_dict_get(ctx, pageref, PDF_NAME(Annots)); - - int len = pdf_array_len(ctx, annots); - int j; - - for (j = 0; j < len; j++) - { - pdf_obj *o = pdf_array_get(ctx, annots, j); - - if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME(Subtype)), PDF_NAME(Link))) - continue; - - if (!dest_is_valid(ctx, o, pagecount, page_object_nums, names_list)) - { - // Remove this annotation - pdf_array_delete(ctx, annots, j); - len--; - j--; - } - } - } - - if (strip_outlines(ctx, doc, outlines, pagecount, page_object_nums, names_list) == 0) - { - pdf_dict_del(ctx, root, PDF_NAME(Outlines)); - } - - fz_free(ctx, page_object_nums); - pdf_drop_obj(ctx, names_list); - pdf_drop_obj(ctx, root); -} - void remove_dest_range(fz_context *ctx, pdf_document *pdf, PyObject *numbers) { fz_try(ctx) { diff --git a/tests/resources/test-3150.pdf b/tests/resources/test-3150.pdf new file mode 100644 index 000000000..24102aaf3 Binary files /dev/null and b/tests/resources/test-3150.pdf differ diff --git a/tests/test_pagedelete.py b/tests/test_pagedelete.py index fd3b040e6..b65a719df 100644 --- a/tests/test_pagedelete.py +++ 
b/tests/test_pagedelete.py @@ -13,10 +13,12 @@ - the remaining TOC items still point to the correct page - the document has no more links at all """ + import os import fitz +scriptdir = os.path.dirname(__file__) page_count = 100 # initial document length r = range(5, 35, 5) # contains page numbers we will delete # insert this link on pages after first deleted one @@ -70,8 +72,22 @@ def test_deletion(): doc.move_page(0) doc.fullcopy_page(0) + def test_3094(): - path = os.path.abspath(f'{__file__}/../../tests/resources/test_2871.pdf') + path = os.path.abspath(f"{__file__}/../../tests/resources/test_2871.pdf") document = fitz.open(path) pnos = [i for i in range(0, document.page_count, 2)] document.delete_pages(pnos) + + +def test_3150(): + """Assert correct functioning for problem file. + + Implicitly also check use of new MuPDF function + pdf_rearrange_pages() since version 1.23.9. + """ + filename = os.path.join(scriptdir, "resources", "test-3150.pdf") + pages = [3, 3, 3, 2, 3, 1, 0, 0] + doc = fitz.open(filename) + doc.select(pages) + assert doc.page_count == len(pages)