Re-Implement select method

This re-implements the Document method "select()" on top of the new MuPDF function "pdf_rearrange_pages()". It is a more complete (and faster) implementation of what needs to be done here: in addition to rearranging the pages, it also makes the consequential changes to the table of contents, to links pointing to removed pages, and to affected entries in the Optional Content definitions. Update __init__.py
pymupdf · Feb 17, 2024 · ff1bdbb · ff1bdbb
1 parent 0fd9594
commit ff1bdbb
Show file tree

Hide file tree

Showing 7 changed files with 71 additions and 447 deletions.
diff --git a/changes.txt b/changes.txt
@@ -2,6 +2,18 @@ Change Log
 ==========
 
 
+**Changes in version 1.23.23 (2024-02-14)**
+
+* Fixed issues:
+
+ * **Fixed** `3150 <https://github.com/pymupdf/PyMuPDF/issues/3150>`_: doc.select() hangs on this doc.
+
+
+* Other:
+
+ * Replaced major code portions previously supporting `Document.select()` with the MuPDF function `pdf_rearrange_pages()`, which is faster and performs that task more thoroughly.
+
+
 **Changes in version 1.23.22 (2024-02-12)**
 
 * Fixed issues:

diff --git a/src/__init__.py b/src/__init__.py
@@ -5385,16 +5385,21 @@ def select(self, pyliste):
  raise ValueError("is no PDF")
  if not hasattr(pyliste, "__getitem__"):
  raise ValueError("sequence required")
- if len(pyliste) == 0 or min(pyliste) not in range(len(self)) or max(pyliste) not in range(len(self)):
+
+ valid_range = range(len(self))
+ if (len(pyliste) == 0
+ or min(pyliste) not in valid_range
+ or max(pyliste) not in valid_range
+ ):
  raise ValueError("bad page number(s)")
- # preparatory stuff:
- # (1) get underlying pdf document,
- # (2) transform Python list into integer array
+
+ # get underlying pdf document,
  pdf = _as_pdf_document(self)
- # call retainpages (code copy of fz_clean_file.c)
- retainpages(pdf, pyliste)
- if pdf.m_internal.rev_page_map:
- mupdf.ll_pdf_drop_page_tree(pdf.m_internal)
+
+ # create page sub-pdf via extra.rearrange_pages2
+ extra.rearrange_pages2(pdf, tuple(pyliste))
+
+ # remove any existing pages with their kids
  self._reset_page_refs()
 
  def set_language(self, language=None):
@@ -20862,116 +20867,6 @@ def repair_mono_font(page: "Page", font: "Font") -> None:
  log("Cannot set width for '%s' in xref %i" % (font.name, xref))
 
 
-def retainpage(doc, parent, kids, page):
- '''
- Recreate page tree to only retain specified pages.
- '''
- pageref = mupdf.pdf_lookup_page_obj(doc, page)
- mupdf.pdf_flatten_inheritable_page_items(pageref)
- mupdf.pdf_dict_put(pageref, PDF_NAME('Parent'), parent)
- # Store page object in new kids array
- mupdf.pdf_array_push(kids, pageref)
-
-
-def retainpages(doc, liste):
- '''
- This is called by PyMuPDF:
- liste = page numbers to retain
- '''
- argc = len(liste)
- pagecount = mupdf.pdf_count_pages(doc)
-
- # Keep only pages/type and (reduced) dest entries to avoid
- # references to dropped pages
- oldroot = mupdf.pdf_dict_get(mupdf.pdf_trailer(doc), PDF_NAME('Root'))
- pages = mupdf.pdf_dict_get(oldroot, PDF_NAME('Pages'))
- olddests = mupdf.pdf_load_name_tree(doc, PDF_NAME('Dests'))
- outlines = mupdf.pdf_dict_get(oldroot, PDF_NAME('Outlines'))
- ocproperties = mupdf.pdf_dict_get(oldroot, PDF_NAME('OCProperties'))
- names_list = None
-
- root = mupdf.pdf_new_dict(doc, 3)
- mupdf.pdf_dict_put(root, PDF_NAME('Type'), mupdf.pdf_dict_get(oldroot, PDF_NAME('Type')))
- mupdf.pdf_dict_put(root, PDF_NAME('Pages'), mupdf.pdf_dict_get(oldroot, PDF_NAME('Pages')))
- if outlines.m_internal:
- mupdf.pdf_dict_put(root, PDF_NAME('Outlines'), outlines)
- if ocproperties.m_internal:
- mupdf.pdf_dict_put(root, PDF_NAME('OCProperties'), ocproperties)
-
- mupdf.pdf_update_object(doc, mupdf.pdf_to_num(oldroot), root)
-
- # Create a new kids array with only the pages we want to keep
- kids = mupdf.pdf_new_array(doc, 1)
-
- # Retain pages specified
- for page in range(argc):
- i = liste[page]
- if i < 0 or i >= pagecount:
- RAISEPY(MSG_BAD_PAGENO, PyExc_ValueError)
- retainpage(doc, pages, kids, i)
-
- # Update page count and kids array
- countobj = mupdf.pdf_new_int(mupdf.pdf_array_len(kids))
- mupdf.pdf_dict_put(pages, PDF_NAME('Count'), countobj)
- mupdf.pdf_dict_put(pages, PDF_NAME('Kids'), kids)
-
- pagecount = mupdf.pdf_count_pages(doc)
- page_object_nums = []
- for i in range(pagecount):
- pageref = mupdf.pdf_lookup_page_obj(doc, i)
- page_object_nums.append(mupdf.pdf_to_num(pageref))
-
- # If we had an old Dests tree (now reformed as an olddests dictionary),
- # keep any entries in there that point to valid pages.
- # This may mean we keep more than we need, but it is safe at least.
- if olddests:
- names = mupdf.pdf_new_dict(doc, 1)
- dests = mupdf.pdf_new_dict(doc, 1)
- len_ = mupdf.pdf_dict_len(olddests)
-
- names_list = mupdf.pdf_new_array(doc, 32)
-
- for i in range(len_):
- key = mupdf.pdf_dict_get_key(olddests, i)
- val = mupdf.pdf_dict_get_val(olddests, i)
- dest = mupdf.pdf_dict_get(val, PDF_NAME('D'))
-
- dest = mupdf.pdf_array_get(dest if dest.m_internal else val, 0)
- # fixme: need dest_is_valid_page.
- if dest_is_valid_page(dest, page_object_nums, pagecount):
- key_str = mupdf.pdf_new_string(mupdf.pdf_to_name(key), len(mupdf.pdf_to_name(key)))
- mupdf.pdf_array_push(names_list, key_str)
- mupdf.pdf_array_push(names_list, val)
-
- mupdf.pdf_dict_put(dests, PDF_NAME('Names'), names_list)
- mupdf.pdf_dict_put(names, PDF_NAME('Dests'), dests)
- mupdf.pdf_dict_put(root, PDF_NAME('Names'), names)
-
- # Edit each pages /Annot list to remove any links pointing to nowhere.
- for i in range(pagecount):
- pageref = mupdf.pdf_lookup_page_obj(doc, i)
- annots = mupdf.pdf_dict_get(pageref, PDF_NAME('Annots'))
- len_ = mupdf.pdf_array_len(annots)
- j = 0
- while 1:
- if j >= len_:
- break
- o = mupdf.pdf_array_get(annots, j)
-
- if not mupdf.pdf_name_eq(mupdf.pdf_dict_get(o, PDF_NAME('Subtype')), PDF_NAME('Link')):
- continue
-
- if not dest_is_valid(o, pagecount, page_object_nums, names_list):
- # Remove this annotation
- mupdf.pdf_array_delete(annots, j)
- len_ -= 1
- j -= 1
- j += 1
-
- if strip_outlines( doc, outlines, pagecount, page_object_nums, names_list) == 0:
- mupdf.pdf_dict_del(root, PDF_NAME('Outlines'))
-
-
 def sRGB_to_pdf(srgb: int) -> tuple:
  """Convert sRGB color code to a PDF color triple.
 

diff --git a/src/extra.i b/src/extra.i
@@ -166,6 +166,21 @@ PyObject* JM_EscapeStrFromBuffer(fz_buffer* buff)
  return val;
 }
 
+void rearrange_pages2(
+ mupdf::PdfDocument& doc,
+ PyObject *new_pages
+ )
+{
+ int len = (int) PyTuple_Size(new_pages);
+ int *pages = (int *) malloc((int) len * sizeof(int));
+ int i;
+ for (i = 0; i < len; i++) {
+ pages[i] = (int) PyLong_AsLong(PyTuple_GET_ITEM(new_pages, (Py_ssize_t) i));
+ }
+ mupdf::pdf_rearrange_pages(doc, len, pages);
+ free(pages);
+}
+
 
 //----------------------------------------------------------------------------
 // Deep-copies a source page to the target.
@@ -4515,3 +4530,5 @@ fz_image* fz_new_image_from_compressed_buffer(
  fz_compressed_buffer *buffer,
  fz_image *mask
  );
+
+void rearrange_pages2( mupdf::PdfDocument& doc, PyObject *new_pages);
diff --git a/src_classic/fitz_old.i b/src_classic/fitz_old.i
@@ -2297,7 +2297,8 @@ if not self.is_pdf:
 if not hasattr(pyliste, "__getitem__"):
  raise ValueError("sequence required")
 if len(pyliste) == 0 or min(pyliste) not in range(len(self)) or max(pyliste) not in range(len(self)):
- raise ValueError("bad page number(s)")%}
+ raise ValueError("bad page number(s)")
+pyliste = tuple(pyliste)%}
  %pythonappend select %{self._reset_page_refs()%}
  PyObject *select(PyObject *pyliste)
  {
@@ -2306,17 +2307,23 @@ if len(pyliste) == 0 or min(pyliste) not in range(len(self)) or max(pyliste) not
  // (2) transform Python list into integer array
 
  pdf_document *pdf = pdf_specifics(gctx, (fz_document *) $self);
+ int *pages = NULL;
  fz_try(gctx) {
  // call retainpages (code copy of fz_clean_file.c)
- globals glo = {0};
- glo.ctx = gctx;
- glo.doc = pdf;
- retainpages(gctx, &glo, pyliste);
+ int i, len = (int) PyTuple_Size(pyliste);
+ pages = fz_realloc_array(gctx, pages, len, int);
+ for (i = 0; i < len; i++) {
+ pages[i] = (int) PyLong_AsLong(PyTuple_GET_ITEM(pyliste, (Py_ssize_t) i));
+ }
+ pdf_rearrange_pages(gctx, pdf, len, pages);
  if (pdf->rev_page_map)
  {
  pdf_drop_page_tree(gctx, pdf);
  }
  }
+ fz_always(gctx) {
+ fz_free(gctx, pages);
+ }
  fz_catch(gctx) {
  return NULL;
  }