Skip to content

Commit

Permalink
Merge branch 'kovidgoyal:master' into master
Browse files: browse the repository at this point in the history
  • Loading branch information
cbhaley authored Sep 18, 2024
2 parents fe2fbfa + cea3db2 commit 56531fe
Showing 1 changed file with 53 additions and 151 deletions.
204 changes: 53 additions & 151 deletions src/calibre/srv/render_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,15 @@
import json
import os
import sys
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from functools import partial
from itertools import count
from math import ceil

from lxml.etree import Comment

from calibre import detect_ncpus, force_unicode, prepare_string_for_xml
from calibre.constants import iswindows
from calibre.customize.ui import plugin_for_input_format
from calibre.ebooks.oeb.base import EPUB, OEB_DOCS, OEB_STYLES, OPF, SMIL, XHTML, XHTML_NS, XLINK, rewrite_links, urlunquote
from calibre.ebooks.oeb.base import XPath as _XPath
Expand All @@ -24,14 +22,10 @@
from calibre.ebooks.oeb.polish.cover import find_cover_image, find_cover_image_in_page, find_cover_page
from calibre.ebooks.oeb.polish.toc import from_xpaths, get_landmarks, get_toc
from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.srv.metadata import encode_datetime
from calibre.srv.opts import grouper
from calibre.utils.date import EPOCH
from calibre.utils.filenames import rmtree
from calibre.utils.ipc.simple_worker import start_pipe_worker
from calibre.utils.logging import default_log
from calibre.utils.serialize import json_dumps, json_loads, msgpack_dumps, msgpack_loads
from calibre.utils.serialize import json_dumps, json_loads, msgpack_loads
from calibre.utils.short_uuid import uuid4
from calibre_extensions.fast_css_transform import transform_properties
from polyglot.binary import as_base64_unicode as encode_component
Expand Down Expand Up @@ -501,102 +495,6 @@ def handle_link(a, attr='href'):
f.write(shtml)


class RenderManager:
    """Run process_book_files() over groups of names using helper processes.

    Intended to be used as a context manager: entering creates a temporary
    directory and an empty worker list, exiting stops any worker that is still
    running and removes the temporary directory. Calling the instance splits
    the names between the launched workers and the current process and
    returns a list with one result per processed group.
    """

    def __init__(self, max_workers):
        # A falsy value means "no explicit cap" (see launch_workers)
        self.max_workers = max_workers

    def launch_worker(self):
        """Start one worker process and append it to self.workers.

        The worker gets two files in self.tdir, named after its index:
        ``<index>.json`` (where it writes its result) and ``<index>.error``
        (which receives its stdout and stderr).
        """
        index = len(self.workers)
        output_path = os.path.join(self.tdir, f'{index}.json')
        error_path = os.path.join(self.tdir, f'{index}.error')
        # Both files are closed in this process as soon as the worker has been
        # started. The worker is presumably a subprocess.Popen-like object that
        # holds its own copy of the redirected handle (TODO confirm against
        # start_pipe_worker); keeping ours open would leak one file handle per
        # worker and can get in the way of removing self.tdir later.
        with open(output_path, 'wb') as output, open(error_path, 'wb') as error:
            p = start_pipe_worker('from calibre.srv.render_book import worker_main; worker_main()', stdout=error, stderr=error)
            p.output_path = output.name
            p.error_path = error.name
        self.workers.append(p)

    def __enter__(self):
        self.workers = []
        self.tdir = PersistentTemporaryDirectory()
        return self

    def __exit__(self, *a):
        while self.workers:
            p = self.workers.pop()
            if p.poll() is not None:
                # Already exited, nothing to stop
                continue
            p.terminate()
            if not iswindows and p.poll() is None:
                # Give the process a moment to react to terminate() before
                # resorting to kill()
                time.sleep(0.02)
                if p.poll() is None:
                    p.kill()
        del self.workers
        try:
            rmtree(self.tdir)
        except OSError:
            # Best effort cleanup: retry once after a short pause, then give up
            time.sleep(0.1)
            try:
                rmtree(self.tdir)
            except OSError:
                pass
        del self.tdir

    def launch_workers(self, names, in_process_container):
        """Decide how many parallel groups to use and start the workers.

        The count is capped by detect_ncpus(), by len(names) and by
        self.max_workers (when truthy). With fewer than three names, or less
        than 128KB of files in total, everything is done in-process. One group
        is always handled by the current process, so only ``count - 1``
        worker processes are started.

        Returns the number of groups, including the in-process one.
        """
        num_workers = min(detect_ncpus(), len(names))
        if self.max_workers:
            num_workers = min(num_workers, self.max_workers)
        if num_workers > 1:
            if len(names) < 3 or sum(os.path.getsize(in_process_container.name_path_map[n]) for n in names) < 128 * 1024:
                num_workers = 1
        if num_workers > 1:
            num_other_workers = num_workers - 1
            while len(self.workers) < num_other_workers:
                self.launch_worker()
        return num_workers

    def __call__(self, names, args, in_process_container):
        """Process names, returning a list of process_book_files() results.

        args is the tuple of positional arguments passed to
        process_book_files() after the names. Raises Exception with the
        contents of the worker's error file if a worker exits with a non-zero
        status (when several fail, the last one read is reported).
        """
        num_workers = len(self.workers) + 1
        if num_workers == 1:
            return [process_book_files(names, *args, container=in_process_container)]

        group_sz = int(ceil(len(names) / num_workers))
        groups = tuple(grouper(group_sz, names))
        # All groups but the last go to worker processes, the last one is
        # handled in this process further down
        for group, worker in zip(groups[:-1], self.workers):
            worker.stdin.write(as_bytes(msgpack_dumps((worker.output_path, group,) + args)))
            worker.stdin.flush()
            worker.stdin.close()
            worker.job_sent = True

        # Workers that did not get a group are told to exit immediately
        # (worker_main() returns when it reads b'_')
        for worker in self.workers:
            if not hasattr(worker, 'job_sent'):
                worker.stdin.write(b'_')
                worker.stdin.flush()
                worker.stdin.close()

        error = None
        results = [process_book_files(groups[-1], *args, container=in_process_container)]
        for worker in self.workers:
            if not hasattr(worker, 'job_sent'):
                worker.wait()
                continue
            if worker.wait() != 0:
                with open(worker.error_path, 'rb') as f:
                    error = f.read().decode('utf-8', 'replace')
            else:
                with open(worker.output_path, 'rb') as f:
                    results.append(msgpack_loads(f.read()))
        if error is not None:
            raise Exception('Render worker failed with error:\n' + error)
        return results


def worker_main():
    """Entry point executed inside a render worker process.

    Reads the whole of stdin (its binary buffer when there is one). A payload
    of exactly b'_' means there is no job, and the function returns without
    doing anything. Otherwise the payload is decoded with msgpack_loads(): the
    first item is the path of the file to write the result to and the
    remaining items are the arguments for process_book_files().
    """
    payload = getattr(sys.stdin, 'buffer', sys.stdin).read()
    if payload != b'_':
        job = msgpack_loads(payload)
        rendered = process_book_files(*job[1:])
        with open(job[0], 'wb') as out:
            out.write(as_bytes(msgpack_dumps(rendered)))


def virtualize_html(container, name, link_uid, link_to_map, virtualized_names):

changed = set()
Expand Down Expand Up @@ -634,7 +532,7 @@ def handle_link(a, attr='href'):
__smil_file_names__ = ''


def process_book_files(names, container_dir, opfpath, virtualize_resources, link_uid, data_for_clone, container=None):
def process_book_files(names, container_dir, opfpath, virtualize_resources, link_uid, data_for_clone=None, container=None):
if container is None:
container = SimpleContainer(container_dir, opfpath, default_log, clone_data=data_for_clone)
container.cloned = False
Expand Down Expand Up @@ -664,9 +562,19 @@ def process_book_files(names, container_dir, opfpath, virtualize_resources, link
return link_to_map, html_data, virtualized_names, smil_map


def calculate_number_of_workers(names, in_process_container, max_workers):
    """Return how many workers should be used to process names.

    The count never exceeds detect_ncpus(), len(names) or max_workers (a falsy
    max_workers means no explicit cap). When more than one worker would be
    used, the count drops back to 1 if there are fewer than three names or if
    the files for the names add up to less than 128KB on disk.
    """
    limit = min(detect_ncpus(), len(names))
    if max_workers:
        limit = min(limit, max_workers)
    if limit <= 1:
        return limit
    if len(names) < 3:
        return 1
    total_size = 0
    for name in names:
        total_size += os.path.getsize(in_process_container.name_path_map[name])
    return 1 if total_size < 128 * 1024 else limit


def process_exploded_book(
book_fmt, opfpath, input_fmt, tdir, render_manager, log=None, book_hash=None, save_bookmark_data=False,
book_metadata=None, virtualize_resources=True
book_fmt, opfpath, input_fmt, tdir, log=None, book_hash=None, save_bookmark_data=False,
book_metadata=None, virtualize_resources=True, max_workers=1
):
log = log or default_log
container = SimpleContainer(tdir, opfpath, log)
Expand All @@ -676,15 +584,8 @@ def process_exploded_book(
def needs_work(mt):
return mt in OEB_STYLES or mt in OEB_DOCS or mt in ('image/svg+xml', 'application/smil', 'application/smil+xml')

def work_priority(name):
    """Sort key for a name: (0 for HTML documents else 1, file size in bytes).

    Names whose mime type is in OEB_DOCS sort before all others; within each
    of the two classes names sort by the size of their file on disk.
    """
    # Author's stated intent: ensure workers with large files or stylesheets
    # get fewer names
    # Note: no trailing comma here, size must be a plain integer, not a 1-tuple
    size = os.path.getsize(container.name_path_map[name])
    is_html = container.mime_map.get(name) in OEB_DOCS
    return (0 if is_html else 1), size

if not is_comic:
render_manager.launch_workers(tuple(n for n, mt in iteritems(container.mime_map) if needs_work(mt)), container)
names_that_need_work = tuple(n for n, mt in iteritems(container.mime_map) if needs_work(mt))
num_workers = calculate_number_of_workers(names_that_need_work, container, max_workers)

bookmark_data = None
if save_bookmark_data:
Expand Down Expand Up @@ -741,15 +642,17 @@ def work_priority(name):
'page_list_anchor_map': pagelist_anchor_map(page_list),
}

names = sorted(
(n for n, mt in iteritems(container.mime_map) if needs_work(mt)),
key=work_priority)
results = []
if num_workers < 2:
results.append(process_book_files(names_that_need_work, tdir, opfpath, virtualize_resources, book_render_data['link_uid'], container=container))
else:
with ThreadPoolExecutor(max_workers=num_workers) as executor:
futures = tuple(
executor.submit(process_book_files, (name,), tdir, opfpath, virtualize_resources, book_render_data['link_uid'], container=container)
for name in names_that_need_work)
for future in futures:
results.append(future.result())

results = render_manager(
names, (
tdir, opfpath, virtualize_resources, book_render_data['link_uid'], container.data_for_clone()
), container
)
ltm = book_render_data['link_to_map']
html_data = {}
virtualized_names = set()
Expand Down Expand Up @@ -899,33 +802,32 @@ def get_stored_annotations(container, bookmark_data):

def render(pathtoebook, output_dir, book_hash=None, serialize_metadata=False, extract_annotations=False, virtualize_resources=True, max_workers=1):
pathtoebook = os.path.abspath(pathtoebook)
with RenderManager(max_workers) as render_manager:
mi = None
if serialize_metadata:
from calibre.customize.ui import quick_metadata
from calibre.ebooks.metadata.meta import get_metadata
with open(pathtoebook, 'rb') as f, quick_metadata:
mi = get_metadata(f, os.path.splitext(pathtoebook)[1][1:].lower())
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log)
container, bookmark_data = process_exploded_book(
book_fmt, opfpath, input_fmt, output_dir, render_manager,
book_hash=book_hash, save_bookmark_data=extract_annotations,
book_metadata=mi, virtualize_resources=virtualize_resources
)
if serialize_metadata:
from calibre.ebooks.metadata.book.serialize import metadata_as_dict
d = metadata_as_dict(mi)
d.pop('cover_data', None)
serialize_datetimes(d), serialize_datetimes(d.get('user_metadata', {}))
with open(os.path.join(output_dir, 'calibre-book-metadata.json'), 'wb') as f:
f.write(json_dumps(d))
if extract_annotations:
annotations = None
if bookmark_data:
annotations = json_dumps(tuple(get_stored_annotations(container, bookmark_data)))
if annotations:
with open(os.path.join(output_dir, 'calibre-book-annotations.json'), 'wb') as f:
f.write(annotations)
mi = None
if serialize_metadata:
from calibre.customize.ui import quick_metadata
from calibre.ebooks.metadata.meta import get_metadata
with open(pathtoebook, 'rb') as f, quick_metadata:
mi = get_metadata(f, os.path.splitext(pathtoebook)[1][1:].lower())
book_fmt, opfpath, input_fmt = extract_book(pathtoebook, output_dir, log=default_log)
container, bookmark_data = process_exploded_book(
book_fmt, opfpath, input_fmt, output_dir, max_workers=max_workers,
book_hash=book_hash, save_bookmark_data=extract_annotations,
book_metadata=mi, virtualize_resources=virtualize_resources
)
if serialize_metadata:
from calibre.ebooks.metadata.book.serialize import metadata_as_dict
d = metadata_as_dict(mi)
d.pop('cover_data', None)
serialize_datetimes(d), serialize_datetimes(d.get('user_metadata', {}))
with open(os.path.join(output_dir, 'calibre-book-metadata.json'), 'wb') as f:
f.write(json_dumps(d))
if extract_annotations:
annotations = None
if bookmark_data:
annotations = json_dumps(tuple(get_stored_annotations(container, bookmark_data)))
if annotations:
with open(os.path.join(output_dir, 'calibre-book-annotations.json'), 'wb') as f:
f.write(annotations)


def render_for_viewer(path, out_dir, book_hash):
Expand Down

0 comments on commit 56531fe

Please sign in to comment.