Skip to content

Commit

Permalink
Speedup counting of chars in tags
Browse files Browse the repository at this point in the history
  • Loading branch information
kovidgoyal committed Sep 18, 2024
1 parent df6e586 commit 07037dd
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 43 deletions.
17 changes: 7 additions & 10 deletions src/calibre/srv/render_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,17 @@
from calibre.utils.serialize import json_dumps, json_loads, msgpack_dumps, msgpack_loads
from calibre.utils.short_uuid import uuid4
from calibre_extensions.fast_css_transform import transform_properties
from calibre_extensions.speedup import get_element_char_length
from polyglot.binary import as_base64_unicode as encode_component
from polyglot.binary import from_base64_bytes
from polyglot.binary import from_base64_unicode as decode_component
from polyglot.builtins import as_bytes, iteritems
from polyglot.urllib import quote, urlparse

try:
    from calibre_extensions.speedup import get_num_of_significant_chars
except ImportError:  # running from source without updated binary
    def get_num_of_significant_chars(elem):
        '''
        Pure-Python stand-in used when the compiled speedup module does not
        provide get_num_of_significant_chars().

        Returns len(elem.text) + len(elem.tail). An attribute that is missing
        or None contributes zero. This is only an approximation of the
        compiled function: every character is counted and no tag receives
        special treatment.
        '''
        # text/tail may be None rather than absent, and len(None) raises
        # TypeError, so normalise both cases to the empty string.
        text = getattr(elem, 'text', None) or ''
        tail = getattr(elem, 'tail', None) or ''
        return len(text) + len(tail)
RENDER_VERSION = 1

BLANK_JPEG = b'\xff\xd8\xff\xdb\x00C\x00\x03\x02\x02\x02\x02\x02\x03\x02\x02\x02\x03\x03\x03\x03\x04\x06\x04\x04\x04\x04\x04\x08\x06\x06\x05\x06\t\x08\n\n\t\x08\t\t\n\x0c\x0f\x0c\n\x0b\x0e\x0b\t\t\r\x11\r\x0e\x0f\x10\x10\x11\x10\n\x0c\x12\x13\x12\x10\x13\x0f\x10\x10\x10\xff\xc9\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00\xff\xcc\x00\x06\x00\x10\x10\x05\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xd2\xcf \xff\xd9' # noqa
Expand Down Expand Up @@ -142,17 +146,10 @@ def anchor_map(root):

def get_length(root):
    '''
    Return the sum of get_num_of_significant_chars() over every body child
    of root and over all descendants of each such body element.
    '''
    ans = 0
    for body in root.iterchildren(XHTML('body')):
        # The body element itself is counted once, then each descendant once.
        ans += get_num_of_significant_chars(body)
        for elem in body.iterdescendants():
            ans += get_num_of_significant_chars(elem)
    return ans


Expand Down
14 changes: 14 additions & 0 deletions src/calibre/srv/tests/fast_css_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@

class TestTransform(SimpleTest):

def test_counting_chars_in_elems(self):
    # Each case is (markup, expected get_length() result for the parsed tree).
    from lxml import etree

    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.srv.render_book import get_length

    cases = (
        ('<p>abc<span>def</span>x yz<svg>howdy', 1014),
        ('<p>abc<span>def</span>x yz', 9),
        ('<p>abc<span>def</span><script>x yz', 6),
        ('<p>abc<span>def</span><style>x yz', 6),
        ('<p>abc<span>def</span>x yz<img>howdy', 1014),
    )
    for markup, expected in cases:
        tree = parse(markup, force_html5_parse=True)
        # The serialized tree is the failure message, to show how the markup was parsed.
        self.assertEqual(expected, get_length(tree), etree.tostring(tree, encoding=str))

def test_number_parsing(self):
for x in '.314 -.314 0.314 0 2 +2 -1 1e2 -3.14E+2 2e-2'.split():
self.ae(parse_css_number(x), ast.literal_eval(x))
Expand Down
102 changes: 69 additions & 33 deletions src/calibre/utils/speedup.c
Original file line number Diff line number Diff line change
Expand Up @@ -489,43 +489,79 @@ set_thread_name(PyObject *self, PyObject *args) {

/* True for characters that are not counted: code point 32 (space) and everything
 * below it. The argument is parenthesised so that an expression containing a
 * lower-precedence operator (e.g. `a & b`) is compared as a whole. */
#define char_is_ignored(ch) ((ch) <= 32)

/* Plain description of a Python unicode string's character storage, in the form
 * expected by PyUnicode_READ(kind, data, index). A zero-initialised value
 * describes an empty string. */
typedef struct udata {
    void *data;      /* character buffer, as returned by PyUnicode_DATA() */
    int kind;        /* storage kind, as returned by PyUnicode_KIND() */
    Py_ssize_t len;  /* length in code points, as returned by PyUnicode_GET_LENGTH() */
} udata;

static size_t
count_chars_in(PyObject *text) {
size_t ans = 0;
if (PyUnicode_READY(text) != 0) return 0;
int kind = PyUnicode_KIND(text);
void *data = PyUnicode_DATA(text);
Py_ssize_t len = PyUnicode_GET_LENGTH(text);
ans = len;
for (Py_ssize_t i = 0; i < len; i++) {
if (char_is_ignored(PyUnicode_READ(kind, data, i))) ans--;
}
count_chars_in(udata *text) {
size_t ans = text->len;
for (Py_ssize_t i = 0; i < text->len; i++) if (char_is_ignored(PyUnicode_READ(text->kind, text->data, i))) ans--;
return ans;
}

static PyObject*
get_element_char_length(PyObject *self, PyObject *args) {
(void)(self);
const char *tag_name;
PyObject *text, *tail;
if (!PyArg_ParseTuple(args, "sOO", &tag_name, &text, &tail)) return NULL;
const char *b = strrchr(tag_name, '}');
if (b) tag_name = b + 1;
char ltagname[16];
const size_t tag_name_len = strnlen(tag_name, sizeof(ltagname)-1);
for (size_t i = 0; i < tag_name_len; i++) {
if ('A' <= tag_name[i] && tag_name[i] <= 'Z') ltagname[i] = 32 + tag_name[i];
else ltagname[i] = tag_name[i];
}
int is_ignored_tag = 0;
/*
 * Count the significant characters contributed by a single element.
 *
 * tag_name/tag_len: the element's tag as UTF-8 bytes (not necessarily
 *     NUL-terminated within tag_len), optionally prefixed by "{namespace}".
 *     tag_name may be NULL when the element has no string tag.
 * text, tail: views of the element's text and tail; an empty view counts as 0.
 *
 * The tail is always counted. The text is skipped for script, noscript, style
 * and title elements. svg and img elements add a flat 1000 on top of their
 * own text/tail. Tag matching is ASCII case-insensitive and requires the whole
 * local name to match, so e.g. <s> is not mistaken for <script> nor <i> for
 * <img>.
 */
static size_t
count_chars(const char *tag_name, Py_ssize_t tag_len, udata *text, udata *tail) {
    size_t ans = 0;
    int is_ignored_tag = 0;
    char ltagname[16];
    if (tag_name && tag_len > 0) {
        /* Drop a "{namespace}" qualifier, keeping only what follows the brace. */
        const char *b = memchr(tag_name, '}', (size_t)tag_len);
        if (b) {
            b++;
            tag_len -= b - tag_name;
            tag_name = b;
        }
        const size_t n = (size_t)tag_len;
        /* Names that are empty or too long for the buffer cannot be one of the
         * special tags, so they are simply treated as ordinary elements. */
        if (n > 0 && n < sizeof(ltagname)) {
            for (size_t i = 0; i < n; i++) {
                const char c = tag_name[i];
                ltagname[i] = ('A' <= c && c <= 'Z') ? (char)(c + 32) : c;
            }
/* Exact match: the lengths must agree and only the literal's own bytes are compared. */
#define EQ(x) (n == sizeof(#x) - 1 && memcmp(ltagname, #x, sizeof(#x) - 1) == 0)
            switch (ltagname[0]) {
                case 's':
                    if (EQ(script) || EQ(style)) is_ignored_tag = 1;
                    else if (EQ(svg)) ans += 1000;
                    break;
                case 'n':
                    if (EQ(noscript)) is_ignored_tag = 1;
                    break;
                case 't':
                    if (EQ(title)) is_ignored_tag = 1;
                    break;
                case 'i':
                    if (EQ(img)) ans += 1000;
                    break;
                default:
                    break;
            }
#undef EQ
        }
    }
    ans += count_chars_in(tail);
    if (!is_ignored_tag) ans += count_chars_in(text);
    return ans;
}

static PyObject*
get_num_of_significant_chars(PyObject *self, PyObject *elem) {
(void)(self);
const char *tag_name = NULL;
Py_ssize_t tag_len = 0;
PyObject *ptn = PyObject_GetAttrString(elem, "tag"), *text = NULL;
if (ptn && PyUnicode_Check(ptn)) tag_name = PyUnicode_AsUTF8AndSize(ptn, &tag_len);
udata xdata = {0}, tdata = {0};
if (tag_name) {
text = PyObject_GetAttrString(elem, "text");
if (text && PyUnicode_Check(text)) {
xdata.len = PyUnicode_GET_LENGTH(text); xdata.kind = PyUnicode_KIND(text); xdata.data = PyUnicode_DATA(text);
}
}
PyObject *tail = PyObject_GetAttrString(elem, "tail");
if (tail && PyUnicode_Check(tail)) {
tdata.len = PyUnicode_GET_LENGTH(tail); tdata.kind = PyUnicode_KIND(tail); tdata.data = PyUnicode_DATA(tail);
}
size_t ans;
Py_BEGIN_ALLOW_THREADS
ans = count_chars(tag_name, tag_len, &xdata, &tdata);
Py_END_ALLOW_THREADS;
Py_XDECREF(ptn); Py_XDECREF(text); Py_XDECREF(tail);
return PyLong_FromSize_t(ans);
}

Expand Down Expand Up @@ -693,8 +729,8 @@ static PyMethodDef speedup_methods[] = {
"set_thread_name(name)\n\nWrapper for pthread_setname_np"
},

{"get_element_char_length", get_element_char_length, METH_VARARGS,
"get_element_char_length(tag_name, text, tail)\n\nGet the number of chars in specified tag"
{"get_num_of_significant_chars", get_num_of_significant_chars, METH_O,
"get_num_of_significant_chars(elem)\n\nGet the number of chars in specified tag"
},

{NULL, NULL, 0, NULL}
Expand Down

0 comments on commit 07037dd

Please sign in to comment.