Speedup counting of chars in tags

kovidgoyal · Sep 18, 2024 · 07037dd · 07037dd
1 parent df6e586
commit 07037dd
Show file tree

Hide file tree

Showing 3 changed files with 90 additions and 43 deletions.
diff --git a/src/calibre/srv/render_book.py b/src/calibre/srv/render_book.py
@@ -34,13 +34,17 @@
 from calibre.utils.serialize import json_dumps, json_loads, msgpack_dumps, msgpack_loads
 from calibre.utils.short_uuid import uuid4
 from calibre_extensions.fast_css_transform import transform_properties
-from calibre_extensions.speedup import get_element_char_length
 from polyglot.binary import as_base64_unicode as encode_component
 from polyglot.binary import from_base64_bytes
 from polyglot.binary import from_base64_unicode as decode_component
 from polyglot.builtins import as_bytes, iteritems
 from polyglot.urllib import quote, urlparse
 
+try:
+    from calibre_extensions.speedup import get_num_of_significant_chars
+except ImportError:  # running from source without updated binary
+    def get_num_of_significant_chars(elem):
+        # Rough pure-Python stand-in: it just sums the raw lengths of the
+        # element's text and tail. Either attribute may be present but set to
+        # None (lxml does this when there is no text), so coerce to '' before
+        # calling len() instead of relying on the getattr() default.
+        return len(getattr(elem, 'text', None) or '') + len(getattr(elem, 'tail', None) or '')
 RENDER_VERSION = 1
 
 BLANK_JPEG = b'\xff\xd8\xff\xdb\x00C\x00\x03\x02\x02\x02\x02\x02\x03\x02\x02\x02\x03\x03\x03\x03\x04\x06\x04\x04\x04\x04\x04\x08\x06\x06\x05\x06\t\x08\n\n\t\x08\t\t\n\x0c\x0f\x0c\n\x0b\x0e\x0b\t\t\r\x11\r\x0e\x0f\x10\x10\x11\x10\n\x0c\x12\x13\x12\x10\x13\x0f\x10\x10\x10\xff\xc9\x00\x0b\x08\x00\x01\x00\x01\x01\x01\x11\x00\xff\xcc\x00\x06\x00\x10\x10\x05\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xd2\xcf \xff\xd9'  # noqa
@@ -142,17 +146,10 @@ def anchor_map(root):
 
 def get_length(root):
+    ''' Return the total of get_num_of_significant_chars() over every XHTML
+    <body> child of root and over all descendants of each such <body>. '''
     ans = 0
-
-    def count(elem):
-        tag = getattr(elem, 'tag', count)
-        if callable(tag):
-            return get_element_char_length('', None, getattr(elem, 'tail', None))
-        return get_element_char_length(tag, elem.text, elem.tail)
-
     for body in root.iterchildren(XHTML('body')):
-        ans += count(body)
+        ans += get_num_of_significant_chars(body)
         for elem in body.iterdescendants():
-            ans += count(elem)
+            ans += get_num_of_significant_chars(elem)
     return ans
 
 

diff --git a/src/calibre/srv/tests/fast_css_transform.py b/src/calibre/srv/tests/fast_css_transform.py
@@ -10,6 +10,20 @@
 
 class TestTransform(SimpleTest):
 
+    def test_counting_chars_in_elems(self):
+        # Checks get_length() against hand-computed totals for small HTML snippets.
+        from lxml import etree
+
+        from calibre.ebooks.oeb.polish.parsing import parse
+        from calibre.srv.render_book import get_length
+        def t(html, expected):
+            # On failure the serialized tree is shown as the assertion message
+            root = parse(html, force_html5_parse=True)
+            self.assertEqual(expected, get_length(root), etree.tostring(root, encoding=str))
+        # 14 non-whitespace characters (abc, def, xyz, howdy) plus 1000 for the <svg>
+        t('<p>abc<span>def</span>x yz<svg>howdy', 1014)
+        # The space in "x yz" is not counted
+        t('<p>abc<span>def</span>x yz', 9)
+        # Text inside <script> and <style> is not counted
+        t('<p>abc<span>def</span><script>x yz', 6)
+        t('<p>abc<span>def</span><style>x yz', 6)
+        # Same total as the <svg> case: <img> also adds 1000
+        t('<p>abc<span>def</span>x yz<img>howdy', 1014)
+
     def test_number_parsing(self):
         for x in '.314 -.314 0.314 0 2 +2 -1 1e2 -3.14E+2 2e-2'.split():
             self.ae(parse_css_number(x), ast.literal_eval(x))

diff --git a/src/calibre/utils/speedup.c b/src/calibre/utils/speedup.c
@@ -489,43 +489,79 @@ set_thread_name(PyObject *self, PyObject *args) {
 
 #define char_is_ignored(ch) (ch <= 32)
 
+/* Plain-C description of a str's character buffer (buffer pointer, PyUnicode
+ * kind and length in code points) as obtained from PyUnicode_DATA/KIND/GET_LENGTH.
+ * A zero-initialised udata describes "no text" (len == 0). */
+typedef struct udata {
+    void *data; int kind; Py_ssize_t len;
+} udata;
+
+/* Return how many code points in text are not ignored by char_is_ignored(),
+ * i.e. have a value greater than 32. A udata with len == 0 yields 0. */
 static size_t
-count_chars_in(PyObject *text) {
-	size_t ans = 0;
-	if (PyUnicode_READY(text) != 0) return 0;
-	int kind = PyUnicode_KIND(text);
-	void *data = PyUnicode_DATA(text);
-	Py_ssize_t len = PyUnicode_GET_LENGTH(text);
-	ans = len;
-	for (Py_ssize_t i = 0; i < len; i++) {
-		if (char_is_ignored(PyUnicode_READ(kind, data, i))) ans--;
-	}
+count_chars_in(udata *text) {
+	size_t ans = text->len;
+	for (Py_ssize_t i = 0; i < text->len; i++) if (char_is_ignored(PyUnicode_READ(text->kind, text->data, i))) ans--;
 	return ans;
 }
 
-static PyObject*
-get_element_char_length(PyObject *self, PyObject *args) {
-	(void)(self);
-	const char *tag_name;
-	PyObject *text, *tail;
-	if (!PyArg_ParseTuple(args, "sOO", &tag_name, &text, &tail)) return NULL;
-	const char *b = strrchr(tag_name, '}');
-	if (b) tag_name = b + 1;
-	char ltagname[16];
-	const size_t tag_name_len = strnlen(tag_name, sizeof(ltagname)-1);
-	for (size_t i = 0; i < tag_name_len; i++) {
-		if ('A' <= tag_name[i] && tag_name[i] <= 'Z') ltagname[i] = 32 + tag_name[i];
-		else ltagname[i] = tag_name[i];
-	}
-	int is_ignored_tag = 0;
+/*
+ * Count the significant characters contributed by a single element.
+ *
+ * tag_name/tag_len: UTF-8 tag name, or NULL when the element has no string
+ *   tag. Everything up to and including the first '}' is dropped before
+ *   matching (presumably the "{namespace}" prefix) and matching is
+ *   case-insensitive for ASCII letters.
+ * text/tail: the element's text and tail; a zeroed udata means "none".
+ *
+ * The tail is always counted. The text is skipped for script, noscript, style
+ * and title. img and svg add a fixed 1000.
+ */
+static size_t
+count_chars(const char *tag_name, Py_ssize_t tag_len, udata *text, udata *tail) {
 	size_t ans = 0;
-#define EQ(x) memcmp(ltagname, #x, sizeof(#x) - 1) == 0
-	if (EQ(script) || EQ(noscript) || EQ(style) || EQ(title)) {
-        is_ignored_tag = 1;
-    } else if (EQ(img) || EQ(svg)) ans += 1000;
+    int is_ignored_tag = 0;
+    char ltagname[16];
+    if (tag_name) {
+        const char *b = memchr(tag_name, '}', tag_len);
+        if (b) {
+            b++;
+            tag_len -= b - tag_name;
+            tag_name = b;
+        }
+        /* An empty name cannot match anything and would leave ltagname[0] uninitialised */
+        if (tag_len > 0 && (size_t)tag_len < sizeof(ltagname)) {
+            memcpy(ltagname, tag_name, tag_len);
+            for (Py_ssize_t i = 0; i < tag_len; i++) if ('A' <= ltagname[i] && ltagname[i] <= 'Z') ltagname[i] += 32;
+/* Require an exact length match: comparing only tag_len bytes would let a
+ * prefix such as "s" or "i" match "script" or "img", and a longer name would
+ * make memcmp read past the end of the string literal. */
+#define EQ(x) (tag_len == (Py_ssize_t)(sizeof(#x) - 1) && memcmp(ltagname, #x, sizeof(#x) - 1) == 0)
+            switch(ltagname[0]) {
+                case 's':
+                    if (EQ(script) || EQ(style)) is_ignored_tag = 1;
+                    else if (EQ(svg)) ans += 1000;
+                    break;
+                case 'n':
+                    if (EQ(noscript)) is_ignored_tag = 1;
+                    break;
+                case 't':
+                    if (EQ(title)) is_ignored_tag = 1;
+                    break;
+                case 'i':
+                    if (EQ(img)) ans += 1000;
+                    break;
+            }
+        }
+    }
 #undef EQ
-	if (tail != Py_None) ans += count_chars_in(tail);
-	if (text != Py_None && !is_ignored_tag) ans += count_chars_in(text);
+	ans += count_chars_in(tail);
+	if (!is_ignored_tag) ans += count_chars_in(text);
+    return ans;
+}
+
+/* Fetch obj.name into *out. A missing attribute is not an error: *out is set
+ * to NULL and 0 is returned. Any other failure returns -1 with the exception
+ * left set. On success *out is a new reference. */
+static int
+get_optional_attr(PyObject *obj, const char *name, PyObject **out) {
+    *out = PyObject_GetAttrString(obj, name);
+    if (*out == NULL) {
+        if (!PyErr_ExceptionMatches(PyExc_AttributeError)) return -1;
+        PyErr_Clear();
+    }
+    return 0;
+}
+
+/*
+ * get_num_of_significant_chars(elem) -> int
+ *
+ * Reads elem.tag, elem.text and elem.tail and returns the count computed by
+ * count_chars(). Attributes that are missing or are not str are treated as
+ * absent. When the tag is not a str the text is not looked at, only the tail.
+ * Returns NULL with an exception set if an attribute lookup fails for a reason
+ * other than the attribute being missing, or if the tag cannot be encoded.
+ */
+static PyObject*
+get_num_of_significant_chars(PyObject *self, PyObject *elem) {
+	(void)(self);
+	const char *tag_name = NULL;
+    Py_ssize_t tag_len = 0;
+    PyObject *ptn = NULL, *text = NULL, *tail = NULL;
+    udata xdata = {0}, tdata = {0};
+    size_t ans = 0;
+    int ok = 0;
+    if (get_optional_attr(elem, "tag", &ptn) != 0) goto end;
+    if (ptn && PyUnicode_Check(ptn)) {
+        tag_name = PyUnicode_AsUTF8AndSize(ptn, &tag_len);
+        /* Do not return a value while the encoding error is still set */
+        if (!tag_name) goto end;
+    }
+    if (tag_name) {
+        if (get_optional_attr(elem, "text", &text) != 0) goto end;
+        if (text && PyUnicode_Check(text)) {
+            xdata.len = PyUnicode_GET_LENGTH(text); xdata.kind = PyUnicode_KIND(text); xdata.data = PyUnicode_DATA(text);
+        }
+    }
+    if (get_optional_attr(elem, "tail", &tail) != 0) goto end;
+    if (tail && PyUnicode_Check(tail)) {
+        tdata.len = PyUnicode_GET_LENGTH(tail); tdata.kind = PyUnicode_KIND(tail); tdata.data = PyUnicode_DATA(tail);
+    }
+    /* The counting touches no Python API; the references held above keep the buffers alive while the GIL is released */
+    Py_BEGIN_ALLOW_THREADS
+        ans = count_chars(tag_name, tag_len, &xdata, &tdata);
+    Py_END_ALLOW_THREADS;
+    ok = 1;
+end:
+    Py_XDECREF(ptn); Py_XDECREF(text); Py_XDECREF(tail);
+    if (!ok) return NULL;
 	return PyLong_FromSize_t(ans);
 }
 
@@ -693,8 +729,8 @@ static PyMethodDef speedup_methods[] = {
 		"set_thread_name(name)\n\nWrapper for pthread_setname_np"
 	},
 
-	{"get_element_char_length", get_element_char_length, METH_VARARGS,
-		"get_element_char_length(tag_name, text, tail)\n\nGet the number of chars in specified tag"
+	{"get_num_of_significant_chars", get_num_of_significant_chars, METH_O,
+		"get_num_of_significant_chars(elem)\n\nGet the number of chars in specified tag"
 	},
 
     {NULL, NULL, 0, NULL}