From a2ca9f0521c35495fe434a2c69252c627c491346 Mon Sep 17 00:00:00 2001 From: William Chargin Date: Fri, 17 Apr 2020 14:32:06 -0700 Subject: [PATCH 1/2] text: batch HTML sanitization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: By design, we only expose an API for converting Markdown to *safe* HTML. But clients who call this method many times end up performing HTML sanitization many times, which is expensive: about an order of magnitude more expensive than Markdown conversion itself. This patch introduces a new API that still only emits safe HTML but enables clients to combine multiple input documents with only one round of sanitization. Test Plan: The `/data/plugin/text/text` route sees 40–60% speedup: on my machine, - from 0.38 ± 0.04 seconds to 0.211 ± 0.005 seconds on the “higher order tensors” text demo downsampled to 10 steps; and - from 5.3 ± 0.9 seconds to 2.1 ± 0.2 seconds on a Google-internal dataset with 32 Markdown cells per step downsampled to 100 steps. wchargin-branch: text-batch-bleach --- tensorboard/plugin_util.py | 71 ++++++++++++++++++------- tensorboard/plugin_util_test.py | 18 +++++++ tensorboard/plugins/text/text_plugin.py | 12 ++--- 3 files changed, 76 insertions(+), 25 deletions(-) diff --git a/tensorboard/plugin_util.py b/tensorboard/plugin_util.py index 46b6292619..ad8c22d7d9 100644 --- a/tensorboard/plugin_util.py +++ b/tensorboard/plugin_util.py @@ -85,25 +85,60 @@ def markdown_to_safe_html(markdown_string): Returns: A string containing safe HTML. """ - warning = "" - # Convert to utf-8 whenever we have a binary input. - if isinstance(markdown_string, six.binary_type): - markdown_string_decoded = markdown_string.decode("utf-8") - # Remove null bytes and warn if there were any, since it probably means - # we were given a bad encoding. 
- markdown_string = markdown_string_decoded.replace(u"\x00", u"") - num_null_bytes = len(markdown_string_decoded) - len(markdown_string) - if num_null_bytes: - warning = ( - "\n" - ) % num_null_bytes - - string_html = _MARKDOWN_STORE.markdown.convert(markdown_string) - string_sanitized = bleach.clean( - string_html, tags=_ALLOWED_TAGS, attributes=_ALLOWED_ATTRIBUTES + return markdowns_to_safe_html([markdown_string], lambda xs: xs[0]) + + +def markdowns_to_safe_html(markdown_strings, combine): + """Convert multiple Markdown documents to one safe HTML document. + + One could also achieve this by calling `markdown_to_safe_html` + multiple times and combining the results. Compared to that approach, + this function may be faster, because HTML sanitization (which can be + expensive) is performed only once rather than once per input. It may + also be less precise: if one of the input documents has unsafe HTML + that is sanitized away, that sanitization might affect other + documents, even if those documents are safe. + + Args: + markdown_strings: List of Markdown source strings to convert, as + Unicode strings or UTF-8--encoded bytestrings. Markdown tables + are supported. + combine: Callback function that takes a list of unsafe HTML + strings of the same shape as `markdown_strings` and combines + them into a single unsafe HTML string, which will be sanitized + and returned. + + Returns: + A string containing safe HTML. + """ + unsafe_htmls = [] + total_null_bytes = 0 + import contextlib + + for source in markdown_strings: + # Convert to utf-8 whenever we have a binary input. + if isinstance(source, six.binary_type): + source_decoded = source.decode("utf-8") + # Remove null bytes and warn if there were any, since it probably means + # we were given a bad encoding. 
+ source = source_decoded.replace(u"\x00", u"") + total_null_bytes += len(source_decoded) - len(source) + unsafe_html = _MARKDOWN_STORE.markdown.convert(source) + unsafe_htmls.append(unsafe_html) + + unsafe_combined = combine(unsafe_htmls) + sanitized_combined = bleach.clean( + unsafe_combined, tags=_ALLOWED_TAGS, attributes=_ALLOWED_ATTRIBUTES ) - return warning + string_sanitized + + warning = "" + if total_null_bytes: + warning = ( + "\n" + ) % total_null_bytes + + return warning + sanitized_combined def experiment_id(environ): diff --git a/tensorboard/plugin_util_test.py b/tensorboard/plugin_util_test.py index 777613a5df..c0fb67984e 100644 --- a/tensorboard/plugin_util_test.py +++ b/tensorboard/plugin_util_test.py @@ -140,6 +140,24 @@ def test_null_bytes_stripped_before_markdown_processing(self): ) +class MarkdownsToSafeHTMLTest(tb_test.TestCase): + # Most of the heavy lifting is tested by `MarkdownToSafeHTMLTest`. + + def test_simple(self): + inputs = ["0", "*1*", "**2**"] + combine = lambda xs: "
".join(xs) + actual = plugin_util.markdowns_to_safe_html(inputs, combine) + expected = "

0


1


2

" + self.assertEqual(actual, expected) + + def test_sanitizes_combination_result(self): + inputs = ["safe"] + combine = lambda xs: "%s" % xs[0] + actual = plugin_util.markdowns_to_safe_html(inputs, combine) + expected = "<script>alert('unsafe!')</script>
<p>safe</p>
" + self.assertEqual(actual, expected) + + class ExperimentIdTest(tb_test.TestCase): """Tests for `plugin_util.experiment_id`.""" diff --git a/tensorboard/plugins/text/text_plugin.py b/tensorboard/plugins/text/text_plugin.py index 6ff564973b..67774d3559 100644 --- a/tensorboard/plugins/text/text_plugin.py +++ b/tensorboard/plugins/text/text_plugin.py @@ -183,13 +183,11 @@ def text_array_to_html(text_arr): WARNING_TEMPLATE % len(text_arr.shape) ) text_arr = reduce_to_2d(text_arr) - - html_arr = [ - plugin_util.markdown_to_safe_html(x) for x in text_arr.reshape(-1) - ] - html_arr = np.array(html_arr).reshape(text_arr.shape) - - return warning + make_table(html_arr) + table = plugin_util.markdowns_to_safe_html( + text_arr.reshape(-1), + lambda xs: make_table(np.array(xs).reshape(text_arr.shape)), + ) + return warning + table def process_event(wall_time, step, string_ndarray): From 94f75c23d9935315e119b516adee9968b7e9a66d Mon Sep 17 00:00:00 2001 From: William Chargin Date: Fri, 17 Apr 2020 16:31:31 -0700 Subject: [PATCH 2/2] [update patch] wchargin-branch: text-batch-bleach wchargin-source: 98c30df804d74250f6507991ba93a83bbb699ce6 --- tensorboard/plugin_util.py | 1 - tensorboard/plugin_util_test.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tensorboard/plugin_util.py b/tensorboard/plugin_util.py index ad8c22d7d9..edb33f9696 100644 --- a/tensorboard/plugin_util.py +++ b/tensorboard/plugin_util.py @@ -113,7 +113,6 @@ def markdowns_to_safe_html(markdown_strings, combine): """ unsafe_htmls = [] total_null_bytes = 0 - import contextlib for source in markdown_strings: # Convert to utf-8 whenever we have a binary input. diff --git a/tensorboard/plugin_util_test.py b/tensorboard/plugin_util_test.py index c0fb67984e..c753bceb37 100644 --- a/tensorboard/plugin_util_test.py +++ b/tensorboard/plugin_util_test.py @@ -157,6 +157,13 @@ def test_sanitizes_combination_result(self): expected = "<script>alert('unsafe!')</script>
<p>safe</p>
" self.assertEqual(actual, expected) + def test_sanitization_can_have_collateral_damage(self): + inputs = ['">'] + combine = lambda xs: "".join(xs) + actual = plugin_util.markdowns_to_safe_html(inputs, combine) + expected = "
" + self.assertEqual(actual, expected) + class ExperimentIdTest(tb_test.TestCase): """Tests for `plugin_util.experiment_id`."""