From a2ca9f0521c35495fe434a2c69252c627c491346 Mon Sep 17 00:00:00 2001 From: William Chargin Date: Fri, 17 Apr 2020 14:32:06 -0700 Subject: [PATCH 1/2] text: batch HTML sanitization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: By design, we only expose an API for converting Markdown to *safe* HTML. But clients who call this method many times end up performing HTML sanitization many times, which is expensive: about an order of magnitude more expensive than Markdown conversion itself. This patch introduces a new API that still only emits safe HTML but enables clients to combine multiple input documents with only one round of sanitization. Test Plan: The `/data/plugin/text/text` route sees 40–60% speedup: on my machine, - from 0.38 ± 0.04 seconds to 0.211 ± 0.005 seconds on the “higher order tensors” text demo downsampled to 10 steps; and - from 5.3 ± 0.9 seconds to 2.1 ± 0.2 seconds on a Google-internal dataset with 32 Markdown cells per step downsampled to 100 steps. wchargin-branch: text-batch-bleach --- tensorboard/plugin_util.py | 71 ++++++++++++++++++------- tensorboard/plugin_util_test.py | 18 +++++++ tensorboard/plugins/text/text_plugin.py | 12 ++--- 3 files changed, 76 insertions(+), 25 deletions(-) diff --git a/tensorboard/plugin_util.py b/tensorboard/plugin_util.py index 46b6292619..ad8c22d7d9 100644 --- a/tensorboard/plugin_util.py +++ b/tensorboard/plugin_util.py @@ -85,25 +85,60 @@ def markdown_to_safe_html(markdown_string): Returns: A string containing safe HTML. """ - warning = "" - # Convert to utf-8 whenever we have a binary input. - if isinstance(markdown_string, six.binary_type): - markdown_string_decoded = markdown_string.decode("utf-8") - # Remove null bytes and warn if there were any, since it probably means - # we were given a bad encoding. 
- markdown_string = markdown_string_decoded.replace(u"\x00", u"") - num_null_bytes = len(markdown_string_decoded) - len(markdown_string) - if num_null_bytes: - warning = ( - "\n" - ) % num_null_bytes - - string_html = _MARKDOWN_STORE.markdown.convert(markdown_string) - string_sanitized = bleach.clean( - string_html, tags=_ALLOWED_TAGS, attributes=_ALLOWED_ATTRIBUTES + return markdowns_to_safe_html([markdown_string], lambda xs: xs[0]) + + +def markdowns_to_safe_html(markdown_strings, combine): + """Convert multiple Markdown documents to one safe HTML document. + + One could also achieve this by calling `markdown_to_safe_html` + multiple times and combining the results. Compared to that approach, + this function may be faster, because HTML sanitization (which can be + expensive) is performed only once rather than once per input. It may + also be less precise: if one of the input documents has unsafe HTML + that is sanitized away, that sanitization might affect other + documents, even if those documents are safe. + + Args: + markdown_strings: List of Markdown source strings to convert, as + Unicode strings or UTF-8--encoded bytestrings. Markdown tables + are supported. + combine: Callback function that takes a list of unsafe HTML + strings of the same shape as `markdown_strings` and combines + them into a single unsafe HTML string, which will be sanitized + and returned. + + Returns: + A string containing safe HTML. + """ + unsafe_htmls = [] + total_null_bytes = 0 + import contextlib + + for source in markdown_strings: + # Convert to utf-8 whenever we have a binary input. + if isinstance(source, six.binary_type): + source_decoded = source.decode("utf-8") + # Remove null bytes and warn if there were any, since it probably means + # we were given a bad encoding. 
+ source = source_decoded.replace(u"\x00", u"") + total_null_bytes += len(source_decoded) - len(source) + unsafe_html = _MARKDOWN_STORE.markdown.convert(source) + unsafe_htmls.append(unsafe_html) + + unsafe_combined = combine(unsafe_htmls) + sanitized_combined = bleach.clean( + unsafe_combined, tags=_ALLOWED_TAGS, attributes=_ALLOWED_ATTRIBUTES ) - return warning + string_sanitized + + warning = "" + if total_null_bytes: + warning = ( + "\n" + ) % total_null_bytes + + return warning + sanitized_combined def experiment_id(environ): diff --git a/tensorboard/plugin_util_test.py b/tensorboard/plugin_util_test.py index 777613a5df..c0fb67984e 100644 --- a/tensorboard/plugin_util_test.py +++ b/tensorboard/plugin_util_test.py @@ -140,6 +140,24 @@ def test_null_bytes_stripped_before_markdown_processing(self): ) +class MarkdownsToSafeHTMLTest(tb_test.TestCase): + # Most of the heavy lifting is tested by `MarkdownToSafeHTMLTest`. + + def test_simple(self): + inputs = ["0", "*1*", "**2**"] + combine = lambda xs: "
".join(xs) + actual = plugin_util.markdowns_to_safe_html(inputs, combine) + expected = "

0


1


2

" + self.assertEqual(actual, expected) + + def test_sanitizes_combination_result(self): + inputs = ["safe"] + combine = lambda xs: "%s" % xs[0] + actual = plugin_util.markdowns_to_safe_html(inputs, combine) + expected = "<script>alert('unsafe!')</script>
<p>safe</p>
" + self.assertEqual(actual, expected) + + class ExperimentIdTest(tb_test.TestCase): """Tests for `plugin_util.experiment_id`.""" diff --git a/tensorboard/plugins/text/text_plugin.py b/tensorboard/plugins/text/text_plugin.py index 6ff564973b..67774d3559 100644 --- a/tensorboard/plugins/text/text_plugin.py +++ b/tensorboard/plugins/text/text_plugin.py @@ -183,13 +183,11 @@ def text_array_to_html(text_arr): WARNING_TEMPLATE % len(text_arr.shape) ) text_arr = reduce_to_2d(text_arr) - - html_arr = [ - plugin_util.markdown_to_safe_html(x) for x in text_arr.reshape(-1) - ] - html_arr = np.array(html_arr).reshape(text_arr.shape) - - return warning + make_table(html_arr) + table = plugin_util.markdowns_to_safe_html( + text_arr.reshape(-1), + lambda xs: make_table(np.array(xs).reshape(text_arr.shape)), + ) + return warning + table def process_event(wall_time, step, string_ndarray): From 94f75c23d9935315e119b516adee9968b7e9a66d Mon Sep 17 00:00:00 2001 From: William Chargin Date: Fri, 17 Apr 2020 16:31:31 -0700 Subject: [PATCH 2/2] [update patch] wchargin-branch: text-batch-bleach wchargin-source: 98c30df804d74250f6507991ba93a83bbb699ce6 --- tensorboard/plugin_util.py | 1 - tensorboard/plugin_util_test.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tensorboard/plugin_util.py b/tensorboard/plugin_util.py index ad8c22d7d9..edb33f9696 100644 --- a/tensorboard/plugin_util.py +++ b/tensorboard/plugin_util.py @@ -113,7 +113,6 @@ def markdowns_to_safe_html(markdown_strings, combine): """ unsafe_htmls = [] total_null_bytes = 0 - import contextlib for source in markdown_strings: # Convert to utf-8 whenever we have a binary input. diff --git a/tensorboard/plugin_util_test.py b/tensorboard/plugin_util_test.py index c0fb67984e..c753bceb37 100644 --- a/tensorboard/plugin_util_test.py +++ b/tensorboard/plugin_util_test.py @@ -157,6 +157,13 @@ def test_sanitizes_combination_result(self): expected = "<script>alert('unsafe!')</script>
<p>safe</p>
" self.assertEqual(actual, expected) + def test_sanitization_can_have_collateral_damage(self): + inputs = ['">'] + combine = lambda xs: "".join(xs) + actual = plugin_util.markdowns_to_safe_html(inputs, combine) + expected = "
" + self.assertEqual(actual, expected) + class ExperimentIdTest(tb_test.TestCase): """Tests for `plugin_util.experiment_id`."""