Enable correct whitespace handling in write_html() (#602)

Fixes #547
py-pdf · Nov 18, 2022 · 3a8d446 · 3a8d446
1 parent 3884eba
commit 3a8d446
Show file tree

Hide file tree

Showing 16 changed files with 123 additions and 5 deletions.
diff --git a/.gitignore b/.gitignore
@@ -23,6 +23,9 @@ contributors/contributors.html
 include/
 local/
 
+# vscode
+.vscode
+
 # Packages
 *.egg
 .eggs

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -25,6 +25,7 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
 - the 1000+ unit tests suite is now executed under Linux **<ins>and</ins>** Windows, with extra timing & memory usage checks ensuring we control `fpdf2` resource usage
 - New translation of the tutorial in [עברית](https://pyfpdf.github.io/fpdf2/Tutorial-he.html), thanks to @TzviGreenfeld
 - New documentation for using [PyPDF2](https://github.com/py-pdf/PyPDF2) with `fpdf2`, added by @devdev29: https://pyfpdf.github.io/fpdf2/CombineWithPyPDF2.html
+- the `write_html()` method now supports `<code></code>` blocks.
 ### Deprecated
 - `HTMLMixin` is deprecated, and not needed anymore: **the `write_html()` method is now natively available in the `FPDF` class** - thanks to @yk-jp
 ### Removed
@@ -36,6 +37,7 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
 - fixed [`insert_toc_placeholder()`](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.insert_toc_placeholder) usage with [`footer()`](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.footer) and `{{nb}}`; [#548](https://github.com/PyFPDF/fpdf2/issues/548)
 - the SVG parser now accepts `stroke-width` attribute values with an explicit unit, thanks to @gmischler; [#526](https://github.com/PyFPDF/fpdf2/issues/526)
 - the SVG parser now accepts absolute units for `width` and `height` attributes, thanks to @darioackermann; [#555](https://github.com/PyFPDF/fpdf2/issues/555)
+- the `write_html()` method now correctly handles whitespace when parsing HTML, while `<pre></pre>` blocks still preserve spaces, tabs and line breaks; [#547](https://github.com/PyFPDF/fpdf2/issues/547)
 ### Changed
 - the first parameter of `FPDF.add_font()` is now **optional**: if it is not provided, the base name of the `fname` font path is used to define the font family. Hence `pdf.add_font(fname="fonts/NotoSansArabic.ttf")` will define a font named `NotoSansArabic`.
 - the output of [`embed_file()`](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.embed_file) is now a `PDFEmbeddedFile`, not a string, but the internal file name can be retrieved through its `.basename` property

diff --git a/fpdf/html.py b/fpdf/html.py
@@ -11,10 +11,14 @@
 
 from .enums import XPos, YPos
 
+import re
 
 LOGGER = logging.getLogger(__name__)
 BULLET_WIN1252 = "\x95"  # BULLET character in Windows-1252 encoding
 DEFAULT_HEADING_SIZES = dict(h1=24, h2=18, h3=14, h4=12, h5=10, h6=8)
+# Run of whitespace at the very start of a text chunk
+LEADING_SPACE = re.compile(r"^\s+")
+# Any run of whitespace: group 1 is its first character, group 2 the rest of the run
+WHITESPACE = re.compile(r"(\s)(\s*)")
+# Whitespace character ending a text chunk
+TRAILING_SPACE = re.compile(r"\s$")
 
 COLOR_DICT = {
     "black": "#000000",
@@ -223,6 +227,9 @@ def __init__(
         self.table_line_separators = table_line_separators
         self.ul_bullet_char = ul_bullet_char
         self.style = dict(b=False, i=False, u=False)
+        self.pre_formatted = False
+        self.follows_fmt_tag = False
+        self.follows_trailing_space = False
         self.href = ""
         self.align = ""
         self.page_links = {}
@@ -263,6 +270,7 @@ def width2unit(self, length):
         return int(length)
 
     def handle_data(self, data):
+        trailing_space_flag = TRAILING_SPACE.search(data)
         if self.td is not None:  # drawing a table?
             self._insert_td(data)
         elif self.table is not None:
@@ -280,15 +288,43 @@ def handle_data(self, data):
                 align=self.align[0].upper(),
                 link=self.href,
             )
+        elif self.pre_formatted:  # for pre blocks
+            self.pdf.write(self.h, data)
+
+        elif self.follows_fmt_tag and not self.follows_trailing_space:
+            # don't trim leading whitespace if following a format tag with no trailing whitespace
+            data = WHITESPACE.sub(whitespace_repl, data)
+            if trailing_space_flag:
+                self.follows_trailing_space = True
+            if self.href:
+                self.put_link(data)
+            else:
+                if self.heading_level:
+                    self.pdf.start_section(data, self.heading_level - 1)
+                LOGGER.debug(
+                    "write '%s' h=%d",
+                    WHITESPACE.sub(whitespace_repl, data),
+                    self.h,
+                )
+                self.pdf.write(self.h, data)
+            self.follows_fmt_tag = False
+
         else:
-            data = data.replace("\n", " ")
+            data = LEADING_SPACE.sub(leading_whitespace_repl, data)
+            data = WHITESPACE.sub(whitespace_repl, data)
+            self.follows_trailing_space = trailing_space_flag
             if self.href:
                 self.put_link(data)
             else:
                 if self.heading_level:
                     self.pdf.start_section(data, self.heading_level - 1)
-                LOGGER.debug("write '%s' h=%d", data.replace("\n", "\\n"), self.h)
+                LOGGER.debug(
+                    "write '%s' h=%d",
+                    WHITESPACE.sub(whitespace_repl, data),
+                    self.h,
+                )
                 self.pdf.write(self.h, data)
+            self.follows_fmt_tag = False
 
     def _insert_td(self, data=""):
         self._only_imgs_in_td = False
@@ -472,9 +508,13 @@ def handle_starttag(self, tag, attrs):
                 self.align = attrs.get("align")
         if tag == "hr":
             self.pdf.add_page(same=True)
+        if tag == "code":
+            self.font_stack.append((self.font_face, self.font_size, self.font_color))
+            self.set_font("courier", 11)
         if tag == "pre":
             self.font_stack.append((self.font_face, self.font_size, self.font_color))
             self.set_font("courier", 11)
+            self.pre_formatted = True
         if tag == "blockquote":
             self.pdf.set_text_color(100, 0, 45)
             self.indent += 1
@@ -622,10 +662,15 @@ def handle_endtag(self, tag):
             self.set_font(face, size)
             self.set_text_color(*color)
             self.align = None
+        if tag == "code":
+            face, size, color = self.font_stack.pop()
+            self.set_font(face, size)
+            self.set_text_color(*color)
         if tag == "pre":
             face, size, color = self.font_stack.pop()
             self.set_font(face, size)
             self.set_text_color(*color)
+            self.pre_formatted = False
         if tag == "blockquote":
             self.set_text_color(*self.font_color)
             self.indent -= 1
@@ -636,6 +681,7 @@ def handle_endtag(self, tag):
             tag = "i"
         if tag in ("b", "i", "u"):
             self.set_style(tag, False)
+            self.follows_fmt_tag = True
         if tag == "a":
             self.href = ""
         if tag == "p":
@@ -686,8 +732,10 @@ def handle_endtag(self, tag):
             self.align = None
         if tag == "sup":
             self.pdf.char_vpos = "LINE"
+            self.follows_fmt_tag = True
         if tag == "sub":
             self.pdf.char_vpos = "LINE"
+            self.follows_fmt_tag = True
 
     def set_font(self, face=None, size=None):
         if face:
@@ -743,6 +791,33 @@ def error(self, message):
         raise RuntimeError(message)
 
 
+def leading_whitespace_repl(matchobj):
+    """
+    Replacement callback for `LEADING_SPACE.sub()`.
+
+    Drops every character of the matched leading whitespace, except no-break
+    spaces (U+00A0) and narrow no-break spaces (U+202F), which are returned
+    in their original order.
+    """
+    preserved = ("\u00a0", "\u202f")
+    return "".join(char for char in matchobj.group(0) if char in preserved)
+
+
+def whitespace_repl(matchobj):
+    """
+    Replacement callback for `WHITESPACE.sub()`: collapse a run of whitespace.
+
+    The first character of the run (group 1) is always kept: unchanged when it
+    is a no-break space (U+00A0) or a narrow no-break space (U+202F), and as a
+    regular space otherwise. Among the following characters (group 2), only
+    no-break and narrow no-break spaces are kept; everything else is dropped.
+    """
+    no_break_spaces = ("\u00a0", "\u202f")
+    # A no-break space opening the run must stay no-break: turning it into a
+    # regular space would make e.g. "a\u00a0b" breakable between "a" and "b".
+    first = "".join(
+        char if char in no_break_spaces else " " for char in matchobj.group(1)
+    )
+    rest = "".join(char for char in matchobj.group(2) if char in no_break_spaces)
+    return first + rest
+
+
 class HTMLMixin:
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

diff --git a/test/html/html_bold_italic_underline.pdf b/test/html/html_bold_italic_underline.pdf
diff --git a/test/html/html_custom_heading_sizes.pdf b/test/html/html_custom_heading_sizes.pdf
diff --git a/test/html/html_description.pdf b/test/html/html_description.pdf
diff --git a/test/html/html_headings_line_height.pdf b/test/html/html_headings_line_height.pdf
diff --git a/test/html/test_customize_ul.pdf b/test/html/test_customize_ul.pdf
diff --git a/test/html/test_html.py b/test/html/test_html.py
@@ -566,3 +566,39 @@ class PDF(FPDF, HTMLMixin):
         """
         )
         assert_pdf_equal(pdf, HERE / "html_description.pdf", tmp_path)
+
+
+def test_html_whitespace_handling(tmp_path):  # Issue 547
+    """
+    Check whitespace handling in write_html() (issue 547).
+
+    Renders paragraphs, <code>, <pre> and <pre><code> blocks containing runs
+    of whitespace and line breaks, plus unicode and HTML no-break spaces,
+    and compares the output with the reference PDF.
+    """
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.write_html(
+        """
+<body>
+<h1>Issue 547 Test</h1>
+<p>
+<b>Testing   </b> paragraph blocks
+        that <i>span</i> <b>multiple lines</b>.
+    Testing tabs       and    spaces<br>
+    and break tags.<br>
+</p>
+<code>Testing code blocks with tabs      and    spaces.</code><br>
+<pre>
+Testing pre blocks
+that span multiple lines
+and have tabs    and    spaces.
+</pre>
+
+<pre><code>
+Testing pre-code blocks
+that span multiple lines
+and have tabs    and    spaces.
+</code></pre>
+
+<p>Testing unicode nbsp \u00a0\u00a0\u00a0\u00a0,
+and html nbsp &nbsp;&nbsp;&nbsp;&nbsp;.<br>
+    \u00a0&nbsp;&nbsp;Testing leading nbsp
+</body>
+"""
+    )
+    assert_pdf_equal(pdf, HERE / "test_html_whitespace_handling.pdf", tmp_path)
diff --git a/test/html/test_html_whitespace_handling.pdf b/test/html/test_html_whitespace_handling.pdf
diff --git a/test/image/test_oversized.py b/test/image/test_oversized.py
@@ -9,7 +9,7 @@
 
 HERE = Path(__file__).resolve().parent
 IMAGE_PATH = HERE / "png_images/6c853ed9dacd5716bc54eb59cec30889.png"
-MAX_MEMORY_MB = 132  # memory usage depends on Python version
+MAX_MEMORY_MB = 135  # memory usage depends on Python version
 
 
 def test_oversized_images_warn(caplog):

diff --git a/test/outline/custom_HTML2FPDF.pdf b/test/outline/custom_HTML2FPDF.pdf
diff --git a/test/outline/html_toc.pdf b/test/outline/html_toc.pdf
diff --git a/test/outline/html_toc_2_pages.pdf b/test/outline/html_toc_2_pages.pdf
diff --git a/test/outline/html_toc_with_h1_as_2nd_heading.pdf b/test/outline/html_toc_with_h1_as_2nd_heading.pdf
diff --git a/test/outline/test_outline_html.py b/test/outline/test_outline_html.py
@@ -12,7 +12,8 @@ def test_html_toc(tmp_path):
     pdf = FPDF()
     pdf.add_page()
     pdf.write_html(
-        """<h1>Document title</h1>
+        """
+        <h1>Document title</h1>
         <br><br><br>
         <u>Table of content:</u>
         <br>
@@ -37,7 +38,8 @@ def test_html_toc(tmp_path):
             <section><h3>Subtitle 2.2</h3><br>
             Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
             <section>
-        <section>"""
+        <section>
+        """
     )
     assert_pdf_equal(pdf, HERE / "html_toc.pdf", tmp_path)
-Original file line number
+Diff line change
@@ Expand Up / @@ -23,6 +23,9 @@ contributors/contributors.html @@
     include/
     local/
+    # vscode
+    .vscode
     # Packages
     *.egg
     .eggs
@@ Expand Down @@