Skip to content

Commit

Permalink
Enable correct whitespace handling in write_html() (#602)
Browse files Browse the repository at this point in the history
Fixes #547
  • Loading branch information
seanpmulholland authored Nov 18, 2022
1 parent 3884eba commit 3a8d446
Show file tree
Hide file tree
Showing 16 changed files with 123 additions and 5 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ contributors/contributors.html
include/
local/

# vscode
.vscode

# Packages
*.egg
.eggs
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
- the 1000+ unit tests suite is now executed under Linux **<ins>and</ins>** Windows, with extra timing & memory usage checks ensuring we control `fpdf2` resource usage
- New translation of the tutorial in [עברית](https://pyfpdf.github.io/fpdf2/Tutorial-he.html), thanks to @TzviGreenfeld
- New documentation for using [PyPDF2](https://github.com/py-pdf/PyPDF2) with `fpdf2`, added by @devdev29: https://pyfpdf.github.io/fpdf2/CombineWithPyPDF2.html
- the `write_html()` method now supports `<code></code>` blocks.
### Deprecated
- `HTMLMixin` is deprecated, and not needed anymore: **the `write_html()` method is now natively available in the `FPDF` class** - thanks to @yk-jp
### Removed
Expand All @@ -36,6 +37,7 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
- fixed [`insert_toc_placeholder()`](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.insert_toc_placeholder) usage with [`footer()`](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.footer) and `{{nb}}`; [#548](https://github.com/PyFPDF/fpdf2/issues/548)
- the SVG parser now accepts `stroke-width` attribute values with an explicit unit, thanks to @gmischler; [#526](https://github.com/PyFPDF/fpdf2/issues/526)
- the SVG parser now accepts absolute units for `width` and `height` attributes, thanks to @darioackermann; [#555](https://github.com/PyFPDF/fpdf2/issues/555)
- the `write_html()` method now correctly handles whitespace when parsing HTML; `<pre></pre>` blocks still preserve spaces, tabs and line breaks; [#547](https://github.com/PyFPDF/fpdf2/issues/547)
### Changed
- the first parameter of `FPDF.add_font()` is now **optional**: if it is not provided, the base name of the `fname` font path is used to define the font family. Hence `pdf.add_font(fname="fonts/NotoSansArabic.ttf")` will define a font named `NotoSansArabic`.
- the output of [`embed_file()`](https://pyfpdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.embed_file) is now a `PDFEmbeddedFile`, not a string, but the internal file name can be retrieved through its `.basename` property
Expand Down
79 changes: 77 additions & 2 deletions fpdf/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,14 @@

from .enums import XPos, YPos

import re

LOGGER = logging.getLogger(__name__)
BULLET_WIN1252 = "\x95" # BULLET character in Windows-1252 encoding
DEFAULT_HEADING_SIZES = dict(h1=24, h2=18, h3=14, h4=12, h5=10, h6=8)
# Matches the run of whitespace at the very start of a text chunk.
LEADING_SPACE = re.compile(r"^\s+")
# Matches any run of whitespace: group 1 is its first character,
# group 2 is the (possibly empty) remainder of the run.
WHITESPACE = re.compile(r"(\s)(\s*)")
# Matches a whitespace character at the end of a text chunk.
TRAILING_SPACE = re.compile(r"\s$")

COLOR_DICT = {
"black": "#000000",
Expand Down Expand Up @@ -223,6 +227,9 @@ def __init__(
self.table_line_separators = table_line_separators
self.ul_bullet_char = ul_bullet_char
self.style = dict(b=False, i=False, u=False)
self.pre_formatted = False
self.follows_fmt_tag = False
self.follows_trailing_space = False
self.href = ""
self.align = ""
self.page_links = {}
Expand Down Expand Up @@ -263,6 +270,7 @@ def width2unit(self, length):
return int(length)

def handle_data(self, data):
trailing_space_flag = TRAILING_SPACE.search(data)
if self.td is not None: # drawing a table?
self._insert_td(data)
elif self.table is not None:
Expand All @@ -280,15 +288,43 @@ def handle_data(self, data):
align=self.align[0].upper(),
link=self.href,
)
elif self.pre_formatted: # for pre blocks
self.pdf.write(self.h, data)

elif self.follows_fmt_tag and not self.follows_trailing_space:
# don't trim leading whitespace if following a format tag with no trailing whitespace
data = WHITESPACE.sub(whitespace_repl, data)
if trailing_space_flag:
self.follows_trailing_space = True
if self.href:
self.put_link(data)
else:
if self.heading_level:
self.pdf.start_section(data, self.heading_level - 1)
LOGGER.debug(
"write '%s' h=%d",
WHITESPACE.sub(whitespace_repl, data),
self.h,
)
self.pdf.write(self.h, data)
self.follows_fmt_tag = False

else:
data = data.replace("\n", " ")
data = LEADING_SPACE.sub(leading_whitespace_repl, data)
data = WHITESPACE.sub(whitespace_repl, data)
self.follows_trailing_space = trailing_space_flag
if self.href:
self.put_link(data)
else:
if self.heading_level:
self.pdf.start_section(data, self.heading_level - 1)
LOGGER.debug("write '%s' h=%d", data.replace("\n", "\\n"), self.h)
LOGGER.debug(
"write '%s' h=%d",
WHITESPACE.sub(whitespace_repl, data),
self.h,
)
self.pdf.write(self.h, data)
self.follows_fmt_tag = False

def _insert_td(self, data=""):
self._only_imgs_in_td = False
Expand Down Expand Up @@ -472,9 +508,13 @@ def handle_starttag(self, tag, attrs):
self.align = attrs.get("align")
if tag == "hr":
self.pdf.add_page(same=True)
if tag == "code":
self.font_stack.append((self.font_face, self.font_size, self.font_color))
self.set_font("courier", 11)
if tag == "pre":
self.font_stack.append((self.font_face, self.font_size, self.font_color))
self.set_font("courier", 11)
self.pre_formatted = True
if tag == "blockquote":
self.pdf.set_text_color(100, 0, 45)
self.indent += 1
Expand Down Expand Up @@ -622,10 +662,15 @@ def handle_endtag(self, tag):
self.set_font(face, size)
self.set_text_color(*color)
self.align = None
if tag == "code":
face, size, color = self.font_stack.pop()
self.set_font(face, size)
self.set_text_color(*color)
if tag == "pre":
face, size, color = self.font_stack.pop()
self.set_font(face, size)
self.set_text_color(*color)
self.pre_formatted = False
if tag == "blockquote":
self.set_text_color(*self.font_color)
self.indent -= 1
Expand All @@ -636,6 +681,7 @@ def handle_endtag(self, tag):
tag = "i"
if tag in ("b", "i", "u"):
self.set_style(tag, False)
self.follows_fmt_tag = True
if tag == "a":
self.href = ""
if tag == "p":
Expand Down Expand Up @@ -686,8 +732,10 @@ def handle_endtag(self, tag):
self.align = None
if tag == "sup":
self.pdf.char_vpos = "LINE"
self.follows_fmt_tag = True
if tag == "sub":
self.pdf.char_vpos = "LINE"
self.follows_fmt_tag = True

def set_font(self, face=None, size=None):
if face:
Expand Down Expand Up @@ -743,6 +791,33 @@ def error(self, message):
raise RuntimeError(message)


def leading_whitespace_repl(matchobj):
    """Build the replacement text for a run of leading whitespace.

    Meant to be passed as the ``repl`` callable of a regex substitution.
    Ordinary whitespace in the matched text is dropped; no-break spaces
    (U+00A0) and narrow no-break spaces (U+202F) are kept, in their
    original order.
    """
    preserved = ("\u00a0", "\u202f")
    return "".join(ch for ch in matchobj.group(0) if ch in preserved)


def whitespace_repl(matchobj):
    """Build the replacement text for one run of whitespace.

    Meant to be passed as the ``repl`` callable of a regex substitution whose
    pattern captures the first whitespace character of the run in group 1 and
    the remainder of the run in group 2.

    * Group 1 is kept as a single character: a no-break space (U+00A0) or a
      narrow no-break space (U+202F) is preserved as is, any other whitespace
      (space, tab, line break...) becomes a regular space.
    * Group 2 is dropped, except for the no-break characters it contains.

    Returns the collapsed whitespace string.
    """
    no_break = ("\u00a0", "\u202f")
    # A no-break space opening the run must stay non-breaking: turning it into
    # a regular space would allow a line break between e.g. "10" and "km".
    head = "".join(ch if ch in no_break else " " for ch in matchobj.group(1))
    tail = "".join(ch for ch in matchobj.group(2) if ch in no_break)
    return head + tail


class HTMLMixin:
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down
Binary file modified test/html/html_bold_italic_underline.pdf
Binary file not shown.
Binary file modified test/html/html_custom_heading_sizes.pdf
Binary file not shown.
Binary file modified test/html/html_description.pdf
Binary file not shown.
Binary file modified test/html/html_headings_line_height.pdf
Binary file not shown.
Binary file modified test/html/test_customize_ul.pdf
Binary file not shown.
36 changes: 36 additions & 0 deletions test/html/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,3 +566,39 @@ class PDF(FPDF, HTMLMixin):
"""
)
assert_pdf_equal(pdf, HERE / "html_description.pdf", tmp_path)


def test_html_whitespace_handling(tmp_path):  # Issue 547
    """Render HTML mixing paragraphs, <code>, <pre> and no-break spaces, and compare with the reference PDF."""
    # NOTE(review): whitespace inside this literal is part of what is tested;
    # confirm its tabs/indentation against the reference PDF before changing it.
    markup = """
<body>
<h1>Issue 547 Test</h1>
<p>
<b>Testing </b> paragraph blocks
that <i>span</i> <b>multiple lines</b>.
Testing tabs and spaces<br>
and break tags.<br>
</p>
<code>Testing code blocks with tabs and spaces.</code><br>
<pre>
Testing pre blocks
that span multiple lines
and have tabs and spaces.
</pre>
<pre><code>
Testing pre-code blocks
that span multiple lines
and have tabs and spaces.
</code></pre>
<p>Testing unicode nbsp \u00a0\u00a0\u00a0\u00a0,
and html nbsp &nbsp;&nbsp;&nbsp;&nbsp;.<br>
\u00a0&nbsp;&nbsp;Testing leading nbsp
</body>
"""
    document = FPDF()
    document.add_page()
    document.write_html(markup)
    assert_pdf_equal(document, HERE / "test_html_whitespace_handling.pdf", tmp_path)
Binary file added test/html/test_html_whitespace_handling.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion test/image/test_oversized.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

HERE = Path(__file__).resolve().parent
IMAGE_PATH = HERE / "png_images/6c853ed9dacd5716bc54eb59cec30889.png"
MAX_MEMORY_MB = 132 # memory usage depends on Python version
MAX_MEMORY_MB = 135 # memory usage depends on Python version


def test_oversized_images_warn(caplog):
Expand Down
Binary file modified test/outline/custom_HTML2FPDF.pdf
Binary file not shown.
Binary file modified test/outline/html_toc.pdf
Binary file not shown.
Binary file modified test/outline/html_toc_2_pages.pdf
Binary file not shown.
Binary file modified test/outline/html_toc_with_h1_as_2nd_heading.pdf
Binary file not shown.
6 changes: 4 additions & 2 deletions test/outline/test_outline_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ def test_html_toc(tmp_path):
pdf = FPDF()
pdf.add_page()
pdf.write_html(
"""<h1>Document title</h1>
"""
<h1>Document title</h1>
<br><br><br>
<u>Table of content:</u>
<br>
Expand All @@ -37,7 +38,8 @@ def test_html_toc(tmp_path):
<section><h3>Subtitle 2.2</h3><br>
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
<section>
<section>"""
<section>
"""
)
assert_pdf_equal(pdf, HERE / "html_toc.pdf", tmp_path)

Expand Down

0 comments on commit 3a8d446

Please sign in to comment.