From 89f452f3fcd87177dd2cb5e3506546060c251a0e Mon Sep 17 00:00:00 2001 From: Veronica Berglyd Olsen <1619840+vkbo@users.noreply.github.com> Date: Mon, 4 Nov 2024 17:28:21 +0100 Subject: [PATCH 1/2] Restructure and improve dialogue highlighting --- novelwriter/text/patterns.py | 76 ++++++++++++++++----------- tests/test_text/test_text_patterns.py | 18 +++++-- 2 files changed, 58 insertions(+), 36 deletions(-) diff --git a/novelwriter/text/patterns.py b/novelwriter/text/patterns.py index 7fd916dad..1f7948e8f 100644 --- a/novelwriter/text/patterns.py +++ b/novelwriter/text/patterns.py @@ -114,15 +114,20 @@ def altDialogStyle(self) -> re.Pattern | None: class DialogParser: - __slots__ = ("_quotes", "_dialog", "_narrator", "_alternate", "_break", "_enabled") + __slots__ = ( + "_quotes", "_dialog", "_alternate", "_enabled", + "_narrator", "_breakD", "_breakQ", "_mode", + ) def __init__(self) -> None: self._quotes = None self._dialog = "" - self._narrator = "" self._alternate = "" - self._break = re.compile("") self._enabled = False + self._narrator = "" + self._breakD = None + self._breakQ = None + self._mode = "" return @property @@ -131,21 +136,31 @@ def enabled(self) -> bool: return self._enabled def initParser(self) -> None: - """Init parser settings. Must be called when config changes.""" - punct = re.escape("!?.,:;") + """Init parser settings. This method must also be called when + the config changes. + """ self._quotes = REGEX_PATTERNS.dialogStyle self._dialog = uniqueCompact(CONFIG.dialogLine) - self._narrator = CONFIG.narratorBreak.strip()[:1] self._alternate = CONFIG.narratorDialog.strip()[:1] - self._break = re.compile( - f"({self._narrator}\\s?.*?)(\\s?(?:{self._narrator}[{punct}]?|$))", re.UNICODE - ) - self._enabled = bool(self._quotes or self._dialog or self._narrator or self._alternate) + + # One of the three modes are needed for the class to have + # anything to do + self._enabled = bool(self._quotes or self._dialog or self._alternate) + + # Build narrator break RegExes + if narrator := CONFIG.narratorBreak.strip()[:1]: + punct = re.escape(".,:;!?") + self._breakD = re.compile(f"{narrator}.*?(?:{narrator}[{punct}]?|$)", re.UNICODE) + self._breakQ = re.compile(f"{narrator}.*?(?:{narrator}[{punct}]?)", re.UNICODE) + self._narrator = narrator + self._mode = f" {narrator}" + return def __call__(self, text: str) -> list[tuple[int, int]]: """Caller wrapper for dialogue processing.""" temp: list[int] = [] + result: list[tuple[int, int]] = [] if text: plain = True if self._dialog and text[0] in self._dialog: @@ -153,44 +168,41 @@ def __call__(self, text: str) -> list[tuple[int, int]]: plain = False temp.append(0) temp.append(len(text)) - if self._narrator: + if self._breakD: # Process narrator breaks in the dialogue - for res in self._break.finditer(text, 1): + for res in self._breakD.finditer(text, 1): temp.append(res.start(0)) - if (two := res.group(2)) and two[0].isspace(): - temp.append(res.start(2)) - else: - temp.append(res.end(0)) + temp.append(res.end(0)) elif self._quotes: - # The line contains quoted dialogue + # Quoted dialogue is enabled, so we look for them for res in self._quotes.finditer(text): plain = False temp.append(res.start(0)) temp.append(res.end(0)) - if self._narrator: - for res in self._break.finditer(text, 1): - temp.append(res.start(0)) - if (two := res.group(2)) and two[0].isspace(): - temp.append(res.start(2)) - else: - temp.append(res.end(0)) + if self._breakQ: + for sub in self._breakQ.finditer(text, res.start(0), res.end(0)): + temp.append(sub.start(0)) + temp.append(sub.end(0)) if plain and self._alternate: + # The main rules found no dialogue, so we check for + # alternating dialogue sections, if enabled pos = 0 for num, bit in enumerate(text.split(self._alternate)): - length = len(bit) + int(num > 0) + length = len(bit) + (1 if num > 0 else 0) if num%2: temp.append(pos) temp.append(pos + length) pos += length - start = None - result = [] - for pos in sorted(set(temp)): - if start is None: - start = pos - else: - result.append((start, pos)) + if temp: + # Sort unique edges in increasing order, and add them in pairs start = None + for pos in sorted(set(temp)): + if start is None: + start = pos + else: + result.append((start, pos)) + start = None return result diff --git a/tests/test_text/test_text_patterns.py b/tests/test_text/test_text_patterns.py index 2ace18bff..62b911b19 100644 --- a/tests/test_text/test_text_patterns.py +++ b/tests/test_text/test_text_patterns.py @@ -395,9 +395,9 @@ def testTextPatterns_DialogParserEnglish(): CONFIG.narratorBreak = nwUnicode.U_EMDASH parser.initParser() - # Positions: 0 18 32 58 + # Positions: 0 18 34 58 assert parser("“Simple dialogue, — argued John, — is not always so easy.”") == [ - (0, 18), (32, 58), + (0, 18), (34, 58), ] # Positions: 0 18 32 56 @@ -405,6 +405,11 @@ def testTextPatterns_DialogParserEnglish(): (0, 18), (32, 56), ] + # Positions: 0 31 + assert parser("“Simple dialogue, —argued John”") == [ + (0, 31), + ] + @pytest.mark.core def testTextPatterns_DialogParserSpanish(): @@ -462,9 +467,14 @@ def testTextPatterns_DialogParserPortuguese(): (0, 12), ] - # Positions: 0 12 27 49 + # Positions: 0 12 29 49 assert parser("— Tudo bem? — ele pergunta. — Você falou com ele?") == [ - (0, 12), (27, 49), + (0, 12), (29, 49), + ] + + # Positions: 0 12 29 49 + assert parser("— Tudo bem? — ele pergunta —. Você falou com ele?") == [ + (0, 12), (29, 49), ] From e4a2ca2d1933fdf00d0825bef7fb2474bd803684 Mon Sep 17 00:00:00 2001 From: Veronica Berglyd Olsen <1619840+vkbo@users.noreply.github.com> Date: Mon, 4 Nov 2024 17:32:22 +0100 Subject: [PATCH 2/2] Add horizontal bar to em dash conversion in tokenizer --- novelwriter/formats/tokenizer.py | 8 +++++--- novelwriter/text/patterns.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/novelwriter/formats/tokenizer.py b/novelwriter/formats/tokenizer.py index e63fb1500..51bfaef12 100644 --- a/novelwriter/formats/tokenizer.py +++ b/novelwriter/formats/tokenizer.py @@ -493,9 +493,11 @@ def doPreProcessing(self) -> None: xRep = re.compile("|".join([re.escape(k) for k in repDict.keys()]), flags=re.DOTALL) self._text = xRep.sub(lambda x: repDict[x.group(0)], self._text) - # Process the character translation map - trDict = {nwUnicode.U_MAPOS: nwUnicode.U_RSQUO} - self._text = self._text.translate(str.maketrans(trDict)) + # Process the translation map for placeholder characters + self._text = self._text.translate(str.maketrans({ + nwUnicode.U_MAPOS: nwUnicode.U_RSQUO, + nwUnicode.U_HBAR: nwUnicode.U_EMDASH, + })) return diff --git a/novelwriter/text/patterns.py b/novelwriter/text/patterns.py index 1f7948e8f..55d813f7c 100644 --- a/novelwriter/text/patterns.py +++ b/novelwriter/text/patterns.py @@ -3,7 +3,8 @@ ==================================== File History: -Created: 2024-06-01 [2.5ec1] +Created: 2024-06-01 [2.5rc1] RegExPatterns +Created: 2024-11-04 [2.6b1] DialogParser This file is a part of novelWriter Copyright 2018–2024, Veronica Berglyd Olsen