From 89f452f3fcd87177dd2cb5e3506546060c251a0e Mon Sep 17 00:00:00 2001
From: Veronica Berglyd Olsen <1619840+vkbo@users.noreply.github.com>
Date: Mon, 4 Nov 2024 17:28:21 +0100
Subject: [PATCH 1/2] Restructure and improve dialogue highlighting

---
 novelwriter/text/patterns.py          | 76 ++++++++++++++++-----------
 tests/test_text/test_text_patterns.py | 18 +++++--
 2 files changed, 58 insertions(+), 36 deletions(-)

diff --git a/novelwriter/text/patterns.py b/novelwriter/text/patterns.py
index 7fd916dad..1f7948e8f 100644
--- a/novelwriter/text/patterns.py
+++ b/novelwriter/text/patterns.py
@@ -114,15 +114,20 @@ def altDialogStyle(self) -> re.Pattern | None:
 
 class DialogParser:
 
-    __slots__ = ("_quotes", "_dialog", "_narrator", "_alternate", "_break", "_enabled")
+    __slots__ = (
+        "_quotes", "_dialog", "_alternate", "_enabled",
+        "_narrator", "_breakD", "_breakQ", "_mode",
+    )
 
     def __init__(self) -> None:
         self._quotes = None
         self._dialog = ""
-        self._narrator = ""
         self._alternate = ""
-        self._break = re.compile("")
         self._enabled = False
+        self._narrator = ""
+        self._breakD = None
+        self._breakQ = None
+        self._mode = ""
         return
 
     @property
@@ -131,21 +136,31 @@ def enabled(self) -> bool:
         return self._enabled
 
     def initParser(self) -> None:
-        """Init parser settings. Must be called when config changes."""
-        punct = re.escape("!?.,:;")
+        """Init parser settings. This method must also be called when
+        the config changes.
+        """
         self._quotes = REGEX_PATTERNS.dialogStyle
         self._dialog = uniqueCompact(CONFIG.dialogLine)
-        self._narrator = CONFIG.narratorBreak.strip()[:1]
         self._alternate = CONFIG.narratorDialog.strip()[:1]
-        self._break = re.compile(
-            f"({self._narrator}\\s?.*?)(\\s?(?:{self._narrator}[{punct}]?|$))", re.UNICODE
-        )
-        self._enabled = bool(self._quotes or self._dialog or self._narrator or self._alternate)
+
+        # At least one of the three modes is needed for the class to
+        # have anything to do
+        self._enabled = bool(self._quotes or self._dialog or self._alternate)
+
+        # Build narrator break RegExes
+        if narrator := CONFIG.narratorBreak.strip()[:1]:
+            punct = re.escape(".,:;!?")
+            self._breakD = re.compile(f"{narrator}.*?(?:{narrator}[{punct}]?|$)", re.UNICODE)
+            self._breakQ = re.compile(f"{narrator}.*?(?:{narrator}[{punct}]?)", re.UNICODE)
+            self._narrator = narrator
+            self._mode = f" {narrator}"
+
         return
 
     def __call__(self, text: str) -> list[tuple[int, int]]:
         """Caller wrapper for dialogue processing."""
         temp: list[int] = []
+        result: list[tuple[int, int]] = []
         if text:
             plain = True
             if self._dialog and text[0] in self._dialog:
@@ -153,44 +168,41 @@ def __call__(self, text: str) -> list[tuple[int, int]]:
                 plain = False
                 temp.append(0)
                 temp.append(len(text))
-                if self._narrator:
+                if self._breakD:
                     # Process narrator breaks in the dialogue
-                    for res in self._break.finditer(text, 1):
+                    for res in self._breakD.finditer(text, 1):
                         temp.append(res.start(0))
-                        if (two := res.group(2)) and two[0].isspace():
-                            temp.append(res.start(2))
-                        else:
-                            temp.append(res.end(0))
+                        temp.append(res.end(0))
             elif self._quotes:
-                # The line contains quoted dialogue
+                # Quoted dialogue is enabled, so we look for quoted sections
                 for res in self._quotes.finditer(text):
                     plain = False
                     temp.append(res.start(0))
                     temp.append(res.end(0))
-                    if self._narrator:
-                        for res in self._break.finditer(text, 1):
-                            temp.append(res.start(0))
-                            if (two := res.group(2)) and two[0].isspace():
-                                temp.append(res.start(2))
-                            else:
-                                temp.append(res.end(0))
+                    if self._breakQ:
+                        for sub in self._breakQ.finditer(text, res.start(0), res.end(0)):
+                            temp.append(sub.start(0))
+                            temp.append(sub.end(0))
 
             if plain and self._alternate:
+                # The main rules found no dialogue, so we check for
+                # alternating dialogue sections, if enabled
                 pos = 0
                 for num, bit in enumerate(text.split(self._alternate)):
-                    length = len(bit) + int(num > 0)
+                    length = len(bit) + (1 if num > 0 else 0)
                     if num%2:
                         temp.append(pos)
                         temp.append(pos + length)
                     pos += length
 
-        start = None
-        result = []
-        for pos in sorted(set(temp)):
-            if start is None:
-                start = pos
-            else:
-                result.append((start, pos))
+            if temp:
+                # Sort unique edges in increasing order, and add them in pairs
                 start = None
+                for pos in sorted(set(temp)):
+                    if start is None:
+                        start = pos
+                    else:
+                        result.append((start, pos))
+                        start = None
 
         return result
diff --git a/tests/test_text/test_text_patterns.py b/tests/test_text/test_text_patterns.py
index 2ace18bff..62b911b19 100644
--- a/tests/test_text/test_text_patterns.py
+++ b/tests/test_text/test_text_patterns.py
@@ -395,9 +395,9 @@ def testTextPatterns_DialogParserEnglish():
     CONFIG.narratorBreak = nwUnicode.U_EMDASH
     parser.initParser()
 
-    # Positions:   0                 18            32                        58
+    # Positions:   0                 18              34                      58
     assert parser("“Simple dialogue, — argued John, — is not always so easy.”") == [
-        (0, 18), (32, 58),
+        (0, 18), (34, 58),
     ]
 
     # Positions:   0                 18            32                      56
@@ -405,6 +405,11 @@ def testTextPatterns_DialogParserEnglish():
         (0, 18), (32, 56),
     ]
 
+    # Positions:   0                              31
+    assert parser("“Simple dialogue, —argued John”") == [
+        (0, 31),
+    ]
+
 
 @pytest.mark.core
 def testTextPatterns_DialogParserSpanish():
@@ -462,9 +467,14 @@ def testTextPatterns_DialogParserPortuguese():
         (0, 12),
     ]
 
-    # Positions:   0           12             27                    49
+    # Positions:   0           12               29                  49
     assert parser("— Tudo bem? — ele pergunta. — Você falou com ele?") == [
-        (0, 12), (27, 49),
+        (0, 12), (29, 49),
+    ]
+
+    # Positions:   0           12               29                  49
+    assert parser("— Tudo bem? — ele pergunta —. Você falou com ele?") == [
+        (0, 12), (29, 49),
     ]
 
 

From e4a2ca2d1933fdf00d0825bef7fb2474bd803684 Mon Sep 17 00:00:00 2001
From: Veronica Berglyd Olsen <1619840+vkbo@users.noreply.github.com>
Date: Mon, 4 Nov 2024 17:32:22 +0100
Subject: [PATCH 2/2] Add horizontal bar to em dash conversion in tokenizer

---
 novelwriter/formats/tokenizer.py | 8 +++++---
 novelwriter/text/patterns.py     | 3 ++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/novelwriter/formats/tokenizer.py b/novelwriter/formats/tokenizer.py
index e63fb1500..51bfaef12 100644
--- a/novelwriter/formats/tokenizer.py
+++ b/novelwriter/formats/tokenizer.py
@@ -493,9 +493,11 @@ def doPreProcessing(self) -> None:
             xRep = re.compile("|".join([re.escape(k) for k in repDict.keys()]), flags=re.DOTALL)
             self._text = xRep.sub(lambda x: repDict[x.group(0)], self._text)
 
-        # Process the character translation map
-        trDict = {nwUnicode.U_MAPOS: nwUnicode.U_RSQUO}
-        self._text = self._text.translate(str.maketrans(trDict))
+        # Process the translation map for placeholder characters
+        self._text = self._text.translate(str.maketrans({
+            nwUnicode.U_MAPOS: nwUnicode.U_RSQUO,
+            nwUnicode.U_HBAR: nwUnicode.U_EMDASH,
+        }))
 
         return
 
diff --git a/novelwriter/text/patterns.py b/novelwriter/text/patterns.py
index 1f7948e8f..55d813f7c 100644
--- a/novelwriter/text/patterns.py
+++ b/novelwriter/text/patterns.py
@@ -3,7 +3,8 @@
 ====================================
 
 File History:
-Created: 2024-06-01 [2.5ec1]
+Created: 2024-06-01 [2.5rc1] RegExPatterns
+Created: 2024-11-04 [2.6b1]  DialogParser
 
 This file is a part of novelWriter
 Copyright 2018–2024, Veronica Berglyd Olsen