From cb8933b493c52b35685b817f053456d1a80d8770 Mon Sep 17 00:00:00 2001 From: Veronica Berglyd Olsen <1619840+vkbo@users.noreply.github.com> Date: Sun, 22 Sep 2024 16:23:35 +0200 Subject: [PATCH 1/8] Use Python regex for spell checking --- novelwriter/constants.py | 6 +++++ novelwriter/gui/dochighlight.py | 44 +++++++++++++------------------ novelwriter/gui/editordocument.py | 4 +-- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/novelwriter/constants.py b/novelwriter/constants.py index 5186a5d15..e16bfa030 100644 --- a/novelwriter/constants.py +++ b/novelwriter/constants.py @@ -23,6 +23,8 @@ """ from __future__ import annotations +from re import UNICODE, compile + from PyQt5.QtCore import QT_TRANSLATE_NOOP, QCoreApplication from novelwriter.enum import ( @@ -66,6 +68,10 @@ class nwRegEx: FMT_SC = r"(?i)(? None: self.setCurrentBlockUserData(data) if self._spellCheck: - for xPos, xLen in data.spellCheck(text, xOff): - for x in range(xPos, xPos+xLen): + for xPos, xEnd in data.spellCheck(text, xOff): + for x in range(xPos, xEnd): cFmt = self.format(x) cFmt.merge(self._spellErr) self.setFormat(x, 1, cFmt) @@ -492,22 +486,22 @@ def spellCheck(self, text: str, offset: int) -> list[tuple[int, int]]: """ if "[" in text: # Strip shortcodes - for rX in [SPELLSC, SPELLSV]: - rxItt = rX.globalMatch(text, offset) - while rxItt.hasNext(): - rxMatch = rxItt.next() - xPos = rxMatch.capturedStart(0) - xLen = rxMatch.capturedLength(0) - xEnd = rxMatch.capturedEnd(0) - text = text[:xPos] + " "*xLen + text[xEnd:] + for rX in [nwRegEx.RX_FMT_SC, nwRegEx.RX_FMT_SV]: + for match in re.finditer(rX, text[offset:]): + iS = match.start(0) + offset + iE = match.end(0) + offset + if iS >= 0 and iE >= 0: + text = text[:iS] + " "*(iE - iS) + text[iE:] self._spellErrors = [] - rxSpell = SPELLRX.globalMatch(text.replace("_", " "), offset) - while rxSpell.hasNext(): - rxMatch = rxSpell.next() - if not SHARED.spelling.checkWord(rxMatch.captured(0)): - if not rxMatch.captured(0).isnumeric() and not rxMatch.captured(0).isupper(): - self._spellErrors.append( - (rxMatch.capturedStart(0), rxMatch.capturedLength(0)) - ) + checker = SHARED.spelling + for match in re.finditer(nwRegEx.RX_WORDS, text[offset:].replace("_", " ")): + if ( + (word := match.group(0)) + and (iS := match.start(0)) >= 0 + and (iE := match.end(0)) >= 0 + and not (word.isnumeric() or word.isupper() or checker.checkWord(word)) + ): + self._spellErrors.append((iS + offset, iE + offset)) + return self._spellErrors diff --git a/novelwriter/gui/editordocument.py b/novelwriter/gui/editordocument.py index e071658c7..6e46d12dc 100644 --- a/novelwriter/gui/editordocument.py +++ b/novelwriter/gui/editordocument.py @@ -107,8 +107,8 @@ def spellErrorAtPos(self, pos: int) -> tuple[str, int, int, list[str]]: text = block.text() check = pos - block.position() if check >= 0: - for cPos, cLen in data.spellErrors: - cEnd = cPos + cLen + for cPos, cEnd in data.spellErrors: + cLen = cEnd - cPos if cPos <= check <= cEnd: word = text[cPos:cEnd] return word, cPos, cLen, SHARED.spelling.suggestWords(word) From d3799bc30ac5fe68c7af2cc8ab09e4165dc1c1bf Mon Sep 17 00:00:00 2001 From: Veronica Berglyd Olsen <1619840+vkbo@users.noreply.github.com> Date: Sun, 22 Sep 2024 16:37:45 +0200 Subject: [PATCH 2/8] Update plain regex patterns to std lib re --- novelwriter/constants.py | 2 +- novelwriter/gui/dochighlight.py | 56 ++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/novelwriter/constants.py b/novelwriter/constants.py index e16bfa030..1905c856e 100644 --- a/novelwriter/constants.py +++ b/novelwriter/constants.py @@ -68,7 +68,7 @@ class nwRegEx: FMT_SC = r"(?i)(? None: self._spellErr = QTextCharFormat() self._hStyles: dict[str, QTextCharFormat] = {} - self._minRules: list[tuple[QRegularExpression, dict[int, QTextCharFormat]]] = [] - self._txtRules: list[tuple[QRegularExpression, dict[int, QTextCharFormat]]] = [] - self._cmnRules: list[tuple[QRegularExpression, dict[int, QTextCharFormat]]] = [] + self._minRules: list[tuple[re.Pattern, dict[int, QTextCharFormat]]] = [] + self._txtRules: list[tuple[re.Pattern, dict[int, QTextCharFormat]]] = [] + self._cmnRules: list[tuple[re.Pattern, dict[int, QTextCharFormat]]] = [] self.initHighlighter() @@ -129,8 +129,7 @@ def initHighlighter(self) -> None: # Multiple or Trailing Spaces if CONFIG.showMultiSpaces: - rxRule = QRegularExpression(r"[ ]{2,}|[ ]*$") - rxRule.setPatternOptions(QRegExUnicode) + rxRule = re.compile(r"[ ]{2,}|[ ]*$", re.UNICODE) hlRule = { 0: self._hStyles["mspaces"], } @@ -139,8 +138,7 @@ def initHighlighter(self) -> None: self._cmnRules.append((rxRule, hlRule)) # Non-Breaking Spaces - rxRule = QRegularExpression(f"[{nwUnicode.U_NBSP}{nwUnicode.U_THNBSP}]+") - rxRule.setPatternOptions(QRegExUnicode) + rxRule = re.compile(f"[{nwUnicode.U_NBSP}{nwUnicode.U_THNBSP}]+", re.UNICODE) hlRule = { 0: self._hStyles["nobreak"], } @@ -231,8 +229,7 @@ def initHighlighter(self) -> None: self._cmnRules.append((rxRule, hlRule)) # Alignment Tags - rxRule = QRegularExpression(r"(^>{1,2}|<{1,2}$)") - rxRule.setPatternOptions(QRegExUnicode) + rxRule = re.compile(r"(^>{1,2}|<{1,2}$)", re.UNICODE) hlRule = { 1: self._hStyles["markup"], } @@ -240,8 +237,7 @@ def initHighlighter(self) -> None: self._txtRules.append((rxRule, hlRule)) # Auto-Replace Tags - rxRule = QRegularExpression(r"<(\S+?)>") - rxRule.setPatternOptions(QRegExUnicode) + rxRule = re.compile(r"<(\S+?)>", re.UNICODE) hlRule = { 0: self._hStyles["replace"], } @@ -403,17 +399,29 @@ def highlightBlock(self, text: str) -> None: if hRules: for rX, hRule in hRules: - rxItt = rX.globalMatch(text, xOff) - while rxItt.hasNext(): - rxMatch = rxItt.next() - for xM, hFmt in hRule.items(): - xPos = rxMatch.capturedStart(xM) - xEnd = rxMatch.capturedEnd(xM) - for x in range(xPos, xEnd): - cFmt = self.format(x) - if cFmt.fontStyleName() != "markup": - cFmt.merge(hFmt) - self.setFormat(x, 1, cFmt) + if isinstance(rX, QRegularExpression): + rxItt = rX.globalMatch(text, xOff) + while rxItt.hasNext(): + rxMatch = rxItt.next() + for xM, hFmt in hRule.items(): + xPos = rxMatch.capturedStart(xM) + xEnd = rxMatch.capturedEnd(xM) + for x in range(xPos, xEnd): + cFmt = self.format(x) + if cFmt.fontStyleName() != "markup": + cFmt.merge(hFmt) + self.setFormat(x, 1, cFmt) + else: + for match in re.finditer(rX, text[xOff:]): + for xM, hFmt in hRule.items(): + # print(f"'{match.group(xM)}'", match.start(xM), match.end(xM)) + xPos = match.start(xM) + xOff + xEnd = match.end(xM) + xOff + for x in range(xPos, xEnd): + cFmt = self.format(x) + if cFmt.fontStyleName() != "markup": + cFmt.merge(hFmt) + self.setFormat(x, 1, cFmt) data = self.currentBlockUserData() if not isinstance(data, TextBlockData): @@ -498,10 +506,8 @@ def spellCheck(self, text: str, offset: int) -> list[tuple[int, int]]: for match in re.finditer(nwRegEx.RX_WORDS, text[offset:].replace("_", " ")): if ( (word := match.group(0)) - and (iS := match.start(0)) >= 0 - and (iE := match.end(0)) >= 0 and not (word.isnumeric() or word.isupper() or checker.checkWord(word)) ): - self._spellErrors.append((iS + offset, iE + offset)) + self._spellErrors.append((match.start(0) + offset, match.end(0) + offset)) return self._spellErrors From c3cb12e5e2021252dcf108e6d99b2720e0f79e69 Mon Sep 17 00:00:00 2001 From: Veronica Berglyd Olsen <1619840+vkbo@users.noreply.github.com> Date: Sun, 22 Sep 2024 16:56:21 +0200 Subject: [PATCH 3/8] Use stdlib re for Markdown matching --- novelwriter/core/tokenizer.py | 6 ++---- novelwriter/gui/dochighlight.py | 1 - novelwriter/text/patterns.py | 20 ++++++++------------ tests/test_text/test_text_patterns.py | 23 ++++++++++++++++------- 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/novelwriter/core/tokenizer.py b/novelwriter/core/tokenizer.py index b977d9621..7bcf594a5 100644 --- a/novelwriter/core/tokenizer.py +++ b/novelwriter/core/tokenizer.py @@ -1109,11 +1109,9 @@ def _extractFormats( # Match Markdown for regEx, fmts in self._rxMarkdown: - rxItt = regEx.globalMatch(text, 0) - while rxItt.hasNext(): - rxMatch = rxItt.next() + for match in re.finditer(regEx, text): temp.extend( - (rxMatch.capturedStart(n), rxMatch.capturedLength(n), fmt, "") + (match.start(n), len(match.group(n)), fmt, "") for n, fmt in enumerate(fmts) if fmt > 0 ) diff --git a/novelwriter/gui/dochighlight.py b/novelwriter/gui/dochighlight.py index cb76e76b0..5cf14e9ae 100644 --- a/novelwriter/gui/dochighlight.py +++ b/novelwriter/gui/dochighlight.py @@ -41,7 +41,6 @@ from novelwriter.core.index import processComment from novelwriter.enum import nwComment from novelwriter.text.patterns import REGEX_PATTERNS -from novelwriter.types import QRegExUnicode logger = logging.getLogger(__name__) diff --git a/novelwriter/text/patterns.py b/novelwriter/text/patterns.py index 4d60222e0..06c76b2a0 100644 --- a/novelwriter/text/patterns.py +++ b/novelwriter/text/patterns.py @@ -23,6 +23,8 @@ """ from __future__ import annotations +import re + from PyQt5.QtCore import QRegularExpression from novelwriter import CONFIG @@ -33,25 +35,19 @@ class RegExPatterns: @property - def markdownItalic(self) -> QRegularExpression: + def markdownItalic(self) -> re.Pattern: """Markdown italic style.""" - rxRule = QRegularExpression(nwRegEx.FMT_EI) - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + return re.compile(nwRegEx.FMT_EI, re.UNICODE) @property - def markdownBold(self) -> QRegularExpression: + def markdownBold(self) -> re.Pattern: """Markdown bold style.""" - rxRule = QRegularExpression(nwRegEx.FMT_EB) - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + return re.compile(nwRegEx.FMT_EB, re.UNICODE) @property - def markdownStrike(self) -> QRegularExpression: + def markdownStrike(self) -> re.Pattern: """Markdown strikethrough style.""" - rxRule = QRegularExpression(nwRegEx.FMT_ST) - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + return re.compile(nwRegEx.FMT_ST, re.UNICODE) @property def shortcodePlain(self) -> QRegularExpression: diff --git a/tests/test_text/test_text_patterns.py b/tests/test_text/test_text_patterns.py index 1b4623fd8..b421db52c 100644 --- a/tests/test_text/test_text_patterns.py +++ b/tests/test_text/test_text_patterns.py @@ -20,6 +20,8 @@ """ from __future__ import annotations +import re + import pytest from PyQt5.QtCore import QRegularExpression @@ -32,13 +34,20 @@ def allMatches(regEx: QRegularExpression, text: str) -> list[list[str]]: """Get all matches for a regex.""" result = [] - itt = regEx.globalMatch(text, 0) - while itt.hasNext(): - match = itt.next() - result.append([ - (match.captured(n), match.capturedStart(n), match.capturedEnd(n)) - for n in range(match.lastCapturedIndex() + 1) - ]) + if isinstance(regEx, QRegularExpression): + itt = regEx.globalMatch(text, 0) + while itt.hasNext(): + match = itt.next() + result.append([ + (match.captured(n), match.capturedStart(n), match.capturedEnd(n)) + for n in range(match.lastCapturedIndex() + 1) + ]) + else: + for match in re.finditer(regEx, text): + result.append([ + (match.group(n), match.start(n), match.end(n)) + for n in range((match.lastindex or -1) + 1) + ]) return result From 9c63f621a8a5c8aa1fc0a58cfe432f7306543a24 Mon Sep 17 00:00:00 2001 From: Veronica Berglyd Olsen <1619840+vkbo@users.noreply.github.com> Date: Sun, 22 Sep 2024 17:06:57 +0200 Subject: [PATCH 4/8] Use stdlib re for Shortcode matching --- novelwriter/core/tokenizer.py | 20 +++++++------------- novelwriter/text/patterns.py | 25 ++++++++++++++----------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/novelwriter/core/tokenizer.py b/novelwriter/core/tokenizer.py index 7bcf594a5..ef2070741 100644 --- a/novelwriter/core/tokenizer.py +++ b/novelwriter/core/tokenizer.py @@ -1116,27 +1116,21 @@ def _extractFormats( ) # Match Shortcodes - rxItt = self._rxShortCodes.globalMatch(text, 0) - while rxItt.hasNext(): - rxMatch = rxItt.next() + for match in re.finditer(REGEX_PATTERNS.shortcodePlain, text): temp.append(( - rxMatch.capturedStart(1), - rxMatch.capturedLength(1), - self._shortCodeFmt.get(rxMatch.captured(1).lower(), 0), + match.start(1), len(match.group(1)), + self._shortCodeFmt.get(match.group(1).lower(), 0), "", )) # Match Shortcode w/Values - rxItt = self._rxShortCodeVals.globalMatch(text, 0) tHandle = self._handle or "" - while rxItt.hasNext(): - rxMatch = rxItt.next() - kind = self._shortCodeVals.get(rxMatch.captured(1).lower(), 0) + for match in re.finditer(REGEX_PATTERNS.shortcodeValue, text): + kind = self._shortCodeVals.get(match.group(1).lower(), 0) temp.append(( - rxMatch.capturedStart(0), - rxMatch.capturedLength(0), + match.start(0), len(match.group(0)), self.FMT_STRIP if kind == skip else kind, - f"{tHandle}:{rxMatch.captured(2)}", + f"{tHandle}:{match.group(2)}", )) # Match Dialogue diff --git a/novelwriter/text/patterns.py b/novelwriter/text/patterns.py index 06c76b2a0..2e74951b9 100644 --- a/novelwriter/text/patterns.py +++ b/novelwriter/text/patterns.py @@ -34,34 +34,37 @@ class RegExPatterns: + # Static RegExes + _rxItalic = re.compile(nwRegEx.FMT_EI, re.UNICODE) + _rxBold = re.compile(nwRegEx.FMT_EB, re.UNICODE) + _rxStrike = re.compile(nwRegEx.FMT_ST, re.UNICODE) + _rxSCPlain = re.compile(nwRegEx.FMT_SC, re.UNICODE) + _rxSCValue = re.compile(nwRegEx.FMT_SV, re.UNICODE) + @property def markdownItalic(self) -> re.Pattern: """Markdown italic style.""" - return re.compile(nwRegEx.FMT_EI, re.UNICODE) + return self._rxItalic @property def markdownBold(self) -> re.Pattern: """Markdown bold style.""" - return re.compile(nwRegEx.FMT_EB, re.UNICODE) + return self._rxBold @property def markdownStrike(self) -> re.Pattern: """Markdown strikethrough style.""" - return re.compile(nwRegEx.FMT_ST, re.UNICODE) + return self._rxStrike @property - def shortcodePlain(self) -> QRegularExpression: + def shortcodePlain(self) -> re.Pattern: """Plain shortcode style.""" - rxRule = QRegularExpression(nwRegEx.FMT_SC) - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + return self._rxSCPlain @property - def shortcodeValue(self) -> QRegularExpression: + def shortcodeValue(self) -> re.Pattern: """Plain shortcode style.""" - rxRule = QRegularExpression(nwRegEx.FMT_SV) - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + return self._rxSCValue @property def dialogStyle(self) -> QRegularExpression: From c08033d153be38cd27251f0bc9fabca0c10ebfe0 Mon Sep 17 00:00:00 2001 From: Veronica Berglyd Olsen <1619840+vkbo@users.noreply.github.com> Date: Sun, 22 Sep 2024 17:34:04 +0200 Subject: [PATCH 5/8] Use stdlib re for dialogue matching --- novelwriter/core/tokenizer.py | 12 ++++----- novelwriter/gui/dochighlight.py | 34 ++++++++------------------ novelwriter/text/patterns.py | 35 +++++++++------------------ tests/test_text/test_text_patterns.py | 23 +++++------------- 4 files changed, 33 insertions(+), 71 deletions(-) diff --git a/novelwriter/core/tokenizer.py b/novelwriter/core/tokenizer.py index ef2070741..c98d25e06 100644 --- a/novelwriter/core/tokenizer.py +++ b/novelwriter/core/tokenizer.py @@ -33,7 +33,7 @@ from pathlib import Path from time import time -from PyQt5.QtCore import QCoreApplication, QRegularExpression +from PyQt5.QtCore import QCoreApplication from PyQt5.QtGui import QFont from novelwriter import CONFIG @@ -234,7 +234,7 @@ def __init__(self, project: NWProject) -> None: nwShortcode.FOOTNOTE_B: self.FMT_FNOTE, } - self._rxDialogue: list[tuple[QRegularExpression, int, int]] = [] + self._rxDialogue: list[tuple[re.Pattern, int, int]] = [] return @@ -1136,11 +1136,9 @@ def _extractFormats( # Match Dialogue if self._rxDialogue and hDialog: for regEx, fmtB, fmtE in self._rxDialogue: - rxItt = regEx.globalMatch(text, 0) - while rxItt.hasNext(): - rxMatch = rxItt.next() - temp.append((rxMatch.capturedStart(0), 0, fmtB, "")) - temp.append((rxMatch.capturedEnd(0), 0, fmtE, "")) + for match in re.finditer(regEx, text): + temp.append((match.start(0), 0, fmtB, "")) + temp.append((match.end(0), 0, fmtE, "")) # Post-process text and format result = text diff --git a/novelwriter/gui/dochighlight.py b/novelwriter/gui/dochighlight.py index 5cf14e9ae..6eb5a5cd6 100644 --- a/novelwriter/gui/dochighlight.py +++ b/novelwriter/gui/dochighlight.py @@ -29,7 +29,7 @@ from time import time -from PyQt5.QtCore import QRegularExpression, Qt +from PyQt5.QtCore import Qt from PyQt5.QtGui import ( QBrush, QColor, QFont, QSyntaxHighlighter, QTextBlockUserData, QTextCharFormat, QTextDocument @@ -398,29 +398,15 @@ def highlightBlock(self, text: str) -> None: if hRules: for rX, hRule in hRules: - if isinstance(rX, QRegularExpression): - rxItt = rX.globalMatch(text, xOff) - while rxItt.hasNext(): - rxMatch = rxItt.next() - for xM, hFmt in hRule.items(): - xPos = rxMatch.capturedStart(xM) - xEnd = rxMatch.capturedEnd(xM) - for x in range(xPos, xEnd): - cFmt = self.format(x) - if cFmt.fontStyleName() != "markup": - cFmt.merge(hFmt) - self.setFormat(x, 1, cFmt) - else: - for match in re.finditer(rX, text[xOff:]): - for xM, hFmt in hRule.items(): - # print(f"'{match.group(xM)}'", match.start(xM), match.end(xM)) - xPos = match.start(xM) + xOff - xEnd = match.end(xM) + xOff - for x in range(xPos, xEnd): - cFmt = self.format(x) - if cFmt.fontStyleName() != "markup": - cFmt.merge(hFmt) - self.setFormat(x, 1, cFmt) + for match in re.finditer(rX, text[xOff:]): + for xM, hFmt in hRule.items(): + xPos = match.start(xM) + xOff + xEnd = match.end(xM) + xOff + for x in range(xPos, xEnd): + cFmt = self.format(x) + if cFmt.fontStyleName() != "markup": + cFmt.merge(hFmt) + self.setFormat(x, 1, cFmt) data = self.currentBlockUserData() if not isinstance(data, TextBlockData): diff --git a/novelwriter/text/patterns.py b/novelwriter/text/patterns.py index 2e74951b9..59a2eaf12 100644 --- a/novelwriter/text/patterns.py +++ b/novelwriter/text/patterns.py @@ -25,11 +25,8 @@ import re -from PyQt5.QtCore import QRegularExpression - from novelwriter import CONFIG from novelwriter.constants import nwRegEx -from novelwriter.types import QRegExUnicode class RegExPatterns: @@ -67,7 +64,7 @@ def shortcodeValue(self) -> re.Pattern: return self._rxSCValue @property - def dialogStyle(self) -> QRegularExpression: + def dialogStyle(self) -> re.Pattern: """Dialogue detection rule based on user settings.""" symO = "" symC = "" @@ -79,34 +76,26 @@ def dialogStyle(self) -> QRegularExpression: symC += CONFIG.fmtDQuoteClose rxEnd = "|$" if CONFIG.allowOpenDial else "" - rxRule = QRegularExpression(f"\\B[{symO}].*?(?:[{symC}]\\B{rxEnd})") - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + return re.compile(f"\\B[{symO}].*?(?:[{symC}]\\B{rxEnd})", re.UNICODE) @property - def dialogLine(self) -> QRegularExpression: + def dialogLine(self) -> re.Pattern: """Dialogue line rule based on user settings.""" - sym = QRegularExpression.escape(CONFIG.dialogLine) - rxRule = QRegularExpression(f"^{sym}.*?$") - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + sym = re.escape(CONFIG.dialogLine) + return re.compile(f"^{sym}.*?$", re.UNICODE) @property - def narratorBreak(self) -> QRegularExpression: + def narratorBreak(self) -> re.Pattern: """Dialogue narrator break rule based on user settings.""" - sym = QRegularExpression.escape(CONFIG.narratorBreak) - rxRule = QRegularExpression(f"\\B{sym}\\S.*?\\S{sym}\\B") - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + sym = re.escape(CONFIG.narratorBreak) + return re.compile(f"\\B{sym}\\S.*?\\S{sym}\\B", re.UNICODE) @property - def altDialogStyle(self) -> QRegularExpression: + def altDialogStyle(self) -> re.Pattern: """Dialogue alternative rule based on user settings.""" - symO = QRegularExpression.escape(CONFIG.altDialogOpen) - symC = QRegularExpression.escape(CONFIG.altDialogClose) - rxRule = QRegularExpression(f"\\B{symO}.*?{symC}\\B") - rxRule.setPatternOptions(QRegExUnicode) - return rxRule + symO = re.escape(CONFIG.altDialogOpen) + symC = re.escape(CONFIG.altDialogClose) + return re.compile(f"\\B{symO}.*?{symC}\\B", re.UNICODE) REGEX_PATTERNS = RegExPatterns() diff --git a/tests/test_text/test_text_patterns.py b/tests/test_text/test_text_patterns.py index b421db52c..48f1e4c8f 100644 --- a/tests/test_text/test_text_patterns.py +++ b/tests/test_text/test_text_patterns.py @@ -24,30 +24,19 @@ import pytest -from PyQt5.QtCore import QRegularExpression - from novelwriter import CONFIG from novelwriter.constants import nwUnicode from novelwriter.text.patterns import REGEX_PATTERNS -def allMatches(regEx: QRegularExpression, text: str) -> list[list[str]]: +def allMatches(regEx: re.Pattern, text: str) -> list[list[str]]: """Get all matches for a regex.""" result = [] - if isinstance(regEx, QRegularExpression): - itt = regEx.globalMatch(text, 0) - while itt.hasNext(): - match = itt.next() - result.append([ - (match.captured(n), match.capturedStart(n), match.capturedEnd(n)) - for n in range(match.lastCapturedIndex() + 1) - ]) - else: - for match in re.finditer(regEx, text): - result.append([ - (match.group(n), match.start(n), match.end(n)) - for n in range((match.lastindex or -1) + 1) - ]) + for match in re.finditer(regEx, text): + result.append([ + (match.group(n), match.start(n), match.end(n)) + for n in range((match.lastindex or 0) + 1) + ]) return result From 37c469dbc757cbee506559f82895b67e662b32dd Mon Sep 17 00:00:00 2001 From: Veronica Berglyd Olsen <1619840+vkbo@users.noreply.github.com> Date: Sun, 22 Sep 2024 17:37:12 +0200 Subject: [PATCH 6/8] Move word split to regex pattern class --- novelwriter/constants.py | 7 +--- novelwriter/gui/dochighlight.py | 10 ++++-- novelwriter/text/patterns.py | 6 ++++ tests/test_text/test_text_patterns.py | 46 +++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 9 deletions(-) diff --git a/novelwriter/constants.py b/novelwriter/constants.py index 1905c856e..bf4f58c60 100644 --- a/novelwriter/constants.py +++ b/novelwriter/constants.py @@ -23,8 +23,6 @@ """ from __future__ import annotations -from re import UNICODE, compile - from PyQt5.QtCore import QT_TRANSLATE_NOOP, QCoreApplication from novelwriter.enum import ( @@ -62,16 +60,13 @@ class nwConst: class nwRegEx: + WORDS = r"\b[^\s\-\+\/–—\[\]:]+\b" FMT_EI = r"(? list[tuple[int, int]]: """ if "[" in text: # Strip shortcodes - for rX in [nwRegEx.RX_FMT_SC, nwRegEx.RX_FMT_SV]: + for rX in [RX_FMT_SC, RX_FMT_SV]: for match in re.finditer(rX, text[offset:]): iS = match.start(0) + offset iE = match.end(0) + offset @@ -488,7 +492,7 @@ def spellCheck(self, text: str, offset: int) -> list[tuple[int, int]]: self._spellErrors = [] checker = SHARED.spelling - for match in re.finditer(nwRegEx.RX_WORDS, text[offset:].replace("_", " ")): + for match in re.finditer(RX_WORDS, text[offset:].replace("_", " ")): if ( (word := match.group(0)) and not (word.isnumeric() or word.isupper() or checker.checkWord(word)) diff --git a/novelwriter/text/patterns.py b/novelwriter/text/patterns.py index 59a2eaf12..c3a7cf54f 100644 --- a/novelwriter/text/patterns.py +++ b/novelwriter/text/patterns.py @@ -32,12 +32,18 @@ class RegExPatterns: # Static RegExes + _rxWords = re.compile(nwRegEx.WORDS, re.UNICODE) _rxItalic = re.compile(nwRegEx.FMT_EI, re.UNICODE) _rxBold = re.compile(nwRegEx.FMT_EB, re.UNICODE) _rxStrike = re.compile(nwRegEx.FMT_ST, re.UNICODE) _rxSCPlain = re.compile(nwRegEx.FMT_SC, re.UNICODE) _rxSCValue = re.compile(nwRegEx.FMT_SV, re.UNICODE) + @property + def wordSplit(self) -> re.Pattern: + """Split text into words.""" + return self._rxWords + @property def markdownItalic(self) -> re.Pattern: """Markdown italic style.""" diff --git a/tests/test_text/test_text_patterns.py b/tests/test_text/test_text_patterns.py index 48f1e4c8f..7f1def906 100644 --- a/tests/test_text/test_text_patterns.py +++ b/tests/test_text/test_text_patterns.py @@ -40,6 +40,52 @@ def allMatches(regEx: re.Pattern, text: str) -> list[list[str]]: return result +@pytest.mark.core +def testTextPatterns_Words(): + """Test the word split regex.""" + regEx = REGEX_PATTERNS.wordSplit + + # Spaces + assert allMatches(regEx, "one two three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Hyphens + assert allMatches(regEx, "one-two-three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Em Dashes + assert allMatches(regEx, "one\u2014two\u2014three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Em Dashes + assert allMatches(regEx, "one\u2014two\u2014three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Plus + assert allMatches(regEx, "one+two+three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Slash + assert allMatches(regEx, "one/two/three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Brackets + assert allMatches(regEx, "one[two]three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + # Colon + assert allMatches(regEx, "one:two:three") == [ + [("one", 0, 3)], [("two", 4, 7)], [("three", 8, 13)] + ] + + @pytest.mark.core def testTextPatterns_Markdown(): """Test the markdown pattern regexes.""" From 9b08ebbbd5c6d33c5bf5aa089cc462afcb231eb6 Mon Sep 17 00:00:00 2001 From: Veronica Berglyd Olsen <1619840+vkbo@users.noreply.github.com> Date: Sun, 22 Sep 2024 17:47:49 +0200 Subject: [PATCH 7/8] Use Python regex for document search tool --- novelwriter/core/coretools.py | 26 ++++++++++++-------------- novelwriter/types.py | 6 +----- tests/test_core/test_core_coretools.py | 2 +- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/novelwriter/core/coretools.py b/novelwriter/core/coretools.py index 08b96957d..5b4dc6d70 100644 --- a/novelwriter/core/coretools.py +++ b/novelwriter/core/coretools.py @@ -27,6 +27,7 @@ from __future__ import annotations import logging +import re import shutil from collections.abc import Iterable @@ -34,7 +35,7 @@ from pathlib import Path from zipfile import ZipFile, is_zipfile -from PyQt5.QtCore import QCoreApplication, QRegularExpression +from PyQt5.QtCore import QCoreApplication from novelwriter import CONFIG, SHARED from novelwriter.common import isHandle, minmax, simplified @@ -297,8 +298,8 @@ def duplicate(self, items: list[str]) -> Iterable[tuple[str, str | None]]: class DocSearch: def __init__(self) -> None: - self._regEx = QRegularExpression() - self.setCaseSensitive(False) + self._regEx = re.compile("") + self._opts = re.UNICODE | re.IGNORECASE self._words = False self._escape = True return @@ -309,10 +310,9 @@ def __init__(self) -> None: def setCaseSensitive(self, state: bool) -> None: """Set the case sensitive search flag.""" - opts = QRegularExpression.PatternOption.UseUnicodePropertiesOption + self._opts = re.UNICODE if not state: - opts |= QRegularExpression.PatternOption.CaseInsensitiveOption - self._regEx.setPatternOptions(opts) + self._opts |= re.IGNORECASE return def setWholeWords(self, state: bool) -> None: @@ -329,8 +329,8 @@ def iterSearch( self, project: NWProject, search: str ) -> Iterable[tuple[NWItem, list[tuple[int, int, str]], bool]]: """Iteratively search through documents in a project.""" - self._regEx.setPattern(self._buildPattern(search)) - logger.debug("Searching with pattern '%s'", self._regEx.pattern()) + self._regEx = re.compile(self._buildPattern(search), self._opts) + logger.debug("Searching with pattern '%s'", self._regEx.pattern) storage = project.storage for item in project.tree: if item.isFileType(): @@ -340,14 +340,12 @@ def iterSearch( def searchText(self, text: str) -> tuple[list[tuple[int, int, str]], bool]: """Search a piece of text for RegEx matches.""" - rxItt = self._regEx.globalMatch(text) count = 0 capped = False results = [] - while rxItt.hasNext(): - rxMatch = rxItt.next() - pos = rxMatch.capturedStart() - num = rxMatch.capturedLength() + for match in re.finditer(self._regEx, text): + pos = match.start(0) + num = len(match.group(0)) lim = text[:pos].rfind("\n") + 1 cut = text[lim:pos].rfind(" ") + lim + 1 context = text[cut:cut+100].partition("\n")[0] @@ -366,7 +364,7 @@ def searchText(self, text: str) -> tuple[list[tuple[int, int, str]], bool]: def _buildPattern(self, search: str) -> str: """Build the search pattern string.""" if self._escape: - search = QRegularExpression.escape(search) + search = re.escape(search) if self._words: search = f"(?:^|\\b){search}(?:$|\\b)" return search diff --git a/novelwriter/types.py b/novelwriter/types.py index 40f22acf2..ab684deef 100644 --- a/novelwriter/types.py +++ b/novelwriter/types.py @@ -23,7 +23,7 @@ """ from __future__ import annotations -from PyQt5.QtCore import QRegularExpression, Qt +from PyQt5.QtCore import Qt from PyQt5.QtGui import QColor, QFont, QPainter, QTextCharFormat, QTextCursor, QTextFormat from PyQt5.QtWidgets import QDialog, QDialogButtonBox, QSizePolicy, QStyle @@ -115,10 +115,6 @@ QtScrollAlwaysOff = Qt.ScrollBarPolicy.ScrollBarAlwaysOff QtScrollAsNeeded = Qt.ScrollBarPolicy.ScrollBarAsNeeded -# Other - -QRegExUnicode = QRegularExpression.PatternOption.UseUnicodePropertiesOption - # Maps FONT_WEIGHTS: dict[int, int] = { diff --git a/tests/test_core/test_core_coretools.py b/tests/test_core/test_core_coretools.py index faf11cfc4..3827db7a9 100644 --- a/tests/test_core/test_core_coretools.py +++ b/tests/test_core/test_core_coretools.py @@ -421,7 +421,7 @@ def testCoreTools_DocSearch(monkeypatch, mockGUI, fncPath, mockRnd, ipsumText): # Patterns # ======== - # Escape Using QRegularExpression + # Escape assert search._buildPattern("[A-Za-z0-9_]+") == r"\[A\-Za\-z0\-9_\]\+" # Whole Words From 17b4a7be4df5232857040f8a37167e829018d819 Mon Sep 17 00:00:00 2001 From: Veronica Berglyd Olsen <1619840+vkbo@users.noreply.github.com> Date: Sun, 22 Sep 2024 17:57:13 +0200 Subject: [PATCH 8/8] Use regex end instead of length in tokenizer format processing function --- novelwriter/core/tokenizer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/novelwriter/core/tokenizer.py b/novelwriter/core/tokenizer.py index c98d25e06..4c90127a2 100644 --- a/novelwriter/core/tokenizer.py +++ b/novelwriter/core/tokenizer.py @@ -1111,14 +1111,14 @@ def _extractFormats( for regEx, fmts in self._rxMarkdown: for match in re.finditer(regEx, text): temp.extend( - (match.start(n), len(match.group(n)), fmt, "") + (match.start(n), match.end(n), fmt, "") for n, fmt in enumerate(fmts) if fmt > 0 ) # Match Shortcodes for match in re.finditer(REGEX_PATTERNS.shortcodePlain, text): temp.append(( - match.start(1), len(match.group(1)), + match.start(1), match.end(1), self._shortCodeFmt.get(match.group(1).lower(), 0), "", )) @@ -1128,7 +1128,7 @@ def _extractFormats( for match in re.finditer(REGEX_PATTERNS.shortcodeValue, text): kind = self._shortCodeVals.get(match.group(1).lower(), 0) temp.append(( - match.start(0), len(match.group(0)), + match.start(0), match.end(0), self.FMT_STRIP if kind == skip else kind, f"{tHandle}:{match.group(2)}", )) @@ -1143,11 +1143,11 @@ def _extractFormats( # Post-process text and format result = text formats = [] - for pos, n, fmt, key in reversed(sorted(temp, key=lambda x: x[0])): + for pos, end, fmt, key in reversed(sorted(temp, key=lambda x: x[0])): if fmt > 0: - if n > 0: - result = result[:pos] + result[pos+n:] - formats = [(p-n if p > pos else p, f, k) for p, f, k in formats] + if end > pos: + result = result[:pos] + result[end:] + formats = [(p+pos-end if p > pos else p, f, k) for p, f, k in formats] formats.insert(0, (pos, fmt, key)) return result, formats