From 4a5214c3470923cca3c462f9a65437b7452101a6 Mon Sep 17 00:00:00 2001
From: 5j9 <5j9@users.noreply.github.com>
Date: Thu, 11 Apr 2024 23:12:19 +0330
Subject: [PATCH] chore(_balanced_quotes_shadow): rewrite to return spans
This branch was intended to improve bolds/italics performance (#132),
but my test results do not show performance improvements meaningful
enough to convince me to merge it into the main branch.
---
tests/wikitext/test_get_bolds_and_italics.py | 45 +++--
wikitextparser/_comment_bold_italic.py | 2 +-
wikitextparser/_parameter.py | 2 +-
wikitextparser/_parser_function.py | 2 +-
wikitextparser/_tag.py | 2 +-
wikitextparser/_template.py | 2 +-
wikitextparser/_wikilink.py | 2 +-
wikitextparser/_wikitext.py | 181 ++++++++++---------
8 files changed, 130 insertions(+), 108 deletions(-)
diff --git a/tests/wikitext/test_get_bolds_and_italics.py b/tests/wikitext/test_get_bolds_and_italics.py
index 11b3720..5f4e27e 100644
--- a/tests/wikitext/test_get_bolds_and_italics.py
+++ b/tests/wikitext/test_get_bolds_and_italics.py
@@ -21,19 +21,10 @@ def test_get_bolds():
assert_bold("'''b'''", "'''b'''")
assert_no_bold("''i1'''s")
assert_no_bold("")
- assert_bold(
- "a'''" "b'''d",
- "'''b'''",
- )
assert_bold("'''b{{a|'''}}", "'''b{{a|'''}}") # ?
assert_bold("a'''b{{text|c|d}}e'''f", "'''b{{text|c|d}}e'''")
assert_bold("{{text|'''b'''}}", "'''b'''")
assert_bold("{{text|'''b}}", "'''b") # ?
- assert_bold("{{{PARAM|'''b}}} c", "'''b") # ?
- assert (
- repr(parse("'''b\na'''c").get_bolds())
- == """[Bold("'''b"), Bold("'''c")]"""
- )
assert_bold("'''b'''", "'''b'''")
assert_bold("'''br'''c", "'''br'''")
assert_bold("'''''b'''i", "'''b'''")
@@ -53,19 +44,37 @@ def test_get_bolds():
assert_bold("{{text|{{text|'''b'''}}}}", "'''b'''")
-def test_no_end_in_wikilink():
+def test_hald_bolds_with_newline_in_between():
+ assert (
+ repr(parse("'''b\na'''c").get_bolds())
+ == """[Bold("'''b"), Bold("'''c")]"""
+ )
+
+
+def test_half_bold_in_param():
+ assert_bold("{{{PARAM|'''b}}} c", "'''b") # ?
+
+
+def test_half_bold_in_wikilink():
assert_bold("[[a|'''b]] c", "'''b")
-def test_get_italics():
- def ai(s: str, o: str, r: bool = True):
- italics = parse(s).get_italics(r)
- assert len(italics) == 1
- assert italics[0].string == o
+def test_comment_before_and_after_bold():
+ assert_bold(
+ "a'''" "b'''d",
+ "'''b'''",
+ )
+
+def ai(s: str, o: str, r: bool = True):
+ italics = parse(s).get_italics(r)
+ assert len(italics) == 1
+ assert italics[0].string == o
+
+
+def test_get_italics():
ai("''i'''", "''i'''")
ai("a''' '' b '' '''c", "'' b ''")
- ai("'''''i'''''", "'''''i'''''")
ai("a'' ''' ib ''' ''c", "'' ''' ib ''' ''")
ai("''i''", "''i''")
ai(
@@ -81,6 +90,10 @@ def ai(s: str, o: str, r: bool = True):
ai("''' ''i'''", "''i'''")
+def test_get_italics_2():
+ ai("'''''i'''''", "'''''i'''''")
+
+
def test_bold_italic_index_change():
p = parse("'''b1''' ''i1'' '''b2'''")
b1, b2 = p.get_bolds(recursive=False)
diff --git a/wikitextparser/_comment_bold_italic.py b/wikitextparser/_comment_bold_italic.py
index 607685d..c333cf7 100644
--- a/wikitextparser/_comment_bold_italic.py
+++ b/wikitextparser/_comment_bold_italic.py
@@ -49,7 +49,7 @@ def text(self, s: str):
self[b:e] = s
@property
- def _content_span(self) -> Tuple[int, int]:
+ def _relative_content_span(self) -> Tuple[int, int]:
# noinspection PyUnresolvedReferences
return self._match.span(1)
diff --git a/wikitextparser/_parameter.py b/wikitextparser/_parameter.py
index f8055fa..2336151 100644
--- a/wikitextparser/_parameter.py
+++ b/wikitextparser/_parameter.py
@@ -104,5 +104,5 @@ def parameters(self) -> List['Parameter']:
return super().parameters[1:]
@property
- def _content_span(self) -> Tuple[int, int]:
+ def _relative_content_span(self) -> Tuple[int, int]:
return 3, -3
diff --git a/wikitextparser/_parser_function.py b/wikitextparser/_parser_function.py
index d336961..066adff 100644
--- a/wikitextparser/_parser_function.py
+++ b/wikitextparser/_parser_function.py
@@ -19,7 +19,7 @@ class SubWikiTextWithArgs(SubWikiText):
_first_arg_sep = 0
@property
- def _content_span(self) -> Tuple[int, int]:
+ def _relative_content_span(self) -> Tuple[int, int]:
return 2, -2
@property
diff --git a/wikitextparser/_tag.py b/wikitextparser/_tag.py
index 7fa8a65..2bd5bb5 100644
--- a/wikitextparser/_tag.py
+++ b/wikitextparser/_tag.py
@@ -214,6 +214,6 @@ def get_tags(self, name=None) -> List['Tag']:
return super().get_tags(name)[1:]
@property
- def _content_span(self) -> Tuple[int, int]:
+ def _relative_content_span(self) -> Tuple[int, int]:
s = self.string
return s.find('>') + 1, s.rfind('<')
diff --git a/wikitextparser/_template.py b/wikitextparser/_template.py
index 8d10530..930be49 100644
--- a/wikitextparser/_template.py
+++ b/wikitextparser/_template.py
@@ -29,7 +29,7 @@ class Template(SubWikiTextWithArgs):
_first_arg_sep = 124
@property
- def _content_span(self) -> Tuple[int, int]:
+ def _relative_content_span(self) -> Tuple[int, int]:
return 2, -2
def normal_name(
diff --git a/wikitextparser/_wikilink.py b/wikitextparser/_wikilink.py
index befd977..2b474af 100644
--- a/wikitextparser/_wikilink.py
+++ b/wikitextparser/_wikilink.py
@@ -22,7 +22,7 @@ class WikiLink(SubWikiText):
__slots__ = '_cached_match'
@property
- def _content_span(self) -> Tuple[int, int]:
+ def _relative_content_span(self) -> Tuple[int, int]:
s = self.string
f = s.find
rf = s.rfind
diff --git a/wikitextparser/_wikitext.py b/wikitextparser/_wikitext.py
index 9a1b449..2a2ef47 100644
--- a/wikitextparser/_wikitext.py
+++ b/wikitextparser/_wikitext.py
@@ -1,4 +1,4 @@
-from bisect import bisect_left, bisect_right, insort_right
+from bisect import bisect_left, bisect_right, insort_left, insort_right
from html import unescape
from itertools import compress, islice
from operator import attrgetter
@@ -114,31 +114,7 @@
).finditer
BOLD_ITALIC_FINDITER = rc( # bold-italic, bold, or italic tokens
- rb"""((?>'\0*)*?)'\0*+'\0*+('\0*+('\0*+')?+)?+(?=[^']|$)""",
- MULTILINE | VERBOSE,
-).finditer
-
-BOLD_FINDITER = rc(
- rb"""
- # start token
- '\0*+'\0*+'
- # content
- (\0*+[^'\n]++.*?)
- # end token
- (?:'\0*+'\0*+'|$)
-""",
- MULTILINE | VERBOSE,
-).finditer
-
-ITALIC_FINDITER = rc(
- rb"""
- # start token
- '\0*+'
- # content
- (\0*+[^'\n]++.*?)
- # end token
- (?:'\0*+'|$)
-""",
+ rb"""'\0*+(')(?:\0*+('))?+(?:\0*(')\0*')?+(?=[^']|$)""",
MULTILINE | VERBOSE,
).finditer
@@ -206,6 +182,9 @@ def _table_to_text(t: 'Table') -> str:
)
+_MarkupSpans = List[tuple[int, int]]
+
+
class WikiText:
# In subclasses of WikiText _type is used as the key for _type_to_spans
# Therefore: self._span can be found in self._type_to_spans[self._type].
@@ -575,8 +554,8 @@ def _nesting_level(self, parent_types) -> int:
return level
@property
- def _content_span(self) -> Tuple[int, int]:
- # return content_start, self_len, self_end
+ def _relative_content_span(self) -> Tuple[int, int]:
+ # return content_start, content_end
return 0, len(self)
@property
@@ -601,7 +580,7 @@ def _shadow(self) -> bytearray:
self._lststr[0][ss:se], 'ascii', 'replace'
)
if self._type in SPAN_PARSER_TYPES:
- cs, ce = self._content_span
+ cs, ce = self._relative_content_span
head = shadow[:cs]
tail = shadow[ce:]
shadow[:cs] = b'_' * cs
@@ -1008,69 +987,105 @@ def comments(self) -> List['Comment']:
]
@property
- def _balanced_quotes_shadow(self) -> bytearray:
- """Return a byte array with non-markup-apostrophes removed.
+ def _bold_italic_marks(
+ self,
+ ) -> tuple[bytearray, _MarkupSpans, _MarkupSpans]:
+ """Return (shadow, bold markup spans, italic markup spans).
The comments at /includes/parser/Parser.php:doQuotes are helpful:
https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php
https://phabricator.wikimedia.org/T15227#178834
"""
- bold_matches = []
- odd_italics = False
- odd_bold_italics = False
- shadow_copy = self._shadow[:]
- append_bold = bold_matches.append
+ bold_marks = []
+ italic_marks: _MarkupSpans = []
+ line_probably_bolds: _MarkupSpans = []
+ line_italics: _MarkupSpans = []
+ line_bolds = []
+ shadow = self._shadow
+ find = shadow.find
+ cs, ce = self._relative_content_span
+ if ce < -1:
+ ce += self._span_data[1]
def process_line():
- nonlocal odd_italics
- if odd_italics and (len(bold_matches) + odd_bold_italics) % 2:
- # one of the bold marks needs to be interpreted as italic
+ nonlocal bold_marks, italic_marks, line_bolds
+ if (
+ len(line_italics) % 2
+ and (len(line_bolds) + len(line_probably_bolds)) % 2
+ ):
+ # one of the probably_bolds needs to be interpreted as italic
first_multi_letter_word = first_space = None
- for bold_match in bold_matches:
- bold_start = bold_match.start()
- if shadow_copy[bold_start - 1 : bold_start] == b' ':
+ for i, (lpbs, _) in enumerate(line_probably_bolds):
+ if shadow[lpbs - 1] == 32: # space
if first_space is None:
- first_space = bold_start
+ first_space = i
continue
- if shadow_copy[bold_start - 2 : bold_start - 1] == b' ':
- shadow_copy[bold_start] = 95 # _
+ if shadow[lpbs - 2] == 32: # space
+ s, e = line_probably_bolds.pop(i)
+ insort_left(line_italics, (s + 1, e))
break # first_single_letter_word
if first_multi_letter_word is None:
- first_multi_letter_word = bold_start
+ first_multi_letter_word = i
continue
else: # there was no first_single_letter_word
if first_multi_letter_word is not None:
- shadow_copy[first_multi_letter_word] = 95 # _
+ s, e = line_probably_bolds.pop(first_multi_letter_word)
+ insort_left(line_italics, (s + 1, e))
elif first_space is not None:
- shadow_copy[first_space] = 95 # _
- bold_matches.clear()
- odd_italics = False
+ s, e = line_probably_bolds.pop(first_space)
+ insort_left(line_italics, (s + 1, e))
+
+ line_bolds += line_probably_bolds
+ line_bolds.sort()
+ if len(line_italics) % 2:
+ line_end = find(b'\n', line_italics[-1][1], ce)
+ if line_end == -1:
+ line_end = ce
+ line_italics.append((line_end, line_end))
+ if len(line_bolds) % 2:
+ line_end = find(b'\n', line_bolds[-1][1], ce)
+ if line_end == -1:
+ line_end = ce
+ line_bolds.append((line_end, line_end))
+
+ bold_marks += line_bolds
+ italic_marks += line_italics
+ line_bolds.clear()
+ line_probably_bolds.clear()
+ line_italics.clear()
+
+ def add_bold_italic():
+ line_bolds.append((ms, m.end(2)))
+ line_italics.append((m.start(3), me))
+
+ def add_italic_bold():
+ line_italics.append((ms, m.end(1)))
+ line_bolds.append((m.start(2), me))
last_end = 0
- find = shadow_copy.find
- for m in BOLD_ITALIC_FINDITER(shadow_copy):
- if find(b'\n', last_end, m.start()) > -1: # newline
+ for m in BOLD_ITALIC_FINDITER(shadow, cs, ce):
+ ms, me = span = m.span()
+ if find(b'\n', last_end, ms) > -1: # newline
process_line()
if m[2] is None: # italic
- odd_italics ^= True
+ line_italics.append(span)
elif m[3] is None: # bold
- s, e = m.span(1)
- if s != e: # four apostrophes, hide the first one
- shadow_copy[s] = 95 # _
- append_bold(m)
+ line_probably_bolds.append(span)
else: # bold-italic
- s, e = m.span(1)
- es = e - s
- if es: # more than 5 apostrophes, hide the previous ones
- shadow_copy[s:e] = b'_' * es
- odd_bold_italics ^= True
- odd_italics ^= True
- last_end = m.end()
+ # this part might need more tuning or later correction
+ if len(line_italics) % 2: # odd italics
+ add_bold_italic()
+ else: # even italics
+ add_italic_bold()
+
+ last_end = me
process_line() # string end
- return shadow_copy
+ return shadow, bold_marks, italic_marks
- def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]):
+ def _bolds_italics_recurse(
+ self, result: List[Union['Bold', 'Italic']], filter_cls: Optional[type]
+ ):
for prop in (
'templates',
'parser_functions',
@@ -1113,19 +1128,19 @@ def get_bolds_and_italics(
s = self._span_data[0]
type_to_spans = self._type_to_spans
tts_setdefault = type_to_spans.setdefault
- balanced_shadow = self._balanced_quotes_shadow
- rs, re = self._content_span
+ shadow, bold_marks, italic_marks = self._bold_italic_marks
if filter_cls is None or filter_cls is Bold:
bold_spans = tts_setdefault('Bold', [])
get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get
- bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re))
- for m in bold_matches:
- ms, me = m.span()
+ bmi = iter(bold_marks)
+
+ for start_mark, end_mark in zip(bmi, bmi):
+ ms, me = start_mark[0], end_mark[1]
b, e = s + ms, s + me
old_span = get_old_bold_span((b, e))
if old_span is None:
- span = [b, e, None, balanced_shadow[ms:me]]
+ span = [b, e, None, shadow[ms:me]]
insort_right(bold_spans, span)
else:
span = old_span
@@ -1137,31 +1152,25 @@ def get_bolds_and_italics(
return result
elif filter_cls is Bold:
return result
- else: # filter_cls is Italic
- bold_matches = BOLD_FINDITER(balanced_shadow, rs, re)
# filter_cls is None or filter_cls is Italic
- # remove bold tokens before searching for italics
- for m in bold_matches:
- ms, me = m.span()
- cs, ce = m.span(1) # content
- balanced_shadow[ms:cs] = b'_' * (cs - ms)
- balanced_shadow[ce:me] = b'_' * (me - ce)
-
italic_spans = tts_setdefault('Italic', [])
get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get
- for m in ITALIC_FINDITER(balanced_shadow, rs, re):
- ms, me = m.span()
+ imi = iter(italic_marks)
+ for start_mark, end_mark in zip(imi, imi):
+ ms, me = start_mark[0], end_mark[1]
b, e = span = s + ms, s + me
old_span = get_old_italic_span(span)
if old_span is None:
- span = [b, e, None, balanced_shadow[ms:me]]
+ span = [b, e, None, shadow[ms:me]]
insort_right(italic_spans, span)
else:
span = old_span
append(
- Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1))
+ Italic(
+ _lststr, type_to_spans, span, 'Italic', end_mark[0] != me
+ )
)
if recursive and filter_cls is Italic:
self._bolds_italics_recurse(result, filter_cls)