From 4a5214c3470923cca3c462f9a65437b7452101a6 Mon Sep 17 00:00:00 2001 From: 5j9 <5j9@users.noreply.github.com> Date: Thu, 11 Apr 2024 23:12:19 +0330 Subject: [PATCH] chore(_balanced_quotes_shadow): rewrite to return spans This branch was intended to improve bolds/italics performance (#132), but my test results do not show meaningful enough performance improvements to convince me to merge it into the main branch. --- tests/wikitext/test_get_bolds_and_italics.py | 45 +++-- wikitextparser/_comment_bold_italic.py | 2 +- wikitextparser/_parameter.py | 2 +- wikitextparser/_parser_function.py | 2 +- wikitextparser/_tag.py | 2 +- wikitextparser/_template.py | 2 +- wikitextparser/_wikilink.py | 2 +- wikitextparser/_wikitext.py | 181 ++++++++++--------- 8 files changed, 130 insertions(+), 108 deletions(-) diff --git a/tests/wikitext/test_get_bolds_and_italics.py b/tests/wikitext/test_get_bolds_and_italics.py index 11b3720..5f4e27e 100644 --- a/tests/wikitext/test_get_bolds_and_italics.py +++ b/tests/wikitext/test_get_bolds_and_italics.py @@ -21,19 +21,10 @@ def test_get_bolds(): assert_bold("'''b'''", "'''b'''") assert_no_bold("''i1'''s") assert_no_bold("") - assert_bold( - "a'''" "b'''d", - "'''b'''", - ) assert_bold("'''b{{a|'''}}", "'''b{{a|'''}}") # ? assert_bold("a'''b{{text|c|d}}e'''f", "'''b{{text|c|d}}e'''") assert_bold("{{text|'''b'''}}", "'''b'''") assert_bold("{{text|'''b}}", "'''b") # ? - assert_bold("{{{PARAM|'''b}}} c", "'''b") # ? 
- assert ( - repr(parse("'''b\na'''c").get_bolds()) - == """[Bold("'''b"), Bold("'''c")]""" - ) assert_bold("'''b'''", "'''b'''") assert_bold("'''br'''c", "'''br'''") assert_bold("'''''b'''i", "'''b'''") @@ -53,19 +44,37 @@ def test_get_bolds(): assert_bold("{{text|{{text|'''b'''}}}}", "'''b'''") -def test_no_end_in_wikilink(): +def test_hald_bolds_with_newline_in_between(): + assert ( + repr(parse("'''b\na'''c").get_bolds()) + == """[Bold("'''b"), Bold("'''c")]""" + ) + + +def test_half_bold_in_param(): + assert_bold("{{{PARAM|'''b}}} c", "'''b") # ? + + +def test_half_bold_in_wikilink(): assert_bold("[[a|'''b]] c", "'''b") -def test_get_italics(): - def ai(s: str, o: str, r: bool = True): - italics = parse(s).get_italics(r) - assert len(italics) == 1 - assert italics[0].string == o +def test_comment_before_and_after_bold(): + assert_bold( + "a'''" "b'''d", + "'''b'''", + ) + +def ai(s: str, o: str, r: bool = True): + italics = parse(s).get_italics(r) + assert len(italics) == 1 + assert italics[0].string == o + + +def test_get_italics(): ai("''i'''", "''i'''") ai("a''' '' b '' '''c", "'' b ''") - ai("'''''i'''''", "'''''i'''''") ai("a'' ''' ib ''' ''c", "'' ''' ib ''' ''") ai("''i''", "''i''") ai( @@ -81,6 +90,10 @@ def ai(s: str, o: str, r: bool = True): ai("''' ''i'''", "''i'''") +def test_get_italics_2(): + ai("'''''i'''''", "'''''i'''''") + + def test_bold_italic_index_change(): p = parse("'''b1''' ''i1'' '''b2'''") b1, b2 = p.get_bolds(recursive=False) diff --git a/wikitextparser/_comment_bold_italic.py b/wikitextparser/_comment_bold_italic.py index 607685d..c333cf7 100644 --- a/wikitextparser/_comment_bold_italic.py +++ b/wikitextparser/_comment_bold_italic.py @@ -49,7 +49,7 @@ def text(self, s: str): self[b:e] = s @property - def _content_span(self) -> Tuple[int, int]: + def _relative_content_span(self) -> Tuple[int, int]: # noinspection PyUnresolvedReferences return self._match.span(1) diff --git a/wikitextparser/_parameter.py 
b/wikitextparser/_parameter.py index f8055fa..2336151 100644 --- a/wikitextparser/_parameter.py +++ b/wikitextparser/_parameter.py @@ -104,5 +104,5 @@ def parameters(self) -> List['Parameter']: return super().parameters[1:] @property - def _content_span(self) -> Tuple[int, int]: + def _relative_content_span(self) -> Tuple[int, int]: return 3, -3 diff --git a/wikitextparser/_parser_function.py b/wikitextparser/_parser_function.py index d336961..066adff 100644 --- a/wikitextparser/_parser_function.py +++ b/wikitextparser/_parser_function.py @@ -19,7 +19,7 @@ class SubWikiTextWithArgs(SubWikiText): _first_arg_sep = 0 @property - def _content_span(self) -> Tuple[int, int]: + def _relative_content_span(self) -> Tuple[int, int]: return 2, -2 @property diff --git a/wikitextparser/_tag.py b/wikitextparser/_tag.py index 7fa8a65..2bd5bb5 100644 --- a/wikitextparser/_tag.py +++ b/wikitextparser/_tag.py @@ -214,6 +214,6 @@ def get_tags(self, name=None) -> List['Tag']: return super().get_tags(name)[1:] @property - def _content_span(self) -> Tuple[int, int]: + def _relative_content_span(self) -> Tuple[int, int]: s = self.string return s.find('>') + 1, s.rfind('<') diff --git a/wikitextparser/_template.py b/wikitextparser/_template.py index 8d10530..930be49 100644 --- a/wikitextparser/_template.py +++ b/wikitextparser/_template.py @@ -29,7 +29,7 @@ class Template(SubWikiTextWithArgs): _first_arg_sep = 124 @property - def _content_span(self) -> Tuple[int, int]: + def _relative_content_span(self) -> Tuple[int, int]: return 2, -2 def normal_name( diff --git a/wikitextparser/_wikilink.py b/wikitextparser/_wikilink.py index befd977..2b474af 100644 --- a/wikitextparser/_wikilink.py +++ b/wikitextparser/_wikilink.py @@ -22,7 +22,7 @@ class WikiLink(SubWikiText): __slots__ = '_cached_match' @property - def _content_span(self) -> Tuple[int, int]: + def _relative_content_span(self) -> Tuple[int, int]: s = self.string f = s.find rf = s.rfind diff --git a/wikitextparser/_wikitext.py 
b/wikitextparser/_wikitext.py index 9a1b449..2a2ef47 100644 --- a/wikitextparser/_wikitext.py +++ b/wikitextparser/_wikitext.py @@ -1,4 +1,4 @@ -from bisect import bisect_left, bisect_right, insort_right +from bisect import bisect_left, bisect_right, insort_left, insort_right from html import unescape from itertools import compress, islice from operator import attrgetter @@ -114,31 +114,7 @@ ).finditer BOLD_ITALIC_FINDITER = rc( # bold-italic, bold, or italic tokens - rb"""((?>'\0*)*?)'\0*+'\0*+('\0*+('\0*+')?+)?+(?=[^']|$)""", - MULTILINE | VERBOSE, -).finditer - -BOLD_FINDITER = rc( - rb""" - # start token - '\0*+'\0*+' - # content - (\0*+[^'\n]++.*?) - # end token - (?:'\0*+'\0*+'|$) -""", - MULTILINE | VERBOSE, -).finditer - -ITALIC_FINDITER = rc( - rb""" - # start token - '\0*+' - # content - (\0*+[^'\n]++.*?) - # end token - (?:'\0*+'|$) -""", + rb"""'\0*+(')(?:\0*+('))?+(?:\0*(')\0*')?+(?=[^']|$)""", MULTILINE | VERBOSE, ).finditer @@ -206,6 +182,9 @@ def _table_to_text(t: 'Table') -> str: ) +_MarkupSpans = List[tuple[int, int]] + + class WikiText: # In subclasses of WikiText _type is used as the key for _type_to_spans # Therefore: self._span can be found in self._type_to_spans[self._type]. @@ -575,8 +554,8 @@ def _nesting_level(self, parent_types) -> int: return level @property - def _content_span(self) -> Tuple[int, int]: - # return content_start, self_len, self_end + def _relative_content_span(self) -> Tuple[int, int]: + # return content_start, content_end return 0, len(self) @property @@ -601,7 +580,7 @@ def _shadow(self) -> bytearray: self._lststr[0][ss:se], 'ascii', 'replace' ) if self._type in SPAN_PARSER_TYPES: - cs, ce = self._content_span + cs, ce = self._relative_content_span head = shadow[:cs] tail = shadow[ce:] shadow[:cs] = b'_' * cs @@ -1008,69 +987,105 @@ def comments(self) -> List['Comment']: ] @property - def _balanced_quotes_shadow(self) -> bytearray: - """Return a byte array with non-markup-apostrophes removed. 
+ def _bold_italic_marks( + self, + ) -> tuple[bytearray, _MarkupSpans, _MarkupSpans]: + """Return (shadow, bold markup spans, italic markup spans). The comments at /includes/parser/Parser.php:doQuotes are helpful: https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php https://phabricator.wikimedia.org/T15227#178834 """ - bold_matches = [] - odd_italics = False - odd_bold_italics = False - shadow_copy = self._shadow[:] - append_bold = bold_matches.append + bold_marks = [] + italic_marks: _MarkupSpans = [] + line_probably_bolds: _MarkupSpans = [] + line_italics: _MarkupSpans = [] + line_bolds = [] + shadow = self._shadow + find = shadow.find + cs, ce = self._relative_content_span + if ce < -1: + ce += self._span_data[1] def process_line(): - nonlocal odd_italics - if odd_italics and (len(bold_matches) + odd_bold_italics) % 2: - # one of the bold marks needs to be interpreted as italic + nonlocal bold_marks, italic_marks, line_bolds + if ( + len(line_italics) % 2 + and (len(line_bolds) + len(line_probably_bolds)) % 2 + ): + # one of the probably_bolds needs to be interpreted as italic first_multi_letter_word = first_space = None - for bold_match in bold_matches: - bold_start = bold_match.start() - if shadow_copy[bold_start - 1 : bold_start] == b' ': + for i, (lpbs, _) in enumerate(line_probably_bolds): + if shadow[lpbs - 1] == 32: # space if first_space is None: - first_space = bold_start + first_space = i continue - if shadow_copy[bold_start - 2 : bold_start - 1] == b' ': - shadow_copy[bold_start] = 95 # _ + if shadow[lpbs - 2] == 32: # space + s, e = line_probably_bolds.pop(i) + insort_left(line_italics, (s + 1, e)) break # first_single_letter_word if first_multi_letter_word is None: - first_multi_letter_word = bold_start + first_multi_letter_word = i continue else: # there was no first_single_letter_word if first_multi_letter_word is not None: - shadow_copy[first_multi_letter_word] = 95 # _ + s, e = 
line_probably_bolds.pop(first_multi_letter_word) + insort_left(line_italics, (s + 1, e)) elif first_space is not None: - shadow_copy[first_space] = 95 # _ - bold_matches.clear() - odd_italics = False + s, e = line_probably_bolds.pop(first_space) + insort_left(line_italics, (s + 1, e)) + + line_bolds += line_probably_bolds + line_bolds.sort() + if len(line_italics) % 2: + line_end = find(b'\n', line_italics[-1][1], ce) + if line_end == -1: + line_end = ce + line_italics.append((line_end, line_end)) + if len(line_bolds) % 2: + line_end = find(b'\n', line_bolds[-1][1], ce) + if line_end == -1: + line_end = ce + line_bolds.append((line_end, line_end)) + + bold_marks += line_bolds + italic_marks += line_italics + line_bolds.clear() + line_probably_bolds.clear() + line_italics.clear() + + def add_bold_italic(): + line_bolds.append((ms, m.end(2))) + line_italics.append((m.start(3), me)) + + def add_italic_bold(): + line_italics.append((ms, m.end(1))) + line_bolds.append((m.start(2), me)) last_end = 0 - find = shadow_copy.find - for m in BOLD_ITALIC_FINDITER(shadow_copy): - if find(b'\n', last_end, m.start()) > -1: # newline + for m in BOLD_ITALIC_FINDITER(shadow, cs, ce): + ms, me = span = m.span() + if find(b'\n', last_end, ms) > -1: # newline process_line() if m[2] is None: # italic - odd_italics ^= True + line_italics.append(span) elif m[3] is None: # bold - s, e = m.span(1) - if s != e: # four apostrophes, hide the first one - shadow_copy[s] = 95 # _ - append_bold(m) + line_probably_bolds.append(span) else: # bold-italic - s, e = m.span(1) - es = e - s - if es: # more than 5 apostrophes, hide the previous ones - shadow_copy[s:e] = b'_' * es - odd_bold_italics ^= True - odd_italics ^= True - last_end = m.end() + # this part might need more tuning or later correction + if len(line_italics) % 2: # odd italics + add_bold_italic() + else: # even italics + add_italic_bold() + + last_end = me process_line() # string end - return shadow_copy + return shadow, bold_marks, 
italic_marks - def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]): + def _bolds_italics_recurse( + self, result: List[Union['Bold', 'Italic']], filter_cls: Optional[type] + ): for prop in ( 'templates', 'parser_functions', @@ -1113,19 +1128,19 @@ def get_bolds_and_italics( s = self._span_data[0] type_to_spans = self._type_to_spans tts_setdefault = type_to_spans.setdefault - balanced_shadow = self._balanced_quotes_shadow - rs, re = self._content_span + shadow, bold_marks, italic_marks = self._bold_italic_marks if filter_cls is None or filter_cls is Bold: bold_spans = tts_setdefault('Bold', []) get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get - bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re)) - for m in bold_matches: - ms, me = m.span() + bmi = iter(bold_marks) + + for start_mark, end_mark in zip(bmi, bmi): + ms, me = start_mark[0], end_mark[1] b, e = s + ms, s + me old_span = get_old_bold_span((b, e)) if old_span is None: - span = [b, e, None, balanced_shadow[ms:me]] + span = [b, e, None, shadow[ms:me]] insort_right(bold_spans, span) else: span = old_span @@ -1137,31 +1152,25 @@ def get_bolds_and_italics( return result elif filter_cls is Bold: return result - else: # filter_cls is Italic - bold_matches = BOLD_FINDITER(balanced_shadow, rs, re) # filter_cls is None or filter_cls is Italic - # remove bold tokens before searching for italics - for m in bold_matches: - ms, me = m.span() - cs, ce = m.span(1) # content - balanced_shadow[ms:cs] = b'_' * (cs - ms) - balanced_shadow[ce:me] = b'_' * (me - ce) - italic_spans = tts_setdefault('Italic', []) get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get - for m in ITALIC_FINDITER(balanced_shadow, rs, re): - ms, me = m.span() + imi = iter(italic_marks) + for start_mark, end_mark in zip(imi, imi): + ms, me = start_mark[0], end_mark[1] b, e = span = s + ms, s + me old_span = get_old_italic_span(span) if old_span is None: - span = [b, e, None, balanced_shadow[ms:me]] 
+ span = [b, e, None, shadow[ms:me]] insort_right(italic_spans, span) else: span = old_span append( - Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1)) + Italic( + _lststr, type_to_spans, span, 'Italic', end_mark[0] != me + ) ) if recursive and filter_cls is Italic: self._bolds_italics_recurse(result, filter_cls)