Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unicode characters support in regex #233

Merged
merged 4 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions automata/fa/nfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
"""Classes and methods for working with nondeterministic finite automata."""
from __future__ import annotations

import string
from collections import deque
from itertools import chain, count, product, repeat
from typing import (
Expand Down Expand Up @@ -37,7 +36,6 @@
NFAPathT = Mapping[str, AbstractSet[NFAStateT]]
NFATransitionsT = Mapping[NFAStateT, NFAPathT]
InputPathListT = List[Tuple[NFAStateT, NFAStateT, str]]
DEFAULT_REGEX_SYMBOLS = frozenset(chain(string.ascii_letters, string.digits))


class NFA(fa.FA):
Expand Down Expand Up @@ -211,7 +209,7 @@ def from_regex(
The regex to construct an equivalent NFA for.
input_symbols : Optional[AbstractSet[str]], default: None
The set of input symbols to create the NFA over. If not
set, defaults to all ascii letters and digits.
set, defaults to all non-reserved characters found in the regex.
eliotwrobson marked this conversation as resolved.
Show resolved Hide resolved

Returns
------
Expand All @@ -220,7 +218,7 @@ def from_regex(
"""

if input_symbols is None:
input_symbols = DEFAULT_REGEX_SYMBOLS
input_symbols = frozenset(regex) - RESERVED_CHARACTERS
else:
conflicting_symbols = RESERVED_CHARACTERS & input_symbols
if conflicting_symbols:
Expand Down
6 changes: 3 additions & 3 deletions automata/regex/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,9 +542,6 @@ def get_regex_lexer(

lexer.register_token(LeftParen.from_match, r"\(")
lexer.register_token(RightParen.from_match, r"\)")
lexer.register_token(
lambda match: StringToken(match.group(), state_name_counter), r"[A-Za-z0-9]"
)
lexer.register_token(UnionToken.from_match, r"\|")
lexer.register_token(IntersectionToken.from_match, r"\&")
lexer.register_token(ShuffleToken.from_match, r"\^")
Expand All @@ -556,6 +553,9 @@ def get_regex_lexer(
lambda match: WildcardToken(match.group(), input_symbols, state_name_counter),
r"\.",
)
lexer.register_token(
lambda match: StringToken(match.group(), state_name_counter), r"\S"
)

return lexer

Expand Down
3 changes: 1 addition & 2 deletions automata/regex/regex.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#!/usr/bin/env python3
"""
A set of tools for working with regular expressions. Can recognize regular expressions
over the alphabet of ascii letters (lower and upper case), ascii digits, and subsets of
these.
over the alphabet of Unicode characters and subsets of these, excluding whitespace characters.

Only regular expressions built from the following operations are supported in this library:

Expand Down
9 changes: 9 additions & 0 deletions tests/test_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@ def test_helper_validate_invalid(self) -> None:
"""Should pass validation for valid regular expression"""
self.assertFalse(re._validate("a(|)"))

def test_validate_unicode_characters(self) -> None:
    """Validation should accept a regex made of non-ASCII symbols."""
    unicode_regex = "(µ|🤖ù)*"
    # The test passes as long as this call completes without raising.
    re.validate(unicode_regex)

eliotwrobson marked this conversation as resolved.
Show resolved Hide resolved
def test_unicode_input_symbols(self) -> None:
    """An NFA built from a unicode regex should report its symbols."""
    expected_symbols = {"µ", "🔥", "✨"}
    # No explicit input_symbols argument is passed, so the alphabet is
    # whatever from_regex derives on its own.
    automaton = NFA.from_regex("(µ🔥|🔥✨?)*")
    self.assertEqual(automaton.input_symbols, expected_symbols)

def test_isequal(self) -> None:
"""Should correctly check equivalence of two regular expressions"""

Expand Down
Loading