caleb531 · eliotwrobson · Sep 23, 2024 · Jul 18, 2024 · Sep 10, 2024 · Sep 10, 2024
diff --git a/automata/fa/nfa.py b/automata/fa/nfa.py
@@ -2,7 +2,6 @@
 """Classes and methods for working with nondeterministic finite automata."""
 from __future__ import annotations
 
-import string
 from collections import deque
 from itertools import chain, count, product, repeat
 from typing import (
@@ -37,7 +36,6 @@
 NFAPathT = Mapping[str, AbstractSet[NFAStateT]]
 NFATransitionsT = Mapping[NFAStateT, NFAPathT]
 InputPathListT = List[Tuple[NFAStateT, NFAStateT, str]]
-DEFAULT_REGEX_SYMBOLS = frozenset(chain(string.ascii_letters, string.digits))
 
 
 class NFA(fa.FA):
@@ -211,7 +209,7 @@ def from_regex(
             The regex to construct an equivalent NFA for.
         input_symbols : Optional[AbstractSet[str]], default: None
             The set of input symbols to create the NFA over. If not
-            set, defaults to all ascii letters and digits.
+            set, defaults to all non-reserved characters found in the regex.
 
         Returns
         ------
@@ -220,7 +218,7 @@ def from_regex(
         """
 
         if input_symbols is None:
-            input_symbols = DEFAULT_REGEX_SYMBOLS
+            input_symbols = frozenset(regex) - RESERVED_CHARACTERS
         else:
             conflicting_symbols = RESERVED_CHARACTERS & input_symbols
             if conflicting_symbols:

diff --git a/automata/regex/parser.py b/automata/regex/parser.py
@@ -542,9 +542,6 @@ def get_regex_lexer(
 
     lexer.register_token(LeftParen.from_match, r"\(")
     lexer.register_token(RightParen.from_match, r"\)")
-    lexer.register_token(
-        lambda match: StringToken(match.group(), state_name_counter), r"[A-Za-z0-9]"
-    )
     lexer.register_token(UnionToken.from_match, r"\|")
     lexer.register_token(IntersectionToken.from_match, r"\&")
     lexer.register_token(ShuffleToken.from_match, r"\^")
@@ -556,6 +553,9 @@ def get_regex_lexer(
         lambda match: WildcardToken(match.group(), input_symbols, state_name_counter),
         r"\.",
     )
+    lexer.register_token(
+        lambda match: StringToken(match.group(), state_name_counter), r"\S"
+    )
 
     return lexer
 

diff --git a/automata/regex/regex.py b/automata/regex/regex.py
@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
 """
 A set of tools for working with regular expressions. Can recognize regular expressions
-over the alphabet of ascii letters (lower and upper case), ascii digits, and subsets of
-these.
+over the alphabet of Unicode characters and subsets of these, excluding whitespace.
 
 A regular expression with the following operations only are supported in this library:
 

diff --git a/tests/test_regex.py b/tests/test_regex.py
@@ -43,6 +43,15 @@ def test_helper_validate_invalid(self) -> None:
         """Should pass validation for valid regular expression"""
         self.assertFalse(re._validate("a(|)"))
 
+    def test_validate_unicode_characters(self) -> None:
+        """Should pass validation for regular expressions with unicode characters."""
+        re.validate("(µ|🤖ù)*")
+
+    def test_unicode_input_symbols(self) -> None:
+        """Should have correct unicode input symbols."""
+        nfa = NFA.from_regex("(µ🔥|🔥✨?)*")
+        self.assertEqual(nfa.input_symbols, {"µ", "🔥", "✨"})
+
     def test_isequal(self) -> None:
         """Should correctly check equivalence of two regular expressions"""