Skip to content

Commit

Permalink
Added 'MatchesTransformer.re_flavour' and 'ReplaceTransformer.re_flavour' attributes
Browse files Browse the repository at this point in the history

See #228
  • Loading branch information
vruusmann committed Oct 14, 2024
1 parent 1c4524e commit 1fa5918
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 11 deletions.
10 changes: 6 additions & 4 deletions sklearn2pmml/preprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,34 +574,36 @@ def transform(self, X):
class MatchesTransformer(BaseEstimator, TransformerMixin):
"""Match RE pattern."""

def __init__(self, pattern):
def __init__(self, pattern, re_flavour = None):
self.pattern = pattern
self.re_flavour = re_flavour

def fit(self, X, y = None):
to_1d(X)
return self

def transform(self, X):
X1d = to_1d(X)
regex_engine = make_regex_engine(self.pattern)
regex_engine = make_regex_engine(self.pattern, self.re_flavour)
func = lambda x: bool(regex_engine.matches(x))
Xt = eval_rows(X1d, func, shape = X.shape)
return Xt

class ReplaceTransformer(BaseEstimator, TransformerMixin):
"""Replace all RE pattern matches."""

def __init__(self, pattern, replacement):
def __init__(self, pattern, replacement, re_flavour = None):
self.pattern = pattern
self.replacement = replacement
self.re_flavour = re_flavour

def fit(self, X, y = None):
to_1d(X)
return self

def transform(self, X):
X1d = to_1d(X)
regex_engine = make_regex_engine(self.pattern)
regex_engine = make_regex_engine(self.pattern, self.re_flavour)
func = lambda x: regex_engine.replace(self.replacement, x)
Xt = eval_rows(X1d, func, shape = X.shape)
return Xt
Expand Down
23 changes: 18 additions & 5 deletions sklearn2pmml/preprocessing/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,22 @@ def matches(self, x):
def replace(self, replacement, x):
return self.pattern_.substitute(replacement, x)

def make_regex_engine(pattern):
try:
def make_regex_engine(pattern, re_flavour):
if re_flavour is None:
try:
import pcre2

re_flavour = "pcre2"
except ImportError:
warnings.warn("Perl Compatible Regular Expressions (PCRE) library is not available, falling back to built-in Regular Expressions (RE) library. Transformation results might not be reproducible between Python and PMML environments when using more complex patterns", Warning)
re_flavour = "re"

if re_flavour == "pcre":
return PCREEngine(pattern)
except ImportError:
warnings.warn("Perl Compatible Regular Expressions (PCRE) library is not available, falling back to built-in Regular Expressions (RE) library. Transformation results might not be reproducible between Python and PMML environments when using more complex patterns", Warning)
return REEngine(pattern)
elif re_flavour == "pcre2":
return PCRE2Engine(pattern)
elif re_flavour == "re":
return REEngine(pattern)
else:
re_flavours = ["pcre", "pcre2", "re"]
raise ValueError("Regular Expressions flavour {0} not in {1}".format(re_flavour, re_flavours))
4 changes: 2 additions & 2 deletions sklearn2pmml/preprocessing/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,14 +648,14 @@ class MatchesTransformerTest(TestCase):

def test_transform(self):
X = numpy.asarray(["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"])
transformer = MatchesTransformer("ar?y")
transformer = MatchesTransformer("ar?y", re_flavour = "re")
self.assertEqual([True, True, False, False, True, False, False, False, False, False, False, False], transformer.transform(X).tolist())

class ReplaceTransformerTest(TestCase):

def test_transform(self):
X = numpy.asarray(["A", "B", "BA", "BB", "BAB", "ABBA", "BBBB"])
transformer = ReplaceTransformer("B+", "c")
transformer = ReplaceTransformer("B+", "c", re_flavour = "re")
self.assertEqual(["A", "c", "cA", "c", "cAc", "AcA", "c"], transformer.transform(X).tolist())
vectorizer = CountVectorizer()
pipeline = make_pipeline(transformer, vectorizer)
Expand Down

0 comments on commit 1fa5918

Please sign in to comment.