fix spacy example
RafalSkolasinski authored and seldondev committed Jun 11, 2020
1 parent d99c56f commit 00c691d
Showing 5 changed files with 161 additions and 264 deletions.
1 change: 1 addition & 0 deletions examples/models/sklearn_spacy_text/.gitignore
@@ -0,0 +1 @@
+*.model
4 changes: 4 additions & 0 deletions examples/models/sklearn_spacy_text/Dockerfile
@@ -0,0 +1,4 @@
+FROM seldonio/seldon-core-s2i-python3:1.1.1-rc
+
+RUN pip install spacy
+RUN python -m spacy download en_core_web_sm
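The model download now happens at image build time, so code running inside the container can load it unconditionally. A minimal sketch for checking this from inside the built image (the model name is the one pinned above):

import spacy

# Raises OSError if the build-time download was skipped or failed.
nlp = spacy.load("en_core_web_sm")
print(nlp.pipe_names)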
38 changes: 19 additions & 19 deletions examples/models/sklearn_spacy_text/ml_utils.py
@@ -1,15 +1,14 @@
 import spacy
-from spacy.cli import download
 import re
+import re
 import numpy as np
 from sklearn.base import TransformerMixin
 from html.parser import HTMLParser
 import dill
 import sys, os
 
-download("en_core_web_sm")
-nlp = spacy.load('en_core_web_sm', parser=False, entity=False)
+nlp = spacy.load("en_core_web_sm", parser=False, entity=False)
 
 
 class SpacyTokenTransformer(TransformerMixin):
     __symbols = set("!$%^&*()_+|~-=`{}[]:\";'<>?,./-")

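For reference, the parser=False, entity=False keywords are the older style carried over from spaCy 1.x; with the spaCy 2.x releases pinned in requirements.txt, the documented way to skip pipeline components is the disable argument. An equivalent sketch, not part of this commit:

import spacy

# Skip the dependency parser and named-entity recognizer explicitly.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])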
@@ -20,29 +19,30 @@ def transform(self, X, **kwargs):
 
     def fit(self, X, y=None, **fit_params):
         return self
 
     @staticmethod
     def transform_to_token(text):
         str_text = str(text)
-        doc = nlp(str_text, disable=['parser', 'tagger', 'ner'])
+        doc = nlp(str_text, disable=["parser", "tagger", "ner"])
         tokens = []
         for token in doc:
             if token.like_url:
                 clean_token = "URL"
             else:
                 clean_token = token.lemma_.lower().strip()
-                if len(clean_token) < 1 or clean_token in \
-                        SpacyTokenTransformer.__symbols:
+                if (
+                    len(clean_token) < 1
+                    or clean_token in SpacyTokenTransformer.__symbols
+                ):
                     continue
             tokens.append(clean_token)
         return tokens
 
 
 class CleanTextTransformer(TransformerMixin):
     __html_parser = HTMLParser()
-    __uplus_pattern = \
-        re.compile("\<[uU]\+(?P<digit>[a-zA-Z0-9]+)\>")
-    __markup_link_pattern = \
-        re.compile("\[(.*)\]\((.*)\)")
+    __uplus_pattern = re.compile("\<[uU]\+(?P<digit>[a-zA-Z0-9]+)\>")
+    __markup_link_pattern = re.compile("\[(.*)\]\((.*)\)")
 
     def transform(self, X, **kwargs):
         f = np.vectorize(CleanTextTransformer.transform_clean_text)
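transform_to_token is a plain static method, so it can be sanity-checked without the rest of the pipeline. A hedged example (the sample sentence is made up; exact lemmas depend on the installed en_core_web_sm version):

from ml_utils import SpacyTokenTransformer

# URLs collapse to the placeholder "URL"; other tokens become lowercased
# lemmas, with empty strings and bare punctuation filtered out.
tokens = SpacyTokenTransformer.transform_to_token(
    "Visit https://example.com for more details!"
)
print(tokens)  # roughly: ['visit', 'URL', 'for', 'more', 'detail']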
@@ -51,25 +51,25 @@ def transform(self, X, **kwargs):
 
     def fit(self, X, y=None, **fit_params):
         return self
 
     @staticmethod
     def transform_clean_text(raw_text):
         try:
             decoded = raw_text.encode("ISO-8859-1").decode("utf-8")
         except:
             decoded = raw_text.encode("ISO-8859-1").decode("cp1252")
-        html_unescaped = CleanTextTransformer.\
-            __html_parser.unescape(decoded)
+        html_unescaped = CleanTextTransformer.__html_parser.unescape(decoded)
         html_unescaped = re.sub(r"\r\n", " ", html_unescaped)
         html_unescaped = re.sub(r"\r\r\n", " ", html_unescaped)
         html_unescaped = re.sub(r"\r", " ", html_unescaped)
         html_unescaped = html_unescaped.replace("&gt;", " > ")
         html_unescaped = html_unescaped.replace("&lt;", " < ")
         html_unescaped = html_unescaped.replace("--", " - ")
         html_unescaped = CleanTextTransformer.__uplus_pattern.sub(
-            " U\g<digit> ", html_unescaped)
+            " U\g<digit> ", html_unescaped
+        )
         html_unescaped = CleanTextTransformer.__markup_link_pattern.sub(
-            " \1 \2 ", html_unescaped)
+            " \1 \2 ", html_unescaped
+        )
         html_unescaped = html_unescaped.replace("\\", "")
         return html_unescaped

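The two transformers are meant to chain: clean the raw text first, then tokenize. An end-to-end sketch using only the static methods shown above (the sample string is invented):

from ml_utils import CleanTextTransformer, SpacyTokenTransformer

# The markdown-style link and the &gt; entity exercise the cleaning rules above.
raw = "Read [the docs](http://example.com) &gt; it's useful"
cleaned = CleanTextTransformer.transform_clean_text(raw)
tokens = SpacyTokenTransformer.transform_to_token(cleaned)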
10 changes: 5 additions & 5 deletions examples/models/sklearn_spacy_text/requirements.txt
@@ -1,5 +1,5 @@
-scipy>= 0.13.3
-scikit-learn>=0.18
-spacy==2.0.18
-dill==0.2.9
-seldon-core==0.2.7
+seldon-core>=1.1
+scipy>=1.0
+scikit-learn>=0.20
+spacy>=2.2
+dill>=0.3
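A quick way to confirm an existing environment satisfies the loosened pins (import names match the package names above; seldon-core imports as seldon_core):

import dill
import sklearn
import spacy

# Expect spacy >= 2.2, scikit-learn >= 0.20 and dill >= 0.3 per requirements.txt.
print(spacy.__version__, sklearn.__version__, dill.__version__)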
