Merge branch 'master' into cli
wochinge authored Mar 4, 2019
2 parents 7604f32 + 351fa3d commit c0fe92d
Showing 8 changed files with 113 additions and 27 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -11,6 +11,7 @@ Added
-----
- Added a detailed warning showing which entities are overlapping
- Authentication token can also be set with the env variable `RASA_NLU_TOKEN`.
- `SpacyEntityExtractor` supports same entity filtering as `DucklingHTTPExtractor`

Changed
-------
18 changes: 17 additions & 1 deletion docs/components.rst
@@ -536,9 +536,24 @@ SpacyEntityExtractor
As of now, this component can only use the spacy builtin entity extraction models and cannot be retrained.
This extractor does not provide any confidence scores.

:Configuration:
Configure which dimensions, i.e. entity types, the spacy component
should extract. A full list of available dimensions can be found in
the `spacy documentation <https://spacy.io/api/annotation#section-named-entities>`_.
Leaving the dimensions option unspecified will extract all available dimensions.

.. code-block:: yaml

    pipeline:
    - name: "SpacyEntityExtractor"
      # dimensions to extract
      dimensions: ["PERSON", "LOC", "ORG", "PRODUCT"]

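To illustrate what the ``dimensions`` option does, the sketch below (illustrative values, not part of the docs) applies the same filtering the extractor performs internally, keeping only entities whose label appears in the configured list:

.. code-block:: python

    # illustrative sketch: entities as the spacy extractor might report them
    extracted = [
        {"entity": "PERSON", "value": "Sebastian Thrun", "start": 26, "end": 41},
        {"entity": "DATE", "value": "tomorrow", "start": 0, "end": 8},
    ]

    # with dimensions: ["PERSON", "LOC", "ORG", "PRODUCT"] configured above,
    # only entities whose label is in that list are kept
    dimensions = ["PERSON", "LOC", "ORG", "PRODUCT"]
    kept = [e for e in extracted if e["entity"] in dimensions]
    assert [e["value"] for e in kept] == ["Sebastian Thrun"]
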
EntitySynonymMapper
~~~~~~~~~~~~~~~~~~~


:Short: Maps synonymous entity values to the same value.
:Outputs: modifies existing entities that previous entity extraction components found

@@ -672,8 +687,9 @@ DucklingHTTPExtractor

:Configuration:
Configure which dimensions, i.e. entity types, the duckling component
to extract. A full list of available dimensions can be found in
should extract. A full list of available dimensions can be found in
the `duckling documentation <https://duckling.wit.ai/>`_.
Leaving the dimensions option unspecified will extract all available dimensions.

.. code-block:: yaml
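The ``dimensions`` option can also be set programmatically when building a pipeline config. A minimal sketch based on the ``RasaNLUModelConfig`` usage in the tests added by this commit (the URL and dimensions are example values, and the import path is assumed):

.. code-block:: python

    from rasa_nlu.config import RasaNLUModelConfig  # import path assumed

    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "DucklingHTTPExtractor"}]}
    )
    # restrict duckling to time and number entities; point it at a running server
    _config.set_component_attr(0, dimensions=["time", "number"],
                               url="http://localhost:8000")
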
16 changes: 8 additions & 8 deletions docs/entities.rst
@@ -7,14 +7,14 @@ Entity Extraction
=================


======================= ================ ======================== ===================================
Component Requires Model notes
======================= ================ ======================== ===================================
``CRFEntityExtractor`` sklearn-crfsuite conditional random field good for training custom entities
``SpacyEntityExtractor`` spaCy averaged perceptron provides pre-trained entities
``DucklingHTTPExtractor`` running duckling context-free grammar provides pre-trained entities
``MitieEntityExtractor`` MITIE structured SVM good for training custom entities
======================= ================ ======================== ===================================
========================= ================ ======================== =================================
Component Requires Model Notes
========================= ================ ======================== =================================
``CRFEntityExtractor`` sklearn-crfsuite conditional random field good for training custom entities
``SpacyEntityExtractor`` spaCy averaged perceptron provides pre-trained entities
``DucklingHTTPExtractor`` running duckling context-free grammar provides pre-trained entities
``MitieEntityExtractor`` MITIE structured SVM good for training custom entities
========================= ================ ======================== =================================


Custom Entities
8 changes: 7 additions & 1 deletion docs/persist.rst
@@ -55,7 +55,13 @@ Rasa NLU supports using `S3 <https://aws.amazon.com/s3/>`_ and

If there is no container with the name ``AZURE_CONTAINER`` Rasa will create it.

Models are gzipped before saving to cloud.
Models are gzipped before they are saved in the cloud. The gzipped file follows the naming convention
`{PROJECT}___{MODEL_NAME}.tar.gz` and is stored in the root folder of the storage service.
Currently, you cannot manually specify the path on the cloud storage.

When storing a trained model, Rasa NLU gzips the new model and uploads it to the container. When retrieving or loading
a model from cloud storage, Rasa NLU downloads the gzipped model locally and extracts its contents to the location
specified by the `--path` flag.
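
For example, the resulting object key for a hypothetical project and model name would look like this (the names are placeholders, not taken from the docs):

.. code-block:: python

    project = "default"                   # placeholder project name
    model_name = "model_20190304-123456"  # placeholder model name

    # gzipped models live at the root of the bucket/container
    key = "{}___{}.tar.gz".format(project, model_name)
    # -> "default___model_20190304-123456.tar.gz"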


.. include:: feedback.inc
11 changes: 11 additions & 0 deletions rasa_nlu/extractors/__init__.py
@@ -22,6 +22,17 @@ def add_processor_name(self,

        return entity

    @staticmethod
    def filter_irrelevant_entities(extracted, requested_dimensions):
        """Only return dimensions the user configured"""

        if requested_dimensions:
            return [entity
                    for entity in extracted
                    if entity["entity"] in requested_dimensions]
        else:
            return extracted
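
A quick usage sketch of the new helper (entity values are illustrative; ``EntityExtractor`` is the base class this method is defined on):

.. code-block:: python

    extracted = [{"entity": "time", "value": "2019-02-28T00:00:00.000+01:00"},
                 {"entity": "number", "value": 5}]

    # only entities whose type is among the requested dimensions survive
    EntityExtractor.filter_irrelevant_entities(extracted, ["number"])
    # -> [{"entity": "number", "value": 5}]

    # with no dimensions configured, everything is passed through unchanged
    EntityExtractor.filter_irrelevant_entities(extracted, None)
    # -> both entities are returned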

    @staticmethod
    def find_entity(ent, text, tokens):
        offsets = [token.offset for token in tokens]
16 changes: 3 additions & 13 deletions rasa_nlu/extractors/duckling_http_extractor.py
@@ -24,17 +24,6 @@ def extract_value(match):
    return value


def filter_irrelevant_matches(matches, requested_dimensions):
    """Only return dimensions the user configured"""

    if requested_dimensions:
        return [match
                for match in matches
                if match["dim"] in requested_dimensions]
    else:
        return matches


def convert_duckling_format_to_rasa(matches):
    extracted = []

@@ -158,9 +147,10 @@ def process(self, message: Message, **kwargs: Any) -> None:
        if self._url() is not None:
            reference_time = self._reference_time_from_message(message)
            matches = self._duckling_parse(message.text, reference_time)
            all_extracted = convert_duckling_format_to_rasa(matches)
            dimensions = self.component_config["dimensions"]
            relevant_matches = filter_irrelevant_matches(matches, dimensions)
            extracted = convert_duckling_format_to_rasa(relevant_matches)
            extracted = DucklingHTTPExtractor.filter_irrelevant_entities(
                all_extracted, dimensions)
        else:
            extracted = []
            logger.warning("Duckling HTTP component in pipeline, but no "
15 changes: 14 additions & 1 deletion rasa_nlu/extractors/spacy_entity_extractor.py
@@ -14,12 +14,25 @@ class SpacyEntityExtractor(EntityExtractor):

    requires = ["spacy_nlp"]

    defaults = {
        # by default all dimensions recognized by spacy are returned
        # dimensions can be configured to contain an array of strings
        # with the names of the dimensions to filter for
        "dimensions": None,
    }

    def __init__(self, component_config: Text = None) -> None:
        super(SpacyEntityExtractor, self).__init__(component_config)

    def process(self, message: Message, **kwargs: Any) -> None:
        # can't use the existing doc here (spacy_doc on the message)
        # because tokens are lower cased which is bad for NER
        spacy_nlp = kwargs.get("spacy_nlp", None)
        doc = spacy_nlp(message.text)
        extracted = self.add_extractor_name(self.extract_entities(doc))
        all_extracted = self.add_extractor_name(self.extract_entities(doc))
        dimensions = self.component_config["dimensions"]
        extracted = SpacyEntityExtractor.filter_irrelevant_entities(
            all_extracted, dimensions)
        message.set("entities",
                    message.get("entities", []) + extracted,
                    add_to_output=True)
55 changes: 52 additions & 3 deletions tests/base/test_extractors.py
@@ -189,6 +189,33 @@ def test_duckling_entity_extractor(component_builder):
assert entities[0]["text"] == "tomorrow"
assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"

    # Test dimension filtering includes only specified dimensions
    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "DucklingHTTPExtractor"}]}
    )
    _config.set_component_attr(0, dimensions=["number"],
                               url="http://localhost:8000")
    ducklingNumber = component_builder.create_component(
        _config.for_component(0),
        _config)
    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"Yesterday","start":0,"value":{"values":[{
            "value":"2019-02-28T00:00:00.000+01:00","grain":"day",
            "type":"value"}],"value":"2019-02-28T00:00:00.000+01:00",
            "grain":"day","type":"value"},"end":9,"dim":"time"},
            {"body":"5","start":21,"value":{"value":5,"type":"value"},
            "end":22,"dim":"number"}]"""
    )

    message = Message("Yesterday there were 5 people in a room")
    ducklingNumber.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "5"
    assert entities[0]["value"] == 5


def test_duckling_entity_extractor_and_synonyms(component_builder):
    _config = RasaNLUModelConfig(
@@ -234,13 +261,14 @@ def test_unintentional_synonyms_capitalized(component_builder):
    assert ner_syn.synonyms.get("tacos") == "Mexican"


def test_spacy_ner_extractor(spacy_nlp):
    ext = SpacyEntityExtractor()
def test_spacy_ner_extractor(component_builder, spacy_nlp):
    _config = RasaNLUModelConfig({"pipeline":
                                  [{"name": "SpacyEntityExtractor"}]})
    ext = component_builder.create_component(_config.for_component(0), _config)
    example = Message("anywhere in the West", {
        "intent": "restaurant_search",
        "entities": [],
        "spacy_doc": spacy_nlp("anywhere in the west")})

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
@@ -251,3 +279,24 @@ def test_spacy_ner_extractor(spacy_nlp):
        'value': 'West',
        'entity': 'LOC',
        'confidence': None}

    # Test dimension filtering includes only specified dimensions

    example = Message("anywhere in the West with Sebastian Thrun", {
        "intent": "example_intent",
        "entities": [],
        "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun")})
    _config = RasaNLUModelConfig({"pipeline":
                                  [{"name": "SpacyEntityExtractor"}]})
    _config.set_component_attr(0, dimensions=["PERSON"])
    ext = component_builder.create_component(_config.for_component(0), _config)
    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 26,
        'extractor': 'SpacyEntityExtractor',
        'end': 41,
        'value': 'Sebastian Thrun',
        'entity': 'PERSON',
        'confidence': None}
