
Commit

Fixes after review
ClemDoum committed May 20, 2019
1 parent c3383a6 commit 45f4fd4
Showing 6 changed files with 22 additions and 48 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 ### Changed
 - Re-score ambiguous `DeterministicIntentParser` results based on slots [#791](https://github.com/snipsco/snips-nlu/pull/791)
 - Accept ambiguous results from `DeterministicIntentParser` when confidence score is above 0.5 [#797](https://github.com/snipsco/snips-nlu/pull/797)
+- Moved the NLU random state from the config to the shared resources [#801](https://github.com/snipsco/snips-nlu/pull/801)
+- Bumped `scikit-learn` to `>=0.21,<0.22` for `python>=3.5` and `>=0.20,<0.21` for `python<3.5` [#801](https://github.com/snipsco/snips-nlu/pull/801)
+
+### Fixed
+- Fixed a couple of bugs in the data augmentation which were making the NLU training non-deterministic [#801](https://github.com/snipsco/snips-nlu/pull/801)

 ## [0.19.6]
 ### Fixed
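The "Moved the NLU random state from the config to the shared resources" entry above means, at the API level, that processing units now take the random state directly at construction time rather than through a config object. The following is only a hypothetical usage sketch based on the updated tests in this commit (the import path is an assumption), not code taken from the repository:

from snips_nlu.intent_classifier import LogRegIntentClassifier

# The seed is passed straight to the unit as part of the shared resources;
# no config object needs to be built just to fix the randomness.
classifier = LogRegIntentClassifier(random_state=42)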
10 changes: 5 additions & 5 deletions docs/source/tutorial.rst
@@ -174,12 +174,12 @@ the dataset we generated earlier:
 engine.fit(dataset)

-Note that by default, the training of the engine is non-deterministic: if you
-train your NLU twice on the same data and test it on the same input, you'll get
-different outputs.
+Note that, by default, training of the NLU engine is non-deterministic:
+training and testing multiple times on the same data may produce different
+outputs.

-If you want to run training in a reproducible way you can pass a random seed to
-your engine:
+Reproducible trainings can be achieved by passing a **random seed** to the
+engine:

 .. code-block:: python
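A minimal sketch of the kind of snippet this code block introduces, assuming the `SnipsNLUEngine` constructor accepts a `random_state` keyword (in line with the processing units updated in this commit) and reusing the `dataset` built earlier in the tutorial, might look like:

from snips_nlu import SnipsNLUEngine
from snips_nlu.default_configs import CONFIG_EN

seed = 42
engine = SnipsNLUEngine(config=CONFIG_EN, random_state=seed)
# With a fixed seed, fitting twice on the same dataset yields the same engine.
engine.fit(dataset)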
2 changes: 0 additions & 2 deletions snips_nlu/intent_classifier/log_reg_classifier_utils.py
@@ -43,10 +43,8 @@ def get_regularization_factor(dataset):

 def get_noise_it(noise, mean_length, std_length, random_state):
     it = itertools.cycle(noise)
-    i = 0
     while True:
         noise_length = int(random_state.normal(mean_length, std_length))
-        i += 1
         # pylint: disable=stop-iteration-return
         yield " ".join(next(it) for _ in range(noise_length))
         # pylint: enable=stop-iteration-return
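A small usage sketch of the `get_noise_it` generator shown above (a hypothetical example, not part of this commit): two iterators built from identically seeded random states yield the same noise utterances, which is the property the determinism fix relies on.

import numpy as np

from snips_nlu.intent_classifier.log_reg_classifier_utils import get_noise_it

noise = ["foo", "bar", "baz"]
it1 = get_noise_it(noise, mean_length=3, std_length=1,
                   random_state=np.random.RandomState(42))
it2 = get_noise_it(noise, mean_length=3, std_length=1,
                   random_state=np.random.RandomState(42))
# Identical seeds produce identical noise sequences.
assert next(it1) == next(it2)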
12 changes: 4 additions & 8 deletions snips_nlu/tests/test_crf_slot_filler.py
@@ -35,10 +35,9 @@ def test_should_get_slots(self):
 - make me [number_of_cups:snips/number](five) cups of tea
 - please I want [number_of_cups](two) cups of tea""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = CRFSlotFillerConfig()
         shared = self.get_shared_data(dataset)
         shared[RANDOM_STATE] = 42
-        slot_filler = CRFSlotFiller(config, **shared)
+        slot_filler = CRFSlotFiller(**shared)
         intent = "MakeTea"
         slot_filler.fit(dataset, intent)

@@ -66,11 +65,10 @@ def test_should_get_builtin_slots(self):
 - Can you tell me the weather [datetime] please ?
 - what is the weather forecast [datetime] in [location](paris)""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = CRFSlotFillerConfig()
         intent = "GetWeather"
         shared = self.get_shared_data(dataset)
         shared[RANDOM_STATE] = 42
-        slot_filler = CRFSlotFiller(config, **shared)
+        slot_filler = CRFSlotFiller(**shared)
         slot_filler.fit(dataset, intent)

         # When
@@ -104,11 +102,10 @@ def test_should_get_sub_builtin_slots(self):
 - find an activity from [start](6pm) to [end](8pm)
 - Book me a trip from [start](this friday) to [end](next tuesday)""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = CRFSlotFillerConfig()
         intent = "PlanBreak"
         shared = self.get_shared_data(dataset)
         shared[RANDOM_STATE] = 42
-        slot_filler = CRFSlotFiller(config, **shared)
+        slot_filler = CRFSlotFiller(**shared)
         slot_filler.fit(dataset, intent)

         # When
@@ -360,11 +357,10 @@ def test_should_get_slots_after_deserialization(self):
 - i want [number_of_cups] cups of tea please
 - can you prepare [number_of_cups] cups of tea ?""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = CRFSlotFillerConfig()
         intent = "MakeTea"
         shared = self.get_shared_data(dataset)
         shared[RANDOM_STATE] = 42
-        slot_filler = CRFSlotFiller(config, **shared)
+        slot_filler = CRFSlotFiller(**shared)
         slot_filler.fit(dataset, intent)
         slot_filler.persist(self.tmp_file_path)

12 changes: 3 additions & 9 deletions snips_nlu/tests/test_log_reg_intent_classifier.py
@@ -50,9 +50,7 @@ def test_should_get_intent(self):
 - does it rain
 - will it rain tomorrow""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = LogRegIntentClassifierConfig()
-        classifier = LogRegIntentClassifier(
-            config, random_state=42).fit(dataset)
+        classifier = LogRegIntentClassifier(random_state=42).fit(dataset)
         text = "hey how are you doing ?"

         # When
@@ -109,9 +107,7 @@ def test_should_get_intent_when_filter(self):
 - brew two cups of coffee
 - can you prepare one cup of coffee""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = LogRegIntentClassifierConfig()
-        classifier = LogRegIntentClassifier(
-            config, random_state=42).fit(dataset)
+        classifier = LogRegIntentClassifier(random_state=42).fit(dataset)

         # When
         text1 = "Make me two cups of tea"
@@ -171,9 +167,7 @@ def test_should_get_intents(self):
 utterances:
 - yili yulu yele""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = LogRegIntentClassifierConfig()
-        classifier = LogRegIntentClassifier(
-            config, random_state=42).fit(dataset)
+        classifier = LogRegIntentClassifier(random_state=42).fit(dataset)
         text = "yala yili yulu"

         # When
29 changes: 5 additions & 24 deletions snips_nlu/tests/test_probabilistic_intent_parser.py
@@ -43,13 +43,9 @@ def test_should_parse(self):
 utterances:
 - foz for [slot3:entity3](baz)""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        classifier_config = LogRegIntentClassifierConfig()
-        slot_filler_config = CRFSlotFillerConfig()
-        parser_config = ProbabilisticIntentParserConfig(
-            classifier_config, slot_filler_config)
         shared = self.get_shared_data(dataset)
         shared[RANDOM_STATE] = 42
-        parser = ProbabilisticIntentParser(parser_config, **shared)
+        parser = ProbabilisticIntentParser(**shared)
         parser.fit(dataset)
         text = "foo bar baz"

@@ -84,13 +80,9 @@ def test_should_parse_with_filter(self):
 utterances:
 - foz for [slot3:entity3](baz)""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        classifier_config = LogRegIntentClassifierConfig()
-        slot_filler_config = CRFSlotFillerConfig()
-        parser_config = ProbabilisticIntentParserConfig(
-            classifier_config, slot_filler_config)
         shared = self.get_shared_data(dataset)
         shared[RANDOM_STATE] = 42
-        parser = ProbabilisticIntentParser(parser_config, **shared)
+        parser = ProbabilisticIntentParser(**shared)
         parser.fit(dataset)
         text = "foo bar baz"

@@ -126,13 +118,9 @@ def test_should_parse_top_intents(self):
 utterances:
 - foz for [entity3](baz)""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        classifier_config = LogRegIntentClassifierConfig()
-        slot_filler_config = CRFSlotFillerConfig()
-        parser_config = ProbabilisticIntentParserConfig(
-            classifier_config, slot_filler_config)
         shared = self.get_shared_data(dataset)
         shared[RANDOM_STATE] = 42
-        parser = ProbabilisticIntentParser(parser_config, **shared)
+        parser = ProbabilisticIntentParser(**shared)
         parser.fit(dataset)
         text = "foo bar baz"

@@ -169,12 +157,9 @@ def test_should_get_intents(self):
 utterances:
 - yili yulu yele""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        classifier_config = LogRegIntentClassifierConfig()
-        parser_config = ProbabilisticIntentParserConfig(classifier_config)
         shared = self.get_shared_data(dataset)
         shared[RANDOM_STATE] = 42
-        parser = ProbabilisticIntentParser(
-            parser_config, **shared).fit(dataset)
+        parser = ProbabilisticIntentParser(**shared).fit(dataset)
         text = "yala yili yulu"

         # When
@@ -689,13 +674,9 @@ def test_fitting_should_be_reproducible_after_serialization(self):
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

         seed = 666
-        config = ProbabilisticIntentParserConfig(
-            intent_classifier_config=LogRegIntentClassifierConfig(),
-            slot_filler_config=CRFSlotFillerConfig()
-        )
         shared = self.get_shared_data(dataset)
         shared[RANDOM_STATE] = seed
-        parser = ProbabilisticIntentParser(config, **shared)
+        parser = ProbabilisticIntentParser(**shared)
         parser.persist(self.tmp_file_path)

         # When
