diff --git a/CHANGELOG.md b/CHANGELOG.md
index ad50985f1..45d10b2b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 ### Changed
 - Re-score ambiguous `DeterministicIntentParser` results based on slots [#791](https://github.com/snipsco/snips-nlu/pull/791)
 - Accept ambiguous results from `DeterministicIntentParser` when confidence score is above 0.5 [#797](https://github.com/snipsco/snips-nlu/pull/797)
+- Moved the NLU random state from the config to the shared resources [#801](https://github.com/snipsco/snips-nlu/pull/801)
+- Bumped `scikit-learn` to `>=0.21,<0.22` for `python>=3.5` and `>=0.20,<0.21` for `python<3.5` [#801](https://github.com/snipsco/snips-nlu/pull/801)
+
+### Fixed
+- Fixed a couple of bugs in the data augmentation which were making the NLU training non-deterministic [#801](https://github.com/snipsco/snips-nlu/pull/801)
 
 ## [0.19.6]
 ### Fixed
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index 4351a4d3a..8a33a8c14 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -174,6 +174,26 @@ the dataset we generated earlier:
 
     engine.fit(dataset)
 
+Note that, by default, training of the NLU engine is non-deterministic:
+training multiple times on the same data may produce different
+outputs.
+
+Reproducible trainings can be achieved by passing a **random seed** to the
+engine:
+
+.. code-block:: python
+
+    seed = 42
+    engine = SnipsNLUEngine(config=CONFIG_EN, random_state=seed)
+    engine.fit(dataset)
+
+
+.. note::
+
+    Due to a ``scikit-learn`` bug fixed in version ``0.21``, we can't
+    guarantee deterministic behavior on Python ``<3.5``, since
+    ``scikit-learn>=0.21`` is only available starting from Python ``3.5``.
+
 
 Parsing
 -------
diff --git a/setup.py b/setup.py
index cd4a25cd8..61cdebf93 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,8 @@
     "future>=0.16,<0.17",
     "numpy>=1.15,<1.16",
     "scipy>=1.0,<2.0",
-    "scikit-learn>=0.19,<0.20",
+    "scikit-learn>=0.21.1,<0.22; python_version>='3.5'",
+    "scikit-learn>=0.20,<0.21; python_version<'3.5'",
     "sklearn-crfsuite>=0.3.6,<0.4",
     "semantic_version>=2.6,<3.0",
     "snips-nlu-utils>=0.8,<0.9",
@@ -29,10 +30,10 @@
     "num2words>=0.5.6,<0.6",
     "plac>=0.9.6,<1.0",
     "requests>=2.0,<3.0",
-    "pathlib==1.0.1; python_version < '3.4'",
+    "pathlib==1.0.1; python_version<'3.4'",
     "pyaml>=17,<18",
     "deprecation>=2,<3",
-    "funcsigs>=1.0,<2.0; python_version < '3.4'"
+    "funcsigs>=1.0,<2.0; python_version<'3.4'"
 ]
 
 extras_require = {
diff --git a/snips_nlu/constants.py b/snips_nlu/constants.py
index f37e48ac4..1961a4b85 100644
--- a/snips_nlu/constants.py
+++ b/snips_nlu/constants.py
@@ -46,6 +46,7 @@
 BUILTIN_ENTITY_PARSER = "builtin_entity_parser"
 CUSTOM_ENTITY_PARSER = "custom_entity_parser"
 MATCHING_STRICTNESS = "matching_strictness"
+RANDOM_STATE = "random_state"
 
 # resources
 RESOURCES = "resources"
diff --git a/snips_nlu/data_augmentation.py b/snips_nlu/data_augmentation.py
index 68ff15d86..5bee6e509 100644
--- a/snips_nlu/data_augmentation.py
+++ b/snips_nlu/data_augmentation.py
@@ -69,10 +69,10 @@ def get_entities_iterators(intent_entities, language,
                            add_builtin_entities_examples, random_state):
     entities_its = dict()
     for entity_name, entity in iteritems(intent_entities):
-        utterance_values = random_state.permutation(list(entity[UTTERANCES]))
+        utterance_values = random_state.permutation(sorted(entity[UTTERANCES]))
         if add_builtin_entities_examples and is_builtin_entity(entity_name):
-            entity_examples = get_builtin_entity_examples(entity_name,
-                                                          language)
+            entity_examples = get_builtin_entity_examples(
+                entity_name, language)
             # Builtin entity examples must be kept first in the iterator to
             # ensure that they are used when augmenting data
             iterator_values = entity_examples + list(utterance_values)
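Worth spelling out why the `sorted()` change above fixes one of the non-determinism bugs: `entity[UTTERANCES]` is an unordered mapping, so `list()` enumerates it in an order that can change between interpreter runs (string hashing is randomized), and even a seeded `permutation` of an unstable input is unstable. Sorting first gives the seeded shuffle a stable input. A minimal sketch, using a hypothetical utterance set in place of `entity[UTTERANCES]`:

```python
import numpy as np

# Hypothetical stand-in for entity[UTTERANCES]
utterances = {"espresso", "latte", "cappuccino"}

random_state = np.random.RandomState(42)

# Fragile: list(utterances) depends on hash order, which can vary across
# interpreter runs, so the seeded permutation varies too:
#   shuffled = random_state.permutation(list(utterances))

# Stable: sorting fixes the input order, so the same seed always yields
# the same permutation
shuffled = random_state.permutation(sorted(utterances))
print(shuffled)  # deterministic for a given seed
```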
diff --git a/snips_nlu/default_configs/config_de.py b/snips_nlu/default_configs/config_de.py
index 1644a9163..e104a3f2f 100644
--- a/snips_nlu/default_configs/config_de.py
+++ b/snips_nlu/default_configs/config_de.py
@@ -111,8 +111,7 @@
                     "min_utterances": 200,
                     "capitalization_ratio": 0.2,
                     "add_builtin_entities_examples": True
-                },
-                "random_seed": None
+                }
             },
             "intent_classifier_config": {
                 "unit_name": "log_reg_intent_classifier",
@@ -140,8 +139,7 @@
                         "unknown_words_replacement_string": None,
                         "keep_order": True
                     }
-                },
-                "random_seed": None
+                }
             }
         }
     ]
diff --git a/snips_nlu/default_configs/config_en.py b/snips_nlu/default_configs/config_en.py
index ac6a66d1e..9db80a4a0 100644
--- a/snips_nlu/default_configs/config_en.py
+++ b/snips_nlu/default_configs/config_en.py
@@ -97,8 +97,7 @@
                     "min_utterances": 200,
                     "capitalization_ratio": 0.2,
                     "add_builtin_entities_examples": True
-                },
-                "random_seed": None
+                }
             },
             "intent_classifier_config": {
                 "unit_name": "log_reg_intent_classifier",
@@ -126,8 +125,7 @@
                         "unknown_words_replacement_string": None,
                         "keep_order": True
                     }
-                },
-                "random_seed": None
+                }
             }
         }
     ]
diff --git a/snips_nlu/default_configs/config_es.py b/snips_nlu/default_configs/config_es.py
index df9c25f15..efd5cce34 100644
--- a/snips_nlu/default_configs/config_es.py
+++ b/snips_nlu/default_configs/config_es.py
@@ -90,7 +90,7 @@
                     "capitalization_ratio": 0.2,
                     "add_builtin_entities_examples": True
                 },
-                "random_seed": None
+
             },
             "intent_classifier_config": {
                 "unit_name": "log_reg_intent_classifier",
@@ -118,8 +118,7 @@
                         "unknown_words_replacement_string": None,
                         "keep_order": True
                     }
-                },
-                "random_seed": None
+                }
             }
         }
     ]
diff --git a/snips_nlu/default_configs/config_fr.py b/snips_nlu/default_configs/config_fr.py
index df9c25f15..ddb48a3df 100644
--- a/snips_nlu/default_configs/config_fr.py
+++ b/snips_nlu/default_configs/config_fr.py
@@ -89,8 +89,7 @@
                     "min_utterances": 200,
                     "capitalization_ratio": 0.2,
                     "add_builtin_entities_examples": True
-                },
-                "random_seed": None
+                }
             },
             "intent_classifier_config": {
                 "unit_name": "log_reg_intent_classifier",
@@ -118,8 +117,7 @@
                         "unknown_words_replacement_string": None,
                         "keep_order": True
                     }
-                },
-                "random_seed": None
+                }
             }
         }
     ]
diff --git a/snips_nlu/default_configs/config_it.py b/snips_nlu/default_configs/config_it.py
index df9c25f15..ddb48a3df 100644
--- a/snips_nlu/default_configs/config_it.py
+++ b/snips_nlu/default_configs/config_it.py
@@ -89,8 +89,7 @@
                     "min_utterances": 200,
                     "capitalization_ratio": 0.2,
                     "add_builtin_entities_examples": True
-                },
-                "random_seed": None
+                }
             },
             "intent_classifier_config": {
                 "unit_name": "log_reg_intent_classifier",
@@ -118,8 +117,7 @@
                         "unknown_words_replacement_string": None,
                         "keep_order": True
                     }
-                },
-                "random_seed": None
+                }
             }
         }
     ]
diff --git a/snips_nlu/default_configs/config_ja.py b/snips_nlu/default_configs/config_ja.py
index f1cbf202f..d415961ab 100644
--- a/snips_nlu/default_configs/config_ja.py
+++ b/snips_nlu/default_configs/config_ja.py
@@ -116,7 +116,7 @@
                     "capitalization_ratio": 0.2,
                     "add_builtin_entities_examples": True
                 },
-                "random_seed": None
+
             },
             "intent_classifier_config": {
                 "unit_name": "log_reg_intent_classifier",
@@ -144,8 +144,7 @@
                         "unknown_words_replacement_string": None,
                         "keep_order": True
                     }
-                },
-                "random_seed": None
+                }
             }
         }
     ]
diff --git a/snips_nlu/default_configs/config_ko.py b/snips_nlu/default_configs/config_ko.py
index fe4661c3f..ec21d6501 100644
--- a/snips_nlu/default_configs/config_ko.py
+++ b/snips_nlu/default_configs/config_ko.py
@@ -107,8 +107,7 @@
                     "min_utterances": 200,
                     "capitalization_ratio": 0.2,
                     "add_builtin_entities_examples": True
-                },
-                "random_seed": None
+                }
             },
             "intent_classifier_config": {
                 "unit_name": "log_reg_intent_classifier",
@@ -136,8 +135,7 @@
                         "unknown_words_replacement_string": None,
                         "keep_order": True
                     }
-                },
-                "random_seed": None
+                }
             }
         }
     ]
diff --git a/snips_nlu/intent_classifier/featurizer.py b/snips_nlu/intent_classifier/featurizer.py
index b5458d0d8..a2785d445 100644
--- a/snips_nlu/intent_classifier/featurizer.py
+++ b/snips_nlu/intent_classifier/featurizer.py
@@ -109,7 +109,9 @@ def _fit_transform_tfidf_vectorizer(self, x, y, dataset):
             config=self.config.tfidf_vectorizer_config,
             builtin_entity_parser=self.builtin_entity_parser,
             custom_entity_parser=self.custom_entity_parser,
-            resources=self.resources)
+            resources=self.resources,
+            random_state=self.random_state,
+        )
         x_tfidf = self.tfidf_vectorizer.fit_transform(x, dataset)
 
         if not self.tfidf_vectorizer.vocabulary:
@@ -139,7 +141,9 @@ def _fit_cooccurrence_vectorizer(self, x, classes, none_class, dataset):
             config=self.config.cooccurrence_vectorizer_config,
             builtin_entity_parser=self.builtin_entity_parser,
             custom_entity_parser=self.custom_entity_parser,
-            resources=self.resources)
+            resources=self.resources,
+            random_state=self.random_state,
+        )
         x_cooccurrence = self.cooccurrence_vectorizer.fit(
             non_null_x, dataset).transform(x)
         if not self.cooccurrence_vectorizer.word_pairs:
diff --git a/snips_nlu/intent_classifier/log_reg_classifier.py b/snips_nlu/intent_classifier/log_reg_classifier.py
index 5eaff50e8..b74ab899d 100644
--- a/snips_nlu/intent_classifier/log_reg_classifier.py
+++ b/snips_nlu/intent_classifier/log_reg_classifier.py
@@ -10,11 +10,10 @@
 from snips_nlu.common.log_utils import DifferedLoggingMessage, log_elapsed_time
 from snips_nlu.common.utils import (
-    check_persisted_path, check_random_state,
-    fitted_required, json_string)
+    check_persisted_path, fitted_required, json_string)
 from snips_nlu.constants import LANGUAGE, RES_PROBA
 from snips_nlu.dataset import validate_and_format_dataset
-from snips_nlu.exceptions import _EmptyDatasetUtterancesError, LoadingError
+from snips_nlu.exceptions import LoadingError, _EmptyDatasetUtterancesError
 from snips_nlu.intent_classifier.featurizer import Featurizer
 from snips_nlu.intent_classifier.intent_classifier import IntentClassifier
 from snips_nlu.intent_classifier.log_reg_classifier_utils import (
@@ -24,11 +23,20 @@
 
 logger = logging.getLogger(__name__)
 
+# We set tol to 1e-3 to silence the following warning with Python 2
+# (scikit-learn 0.20):
+#
+# FutureWarning: max_iter and tol parameters have been added in SGDClassifier
+# in 0.19. If max_iter is set but tol is left unset, the default value for tol
+# in 0.19 and 0.20 will be None (which is equivalent to -infinity, so it has
+# no effect) but will change in 0.21 to 1e-3. Specify tol to silence this
+# warning.
+
 LOG_REG_ARGS = {
     "loss": "log",
     "penalty": "l2",
     "class_weight": "balanced",
-    "max_iter": 5,
+    "max_iter": 1000,
+    "tol": 1e-3,
     "n_jobs": -1
 }
 
@@ -66,12 +74,11 @@ def fit(self, dataset):
         self.fit_builtin_entity_parser_if_needed(dataset)
         self.fit_custom_entity_parser_if_needed(dataset)
         language = dataset[LANGUAGE]
-        random_state = check_random_state(self.config.random_seed)
 
         data_augmentation_config = self.config.data_augmentation_config
         utterances, classes, intent_list = build_training_data(
             dataset, language, data_augmentation_config, self.resources,
-            random_state)
+            self.random_state)
 
         self.intent_list = intent_list
         if len(self.intent_list) <= 1:
@@ -81,7 +88,8 @@ def fit(self, dataset):
             config=self.config.featurizer_config,
             builtin_entity_parser=self.builtin_entity_parser,
             custom_entity_parser=self.custom_entity_parser,
-            resources=self.resources
+            resources=self.resources,
+            random_state=self.random_state,
         )
         self.featurizer.language = language
 
@@ -94,8 +102,8 @@ def fit(self, dataset):
             return self
 
         alpha = get_regularization_factor(dataset)
-        self.classifier = SGDClassifier(random_state=random_state,
-                                        alpha=alpha, **LOG_REG_ARGS)
+        self.classifier = SGDClassifier(
+            random_state=self.random_state, alpha=alpha, **LOG_REG_ARGS)
         self.classifier.fit(x, classes)
         logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
         return self
diff --git a/snips_nlu/intent_parser/probabilistic_intent_parser.py b/snips_nlu/intent_parser/probabilistic_intent_parser.py
index 44196d554..4f2de9413 100644
--- a/snips_nlu/intent_parser/probabilistic_intent_parser.py
+++ b/snips_nlu/intent_parser/probabilistic_intent_parser.py
@@ -69,7 +69,9 @@ def fit(self, dataset, force_retrain=True):
             self.config.intent_classifier_config,
             builtin_entity_parser=self.builtin_entity_parser,
             custom_entity_parser=self.custom_entity_parser,
-            resources=self.resources)
+            resources=self.resources,
+            random_state=self.random_state,
+        )
         if force_retrain or not self.intent_classifier.fitted:
             self.intent_classifier.fit(dataset)
 
@@ -85,7 +87,9 @@ def fit(self, dataset, force_retrain=True):
                 slot_filler_config,
                 builtin_entity_parser=self.builtin_entity_parser,
                 custom_entity_parser=self.custom_entity_parser,
-                resources=self.resources)
+                resources=self.resources,
+                random_state=self.random_state,
+            )
             if force_retrain or not self.slot_fillers[intent_name].fitted:
                 self.slot_fillers[intent_name].fit(dataset, intent_name)
         logger.debug("Fitted slot fillers in %s",
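For context on the `LOG_REG_ARGS` change: `tol=1e-3` makes SGD stop early once the loss stops improving by more than `tol`, and `max_iter=1000` replaces the old hard cap of 5 epochs. Determinism then hinges only on the `random_state` passed at construction. A quick sketch against scikit-learn of that era (`loss="log"` was renamed to `log_loss` in much later releases), using toy data in place of the featurized utterances:

```python
import numpy as np
from sklearn.linear_model import SGDClassifier

LOG_REG_ARGS = {"loss": "log", "penalty": "l2", "class_weight": "balanced",
                "max_iter": 1000, "tol": 1e-3, "n_jobs": -1}

# Toy features/labels standing in for the featurized training data
x = np.random.RandomState(0).rand(20, 5)
y = np.array([0, 1] * 10)

clf_a = SGDClassifier(random_state=42, alpha=1.0, **LOG_REG_ARGS).fit(x, y)
clf_b = SGDClassifier(random_state=42, alpha=1.0, **LOG_REG_ARGS).fit(x, y)

# Same seed + same data -> identical learnt weights
assert np.array_equal(clf_a.coef_, clf_b.coef_)
```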
diff --git a/snips_nlu/nlu_engine/nlu_engine.py b/snips_nlu/nlu_engine/nlu_engine.py
index a46b72d5a..303609858 100644
--- a/snips_nlu/nlu_engine/nlu_engine.py
+++ b/snips_nlu/nlu_engine/nlu_engine.py
@@ -23,8 +23,9 @@
 from snips_nlu.entity_parser import CustomEntityParser
 from snips_nlu.entity_parser.builtin_entity_parser import (
     BuiltinEntityParser, is_builtin_entity)
-from snips_nlu.exceptions import InvalidInputError, IntentNotFoundError, \
-    LoadingError, IncompatibleModelError
+from snips_nlu.exceptions import (
+    InvalidInputError, IntentNotFoundError, LoadingError,
+    IncompatibleModelError)
 from snips_nlu.intent_parser import IntentParser
 from snips_nlu.pipeline.configs import NLUEngineConfig
 from snips_nlu.pipeline.processing_unit import ProcessingUnit
@@ -117,7 +118,9 @@ def fit(self, dataset, force_retrain=True):
                 parser_config,
                 builtin_entity_parser=self.builtin_entity_parser,
                 custom_entity_parser=self.custom_entity_parser,
-                resources=self.resources)
+                resources=self.resources,
+                random_state=self.random_state,
+            )
             if force_retrain or not recycled_parser.fitted:
                 recycled_parser.fit(dataset, force_retrain)
diff --git a/snips_nlu/pipeline/configs/intent_classifier.py b/snips_nlu/pipeline/configs/intent_classifier.py
index 58ff50ab6..5b40a0824 100644
--- a/snips_nlu/pipeline/configs/intent_classifier.py
+++ b/snips_nlu/pipeline/configs/intent_classifier.py
@@ -13,16 +13,13 @@ class LogRegIntentClassifierConfig(FromDict, ProcessingUnitConfig):
     """Configuration of a :class:`.LogRegIntentClassifier`"""
 
     # pylint: disable=line-too-long
-    def __init__(self, data_augmentation_config=None, featurizer_config=None,
-                 random_seed=None):
+    def __init__(self, data_augmentation_config=None, featurizer_config=None):
         """
         Args:
             data_augmentation_config (:class:`IntentClassifierDataAugmentationConfig`):
                 Defines the strategy of the underlying data augmentation
             featurizer_config (:class:`FeaturizerConfig`): Configuration of the
                 :class:`.Featurizer` used underneath
-            random_seed (int, optional): Allows to fix the seed ot have
-                reproducible trainings
         """
         if data_augmentation_config is None:
             data_augmentation_config = IntentClassifierDataAugmentationConfig()
@@ -32,7 +29,6 @@ def __init__(self, data_augmentation_config=None, featurizer_config=None,
         self.data_augmentation_config = data_augmentation_config
         self._featurizer_config = None
         self.featurizer_config = featurizer_config
-        self.random_seed = random_seed
 
     # pylint: enable=line-too-long
 
@@ -83,8 +79,7 @@ def to_dict(self):
             "unit_name": self.unit_name,
             "data_augmentation_config":
                 self.data_augmentation_config.to_dict(),
-            "featurizer_config": self.featurizer_config.to_dict(),
-            "random_seed": self.random_seed
+            "featurizer_config": self.featurizer_config.to_dict()
         }
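The net effect of these config changes: `random_seed` disappears from the per-unit configs and becomes a shared construction argument, alongside `resources` and the entity parsers. A sketch of both styles (the old one no longer works after this PR; the imports mirror those used in the tests below):

```python
from snips_nlu.intent_classifier import LogRegIntentClassifier
from snips_nlu.slot_filler.crf_slot_filler import CRFSlotFiller

# Before this PR, the seed lived in each unit's config, e.g.:
#   LogRegIntentClassifier(LogRegIntentClassifierConfig(random_seed=42))
# After it, the seed is a shared keyword argument at construction time:
classifier = LogRegIntentClassifier(random_state=42)
slot_filler = CRFSlotFiller(random_state=42)

# Each unit derives its own numpy RandomState from the same shared value
print(type(classifier.random_state))  # numpy.random.mtrand.RandomState
```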
""" - def __init__(self, intent_parsers_configs=None): + def __init__(self, intent_parsers_configs=None, random_seed=None): from snips_nlu.intent_parser import IntentParser if intent_parsers_configs is None: @@ -29,6 +29,7 @@ def __init__(self, intent_parsers_configs=None): ] self.intent_parsers_configs = [ IntentParser.get_config(conf) for conf in intent_parsers_configs] + self.random_seed = random_seed @property def unit_name(self): diff --git a/snips_nlu/pipeline/configs/slot_filler.py b/snips_nlu/pipeline/configs/slot_filler.py index 411948a82..dfbecfc76 100644 --- a/snips_nlu/pipeline/configs/slot_filler.py +++ b/snips_nlu/pipeline/configs/slot_filler.py @@ -30,7 +30,7 @@ class CRFSlotFillerConfig(FromDict, ProcessingUnitConfig): def __init__(self, feature_factory_configs=None, tagging_scheme=None, crf_args=None, - data_augmentation_config=None, random_seed=None): + data_augmentation_config=None): if tagging_scheme is None: from snips_nlu.slot_filler.crf_utils import TaggingScheme tagging_scheme = TaggingScheme.BIO @@ -46,7 +46,6 @@ def __init__(self, feature_factory_configs=None, self.crf_args = crf_args self._data_augmentation_config = None self.data_augmentation_config = data_augmentation_config - self.random_seed = random_seed @property def tagging_scheme(self): @@ -102,8 +101,7 @@ def to_dict(self): "crf_args": self.crf_args, "tagging_scheme": self.tagging_scheme.value, "data_augmentation_config": - self.data_augmentation_config.to_dict(), - "random_seed": self.random_seed + self.data_augmentation_config.to_dict() } diff --git a/snips_nlu/pipeline/processing_unit.py b/snips_nlu/pipeline/processing_unit.py index 42b34e36d..72abffd63 100644 --- a/snips_nlu/pipeline/processing_unit.py +++ b/snips_nlu/pipeline/processing_unit.py @@ -13,10 +13,10 @@ from snips_nlu.common.io_utils import temp_dir, unzip_archive from snips_nlu.common.registrable import Registrable from snips_nlu.common.utils import ( - json_string) + json_string, check_random_state) from snips_nlu.constants import ( BUILTIN_ENTITY_PARSER, CUSTOM_ENTITY_PARSER, CUSTOM_ENTITY_PARSER_USAGE, - RESOURCES, LANGUAGE) + RESOURCES, LANGUAGE, RANDOM_STATE) from snips_nlu.entity_parser import ( BuiltinEntityParser, CustomEntityParser, CustomEntityParserUsage) from snips_nlu.exceptions import LoadingError @@ -49,6 +49,7 @@ def __init__(self, config, **shared): self.builtin_entity_parser = shared.get(BUILTIN_ENTITY_PARSER) self.custom_entity_parser = shared.get(CUSTOM_ENTITY_PARSER) self.resources = shared.get(RESOURCES) + self.random_state = check_random_state(shared.get(RANDOM_STATE)) @classproperty def config_type(cls): # pylint:disable=no-self-argument diff --git a/snips_nlu/slot_filler/crf_slot_filler.py b/snips_nlu/slot_filler/crf_slot_filler.py index 123248ef7..2327a63b9 100644 --- a/snips_nlu/slot_filler/crf_slot_filler.py +++ b/snips_nlu/slot_filler/crf_slot_filler.py @@ -18,15 +18,14 @@ from snips_nlu.common.io_utils import mkdir_p from snips_nlu.common.log_utils import DifferedLoggingMessage, log_elapsed_time from snips_nlu.common.utils import ( - check_persisted_path, - check_random_state, fitted_required, json_string) -from snips_nlu.constants import ( - DATA, LANGUAGE) + check_persisted_path, fitted_required, json_string) +from snips_nlu.constants import DATA, LANGUAGE from snips_nlu.data_augmentation import augment_utterances from snips_nlu.dataset import validate_and_format_dataset from snips_nlu.exceptions import LoadingError from snips_nlu.pipeline.configs import CRFSlotFillerConfig from snips_nlu.preprocessing 
diff --git a/snips_nlu/slot_filler/crf_slot_filler.py b/snips_nlu/slot_filler/crf_slot_filler.py
index 123248ef7..2327a63b9 100644
--- a/snips_nlu/slot_filler/crf_slot_filler.py
+++ b/snips_nlu/slot_filler/crf_slot_filler.py
@@ -18,15 +18,14 @@
 from snips_nlu.common.io_utils import mkdir_p
 from snips_nlu.common.log_utils import DifferedLoggingMessage, log_elapsed_time
 from snips_nlu.common.utils import (
-    check_persisted_path,
-    check_random_state, fitted_required, json_string)
-from snips_nlu.constants import (
-    DATA, LANGUAGE)
+    check_persisted_path, fitted_required, json_string)
+from snips_nlu.constants import DATA, LANGUAGE
 from snips_nlu.data_augmentation import augment_utterances
 from snips_nlu.dataset import validate_and_format_dataset
 from snips_nlu.exceptions import LoadingError
 from snips_nlu.pipeline.configs import CRFSlotFillerConfig
 from snips_nlu.preprocessing import tokenize
+
 from snips_nlu.slot_filler.crf_utils import (
     OUTSIDE, TAGS, TOKENS, tags_to_slots, utterance_to_sample)
 from snips_nlu.slot_filler.feature import TOKEN_NAME
@@ -128,10 +127,9 @@ def fit(self, dataset, intent):
             # No need to train the CRF if the intent has no slots
             return self
 
-        random_state = check_random_state(self.config.random_seed)
         augmented_intent_utterances = augment_utterances(
             dataset, self.intent, language=self.language,
-            resources=self.resources, random_state=random_state,
+            resources=self.resources, random_state=self.random_state,
             **self.config.data_augmentation_config.to_dict())
 
         crf_samples = [
@@ -201,12 +199,11 @@ def compute_features(self, tokens, drop_out=False):
         cache = [{TOKEN_NAME: token} for token in tokens]
         features = []
-        random_state = check_random_state(self.config.random_seed)
         for i in range(len(tokens)):
             token_features = UnupdatableDict()
             for feature in self.features:
                 f_drop_out = feature.drop_out
-                if drop_out and random_state.rand() < f_drop_out:
+                if drop_out and self.random_state.rand() < f_drop_out:
                     continue
                 value = feature.compute(i, cache)
                 if value is not None:
diff --git a/snips_nlu/slot_filler/feature_factory.py b/snips_nlu/slot_filler/feature_factory.py
index 599c19f20..90c68a0a2 100644
--- a/snips_nlu/slot_filler/feature_factory.py
+++ b/snips_nlu/slot_filler/feature_factory.py
@@ -9,15 +9,16 @@
 from snips_nlu.common.abc_utils import classproperty
 from snips_nlu.common.registrable import Registrable
+from snips_nlu.common.utils import check_random_state
 from snips_nlu.constants import (
     CUSTOM_ENTITY_PARSER_USAGE, END, GAZETTEERS, LANGUAGE, RES_MATCH_RANGE,
     START, STEMS, WORD_CLUSTERS, CUSTOM_ENTITY_PARSER, BUILTIN_ENTITY_PARSER,
-    RESOURCES)
+    RESOURCES, RANDOM_STATE)
 from snips_nlu.dataset import (
     extract_intent_entities, get_dataset_gazetteer_entities)
 from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity
-from snips_nlu.entity_parser.custom_entity_parser import \
-    CustomEntityParserUsage
+from snips_nlu.entity_parser.custom_entity_parser import (
+    CustomEntityParserUsage)
 from snips_nlu.languages import get_default_sep
 from snips_nlu.preprocessing import Token, normalize_token, stem_token
 from snips_nlu.resources import get_gazetteer, get_word_cluster
@@ -47,6 +48,7 @@ def __init__(self, factory_config, **shared):
         self.resources = shared.get(RESOURCES)
         self.builtin_entity_parser = shared.get(BUILTIN_ENTITY_PARSER)
         self.custom_entity_parser = shared.get(CUSTOM_ENTITY_PARSER)
+        self.random_state = check_random_state(shared.get(RANDOM_STATE))
 
     @classmethod
     def from_config(cls, factory_config, **shared):
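One subtlety in `compute_features`: drop-out draws used to come from a generator re-created from the config seed on every call, so a fixed seed produced the same draws each time; they now come from the unit-wide `self.random_state`, so consecutive calls consume a single stream (which is why the expected features change in the test further down). A toy illustration of the seeded drop-out idea (the function name is illustrative, not part of the library):

```python
import numpy as np

def drop_out_mask(random_state, n_features, drop_out=0.5):
    # Mirror of the `random_state.rand() < f_drop_out` test above:
    # True means the feature is kept, False means it is dropped
    return [random_state.rand() >= drop_out for _ in range(n_features)]

# Same seed -> same kept/dropped pattern, run after run
assert drop_out_mask(np.random.RandomState(1), 8) == \
       drop_out_mask(np.random.RandomState(1), 8)
```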
diff --git a/snips_nlu/tests/integration_test.py b/snips_nlu/tests/integration_test.py
index ba93a374a..38059d115 100644
--- a/snips_nlu/tests/integration_test.py
+++ b/snips_nlu/tests/integration_test.py
@@ -1,6 +1,9 @@
 # coding=utf-8
 from __future__ import print_function, unicode_literals
 
+import json
+from builtins import range, str
+
 from future.utils import iteritems
 from snips_nlu_metrics import compute_cross_val_metrics
 
@@ -48,6 +51,33 @@ def check_metrics(self, results):
                     "Slot f1 score is too low (%.3f) for slot '%s' of intent "
                     "'%s'" % (slot_f1, slot_name, intent_name))
 
+    def test_nlu_engine_training_is_deterministic(self):
+        # We can't write a test that proves the NLU training is always
+        # deterministic, so instead we train the NLU 10 times and compare
+        # the learnt parameters. This brings no absolute guarantee, but it
+        # should alert us once in a while if determinism breaks.
+
+        # Given
+        num_runs = 10
+        random_state = 42
+
+        with PERFORMANCE_DATASET_PATH.open("r") as f:
+            dataset = json.load(f)
+
+        ref_log_reg, ref_crfs = None, None
+        for _ in range(num_runs):
+            # When
+            engine = TrainingEngine(random_state=random_state).fit(dataset)
+            log_reg = _extract_log_reg(engine)
+            crfs = _extract_crfs(engine)
+
+            if ref_log_reg is None:
+                ref_log_reg = log_reg
+                ref_crfs = crfs
+            else:
+                # Then
+                self.assertDictEqual(ref_log_reg, log_reg)
+                self.assertDictEqual(ref_crfs, crfs)
+
 
 def _slot_matching_lambda(lhs_slot, rhs_slot):
     lhs_value = lhs_slot["text"]
@@ -63,3 +93,24 @@ def _slot_matching_lambda(lhs_slot, rhs_slot):
     if rhs_tokens and rhs_tokens[0].lower() in SKIPPED_DATE_PREFIXES:
         rhs_tokens = rhs_tokens[1:]
     return lhs_tokens == rhs_tokens
+
+
+def _extract_log_reg(engine):
+    log_reg = dict()
+    intent_classifier = engine.intent_parsers[1].intent_classifier
+    log_reg["intent_list"] = intent_classifier.intent_list
+    log_reg["coef"] = intent_classifier.classifier.coef_.tolist()
+    log_reg["intercept"] = intent_classifier.classifier.intercept_.tolist()
+    log_reg["t_"] = intent_classifier.classifier.t_
+    return log_reg
+
+
+def _extract_crfs(engine):
+    crfs = dict()
+    slot_fillers = engine.intent_parsers[1].slot_fillers
+    for intent, slot_filler in iteritems(slot_fillers):
+        crfs[intent] = {
+            "state_features": slot_filler.crf_model.state_features_,
+            "transition_features": slot_filler.crf_model.transition_features_
+        }
+    return crfs
diff --git a/snips_nlu/tests/test_config.py b/snips_nlu/tests/test_config.py
index 47b3b53a5..dfed74289 100644
--- a/snips_nlu/tests/test_config.py
+++ b/snips_nlu/tests/test_config.py
@@ -112,8 +112,7 @@ def test_intent_classifier_config(self):
             "unit_name": LogRegIntentClassifier.unit_name,
             "data_augmentation_config":
                 IntentClassifierDataAugmentationConfig().to_dict(),
-            "featurizer_config": FeaturizerConfig().to_dict(),
-            "random_seed": 42
+            "featurizer_config": FeaturizerConfig().to_dict()
         }
 
         # When
@@ -151,8 +150,7 @@ def test_crf_slot_filler_config(self):
                 "algorithm": "lbfgs"
             },
             "data_augmentation_config":
-                SlotFillerDataAugmentationConfig().to_dict(),
-            "random_seed": 43
+                SlotFillerDataAugmentationConfig().to_dict()
         }
 
         # When
diff --git a/snips_nlu/tests/test_crf_slot_filler.py b/snips_nlu/tests/test_crf_slot_filler.py
index 033ddda96..a9ec87e83 100644
--- a/snips_nlu/tests/test_crf_slot_filler.py
+++ b/snips_nlu/tests/test_crf_slot_filler.py
@@ -9,7 +9,7 @@
 from sklearn_crfsuite import CRF
 
 from snips_nlu.constants import (
-    DATA, END, ENTITY, LANGUAGE_EN, SLOT_NAME, START, TEXT)
+    DATA, END, ENTITY, LANGUAGE_EN, SLOT_NAME, START, TEXT, RANDOM_STATE)
 from snips_nlu.dataset import Dataset
 from snips_nlu.entity_parser import CustomEntityParserUsage
 from snips_nlu.exceptions import NotTrained
@@ -35,9 +35,9 @@ def test_should_get_slots(self):
             - make me [number_of_cups:snips/number](five) cups of tea
             - please I want [number_of_cups](two) cups of tea""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = CRFSlotFillerConfig(random_seed=42)
         shared = self.get_shared_data(dataset)
-        slot_filler = CRFSlotFiller(config, **shared)
+        shared[RANDOM_STATE] = 42
+        slot_filler = CRFSlotFiller(**shared)
         intent = "MakeTea"
         slot_filler.fit(dataset, intent)
 
@@ -65,9 +65,10 @@ def test_should_get_builtin_slots(self):
            - Can you tell me the weather [datetime] please ?
             - what is the weather forecast [datetime] in [location](paris)""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = CRFSlotFillerConfig(random_seed=42)
         intent = "GetWeather"
-        slot_filler = CRFSlotFiller(config, **self.get_shared_data(dataset))
+        shared = self.get_shared_data(dataset)
+        shared[RANDOM_STATE] = 42
+        slot_filler = CRFSlotFiller(**shared)
         slot_filler.fit(dataset, intent)
 
         # When
@@ -101,10 +102,10 @@ def test_should_get_sub_builtin_slots(self):
             - find an activity from [start](6pm) to [end](8pm)
             - Book me a trip from [start](this friday) to [end](next tuesday)""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = CRFSlotFillerConfig(random_seed=42)
         intent = "PlanBreak"
-        slot_filler = CRFSlotFiller(config,
-                                    **self.get_shared_data(dataset))
+        shared = self.get_shared_data(dataset)
+        shared[RANDOM_STATE] = 42
+        slot_filler = CRFSlotFiller(**shared)
         slot_filler.fit(dataset, intent)
 
         # When
@@ -356,10 +357,10 @@ def test_should_get_slots_after_deserialization(self):
             - i want [number_of_cups] cups of tea please
             - can you prepare [number_of_cups] cups of tea ?""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = CRFSlotFillerConfig(random_seed=42)
         intent = "MakeTea"
         shared = self.get_shared_data(dataset)
-        slot_filler = CRFSlotFiller(config, **shared)
+        shared[RANDOM_STATE] = 42
+        slot_filler = CRFSlotFiller(**shared)
         slot_filler.fit(dataset, intent)
         slot_filler.persist(self.tmp_file_path)
 
@@ -740,7 +741,7 @@ def test_should_compute_features(self):
             },
         ]
         slot_filler_config = CRFSlotFillerConfig(
-            feature_factory_configs=features_factories, random_seed=40)
+            feature_factory_configs=features_factories)
 
         tokens = tokenize("foo hello world bar", LANGUAGE_EN)
         dataset_stream = io.StringIO("""
@@ -761,11 +762,12 @@ def test_should_compute_features(self):
         # Then
         expected_features = [
-            {"ngram_1": "foo"},
             {},
+            {"ngram_1": "hello"},
             {"ngram_1": "world"},
-            {},
+            {"ngram_1": "bar"}
         ]
+
         self.assertListEqual(expected_features, features_with_drop_out)
 
     def test_should_fit_and_parse_empty_intent(self):
diff --git a/snips_nlu/tests/test_log_reg_intent_classifier.py b/snips_nlu/tests/test_log_reg_intent_classifier.py
index 946e678d1..334a3d74d 100644
--- a/snips_nlu/tests/test_log_reg_intent_classifier.py
+++ b/snips_nlu/tests/test_log_reg_intent_classifier.py
@@ -50,8 +50,7 @@ def test_should_get_intent(self):
             - does it rain
             - will it rain tomorrow""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = LogRegIntentClassifierConfig(random_seed=42)
-        classifier = LogRegIntentClassifier(config).fit(dataset)
+        classifier = LogRegIntentClassifier(random_state=42).fit(dataset)
 
         text = "hey how are you doing ?"
         # When
@@ -108,8 +107,7 @@ def test_should_get_intent_when_filter(self):
             - brew two cups of coffee
             - can you prepare one cup of coffee""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = LogRegIntentClassifierConfig(random_seed=42)
-        classifier = LogRegIntentClassifier(config).fit(dataset)
+        classifier = LogRegIntentClassifier(random_state=42).fit(dataset)
 
         # When
         text1 = "Make me two cups of tea"
@@ -169,8 +167,7 @@ def test_should_get_intents(self):
               utterances:
                 - yili yulu yele""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        config = LogRegIntentClassifierConfig(random_seed=42)
-        classifier = LogRegIntentClassifier(config).fit(dataset)
+        classifier = LogRegIntentClassifier(random_state=42).fit(dataset)
 
         text = "yala yili yulu"
 
         # When
@@ -239,9 +236,11 @@ def test_should_be_serializable(self):
               utterances:
                 - lorem ipsum""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        intent_classifier = LogRegIntentClassifier().fit(dataset)
+        intent_classifier = LogRegIntentClassifier(
+            random_state=42).fit(dataset)
         coeffs = intent_classifier.classifier.coef_.tolist()
         intercept = intent_classifier.classifier.intercept_.tolist()
+        t_ = intent_classifier.classifier.t_
 
         # When
         intent_classifier.persist(self.tmp_file_path)
@@ -252,7 +251,7 @@ def test_should_be_serializable(self):
             "config": LogRegIntentClassifierConfig().to_dict(),
             "coeffs": coeffs,
             "intercept": intercept,
-            "t_": 701.0,
+            "t_": t_,
             "intent_list": intent_list,
             "featurizer": "featurizer"
         }
diff --git a/snips_nlu/tests/test_probabilistic_intent_parser.py b/snips_nlu/tests/test_probabilistic_intent_parser.py
index eb87fd6b8..d932eefaf 100644
--- a/snips_nlu/tests/test_probabilistic_intent_parser.py
+++ b/snips_nlu/tests/test_probabilistic_intent_parser.py
@@ -6,7 +6,8 @@
 from mock import patch
 
 from snips_nlu.constants import (
-    RES_ENTITY, RES_INTENT, RES_INTENT_NAME, RES_SLOTS, RES_VALUE)
+    RES_ENTITY, RES_INTENT, RES_INTENT_NAME, RES_SLOTS, RES_VALUE,
+    RANDOM_STATE)
 from snips_nlu.dataset import Dataset
 from snips_nlu.exceptions import IntentNotFoundError, NotTrained
 from snips_nlu.intent_classifier import (
@@ -42,11 +43,9 @@ def test_should_parse(self):
               utterances:
                 - foz for [slot3:entity3](baz)""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        classifier_config = LogRegIntentClassifierConfig(random_seed=42)
-        slot_filler_config = CRFSlotFillerConfig(random_seed=42)
-        parser_config = ProbabilisticIntentParserConfig(
-            classifier_config, slot_filler_config)
-        parser = ProbabilisticIntentParser(parser_config)
+        shared = self.get_shared_data(dataset)
+        shared[RANDOM_STATE] = 42
+        parser = ProbabilisticIntentParser(**shared)
         parser.fit(dataset)
         text = "foo bar baz"
 
@@ -81,11 +80,9 @@ def test_should_parse_with_filter(self):
               utterances:
                 - foz for [slot3:entity3](baz)""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        classifier_config = LogRegIntentClassifierConfig(random_seed=42)
-        slot_filler_config = CRFSlotFillerConfig(random_seed=42)
-        parser_config = ProbabilisticIntentParserConfig(
-            classifier_config, slot_filler_config)
-        parser = ProbabilisticIntentParser(parser_config)
+        shared = self.get_shared_data(dataset)
+        shared[RANDOM_STATE] = 42
+        parser = ProbabilisticIntentParser(**shared)
         parser.fit(dataset)
         text = "foo bar baz"
 
@@ -121,11 +118,9 @@ def test_should_parse_top_intents(self):
               utterances:
                 - foz for [entity3](baz)""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        classifier_config = LogRegIntentClassifierConfig(random_seed=42)
-        slot_filler_config = CRFSlotFillerConfig(random_seed=42)
-        parser_config = ProbabilisticIntentParserConfig(
-            classifier_config, slot_filler_config)
-        parser = ProbabilisticIntentParser(parser_config)
+        shared = self.get_shared_data(dataset)
+        shared[RANDOM_STATE] = 42
+        parser = ProbabilisticIntentParser(**shared)
         parser.fit(dataset)
         text = "foo bar baz"
 
@@ -162,9 +157,9 @@ def test_should_get_intents(self):
               utterances:
                 - yili yulu yele""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
-        classifier_config = LogRegIntentClassifierConfig(random_seed=42)
-        parser_config = ProbabilisticIntentParserConfig(classifier_config)
-        parser = ProbabilisticIntentParser(parser_config).fit(dataset)
+        shared = self.get_shared_data(dataset)
+        shared[RANDOM_STATE] = 42
+        parser = ProbabilisticIntentParser(**shared).fit(dataset)
         text = "yala yili yulu"
 
         # When
@@ -678,15 +673,10 @@ def test_fitting_should_be_reproducible_after_serialization(self):
             - brew [number_of_cups] cups of coffee""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
 
-        seed1 = 666
-        seed2 = 42
-        config = ProbabilisticIntentParserConfig(
-            intent_classifier_config=LogRegIntentClassifierConfig(
-                random_seed=seed1),
-            slot_filler_config=CRFSlotFillerConfig(random_seed=seed2)
-        )
+        seed = 666
         shared = self.get_shared_data(dataset)
-        parser = ProbabilisticIntentParser(config, **shared)
+        shared[RANDOM_STATE] = seed
+        parser = ProbabilisticIntentParser(**shared)
         parser.persist(self.tmp_file_path)
 
         # When
diff --git a/snips_nlu/tests/test_processing_unit.py b/snips_nlu/tests/test_processing_unit.py
new file mode 100644
index 000000000..9888b72e7
--- /dev/null
+++ b/snips_nlu/tests/test_processing_unit.py
@@ -0,0 +1,35 @@
+from snips_nlu.pipeline.processing_unit import ProcessingUnit
+from snips_nlu.tests.utils import FixtureTest
+
+
+class DummyProcessingUnit(ProcessingUnit):
+    unit_name = "dummy_processing_unit"
+
+    def persist(self, path):
+        pass
+
+    @classmethod
+    def from_path(cls, path, **shared):
+        return cls(config=None, **shared)
+
+    @property
+    def fitted(self):
+        return True
+
+
+class TestProcessingUnit(FixtureTest):
+
+    def test_from_path_with_seed(self):
+        # Given
+        max_int = 1e6
+        seed = 1
+
+        # When
+        unit_0 = DummyProcessingUnit.from_path(None, random_state=seed)
+        int_0 = unit_0.random_state.randint(max_int)
+
+        unit_1 = DummyProcessingUnit.from_path(None, random_state=seed)
+        int_1 = unit_1.random_state.randint(max_int)
+
+        # Then
+        self.assertEqual(int_0, int_1)
diff --git a/snips_nlu/tests/utils.py b/snips_nlu/tests/utils.py
index e2b41a94e..8f1e6f04c 100644
--- a/snips_nlu/tests/utils.py
+++ b/snips_nlu/tests/utils.py
@@ -47,7 +47,8 @@ def get_shared_data(cls, dataset, parser_usage=None):
         return {
             "resources": resources,
             "builtin_entity_parser": builtin_entity_parser,
-            "custom_entity_parser": custom_entity_parser
+            "custom_entity_parser": custom_entity_parser,
+            "random_state": 1
         }
 
 @contextmanager
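End to end, the reproducibility checked by the integration test can also be verified from user code. A minimal sketch, assuming a training dataset JSON (the `dataset.json` path is hypothetical) and installed English resources; digging into `intent_parsers[1]` mirrors the `_extract_log_reg` helper above:

```python
import io
import json

import numpy as np
from snips_nlu import SnipsNLUEngine
from snips_nlu.default_configs import CONFIG_EN

with io.open("dataset.json") as f:  # hypothetical dataset path
    dataset = json.load(f)

def train(seed):
    # The seed is passed once, at engine construction time
    return SnipsNLUEngine(config=CONFIG_EN, random_state=seed).fit(dataset)

clf_a = train(42).intent_parsers[1].intent_classifier.classifier
clf_b = train(42).intent_parsers[1].intent_classifier.classifier

# Two trainings with the same seed learn identical weights
assert np.array_equal(clf_a.coef_, clf_b.coef_)
```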