From 9aff42610999536c54317093400805637c25f248 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Mon, 15 Nov 2021 16:35:31 +0100 Subject: [PATCH] Removes hyperparameter features --- flair/hyperparameter/__init__.py | 11 - flair/hyperparameter/param_selection.py | 277 ------------------ flair/hyperparameter/parameter.py | 66 ----- .../docs/TUTORIAL_8_MODEL_OPTIMIZATION.md | 166 ----------- tests/test_hyperparameter.py | 92 ------ 5 files changed, 612 deletions(-) delete mode 100644 flair/hyperparameter/__init__.py delete mode 100644 flair/hyperparameter/param_selection.py delete mode 100644 flair/hyperparameter/parameter.py delete mode 100644 resources/docs/TUTORIAL_8_MODEL_OPTIMIZATION.md delete mode 100644 tests/test_hyperparameter.py diff --git a/flair/hyperparameter/__init__.py b/flair/hyperparameter/__init__.py deleted file mode 100644 index 89ff46aaca..0000000000 --- a/flair/hyperparameter/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from .parameter import ( - Parameter, - SEQUENCE_TAGGER_PARAMETERS, - TRAINING_PARAMETERS, - DOCUMENT_EMBEDDING_PARAMETERS, -) -from .param_selection import ( - SequenceTaggerParamSelector, - TextClassifierParamSelector, - SearchSpace, -) diff --git a/flair/hyperparameter/param_selection.py b/flair/hyperparameter/param_selection.py deleted file mode 100644 index 2f84738135..0000000000 --- a/flair/hyperparameter/param_selection.py +++ /dev/null @@ -1,277 +0,0 @@ -import logging -from abc import abstractmethod -from enum import Enum -from pathlib import Path -from typing import Tuple, Union -import numpy as np - -from hyperopt import hp, fmin, tpe - -import flair.nn -from flair.data import Corpus -from flair.embeddings import DocumentPoolEmbeddings, DocumentRNNEmbeddings -from flair.hyperparameter import Parameter -from flair.hyperparameter.parameter import ( - SEQUENCE_TAGGER_PARAMETERS, - TRAINING_PARAMETERS, - DOCUMENT_EMBEDDING_PARAMETERS, - MODEL_TRAINER_PARAMETERS, -) -from flair.models import SequenceTagger, TextClassifier -from flair.trainers import ModelTrainer -from flair.training_utils import ( - EvaluationMetric, - log_line, - init_output_file, - add_file_handler, -) - -log = logging.getLogger("flair") - - -class OptimizationValue(Enum): - DEV_LOSS = "loss" - DEV_SCORE = "score" - - -class SearchSpace(object): - def __init__(self): - self.search_space = {} - - def add(self, parameter: Parameter, func, **kwargs): - self.search_space[parameter.value] = func(parameter.value, **kwargs) - - def get_search_space(self): - return hp.choice("parameters", [self.search_space]) - - -class ParamSelector(object): - def __init__( - self, - corpus: Corpus, - base_path: Union[str, Path], - max_epochs: int, - evaluation_metric: EvaluationMetric, - training_runs: int, - optimization_value: OptimizationValue, - ): - if type(base_path) is str: - base_path = Path(base_path) - - self.corpus = corpus - self.max_epochs = max_epochs - self.base_path = base_path - self.evaluation_metric = evaluation_metric - self.run = 1 - self.training_runs = training_runs - self.optimization_value = optimization_value - - self.param_selection_file = init_output_file(base_path, "param_selection.txt") - - @abstractmethod - def _set_up_model(self, params: dict) -> flair.nn.Model: - pass - - def _objective(self, params: dict): - log_line(log) - log.info(f"Evaluation run: {self.run}") - log.info(f"Evaluating parameter combination:") - for k, v in params.items(): - if isinstance(v, Tuple): - v = ",".join([str(x) for x in v]) - log.info(f"\t{k}: {str(v)}") - log_line(log) - - for sent in self.corpus.get_all_sentences(): - sent.clear_embeddings() - - scores = [] - vars = [] - - for i in range(0, self.training_runs): - log_line(log) - log.info(f"Training run: {i + 1}") - - model = self._set_up_model(params) - - training_params = { - key: params[key] for key in params if key in TRAINING_PARAMETERS - } - model_trainer_parameters = { - key: params[key] for key in params if key in MODEL_TRAINER_PARAMETERS - } - - trainer: ModelTrainer = ModelTrainer( - model, self.corpus, **model_trainer_parameters - ) - - result = trainer.train( - self.base_path, - max_epochs=self.max_epochs, - param_selection_mode=True, - **training_params, - ) - - # take the average over the last three scores of training - if self.optimization_value == OptimizationValue.DEV_LOSS: - curr_scores = result["dev_loss_history"][-3:] - else: - curr_scores = list( - map(lambda s: 1 - s, result["dev_score_history"][-3:]) - ) - - score = sum(curr_scores) / float(len(curr_scores)) - var = np.var(curr_scores) - scores.append(score) - vars.append(var) - - # take average over the scores from the different training runs - final_score = sum(scores) / float(len(scores)) - final_var = sum(vars) / float(len(vars)) - - test_score = result["test_score"] - log_line(log) - log.info(f"Done evaluating parameter combination:") - for k, v in params.items(): - if isinstance(v, Tuple): - v = ",".join([str(x) for x in v]) - log.info(f"\t{k}: {v}") - log.info(f"{self.optimization_value.value}: {final_score}") - log.info(f"variance: {final_var}") - log.info(f"test_score: {test_score}\n") - log_line(log) - - with open(self.param_selection_file, "a") as f: - f.write(f"evaluation run {self.run}\n") - for k, v in params.items(): - if isinstance(v, Tuple): - v = ",".join([str(x) for x in v]) - f.write(f"\t{k}: {str(v)}\n") - f.write(f"{self.optimization_value.value}: {final_score}\n") - f.write(f"variance: {final_var}\n") - f.write(f"test_score: {test_score}\n") - f.write("-" * 100 + "\n") - - self.run += 1 - - return {"status": "ok", "loss": final_score, "loss_variance": final_var} - - def optimize(self, space: SearchSpace, max_evals=100): - search_space = space.search_space - best = fmin( - self._objective, search_space, algo=tpe.suggest, max_evals=max_evals - ) - - log_line(log) - log.info("Optimizing parameter configuration done.") - log.info("Best parameter configuration found:") - for k, v in best.items(): - log.info(f"\t{k}: {v}") - log_line(log) - - with open(self.param_selection_file, "a") as f: - f.write("best parameter combination\n") - for k, v in best.items(): - if isinstance(v, Tuple): - v = ",".join([str(x) for x in v]) - f.write(f"\t{k}: {str(v)}\n") - - -class SequenceTaggerParamSelector(ParamSelector): - def __init__( - self, - corpus: Corpus, - tag_type: str, - base_path: Union[str, Path], - max_epochs: int = 50, - evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE, - training_runs: int = 1, - optimization_value: OptimizationValue = OptimizationValue.DEV_LOSS, - ): - """ - :param corpus: the corpus - :param tag_type: tag type to use - :param base_path: the path to the result folder (results will be written to that folder) - :param max_epochs: number of epochs to perform on every evaluation run - :param evaluation_metric: evaluation metric used during training - :param training_runs: number of training runs per evaluation run - :param optimization_value: value to optimize - """ - super().__init__( - corpus, - base_path, - max_epochs, - evaluation_metric, - training_runs, - optimization_value, - ) - - self.tag_type = tag_type - self.tag_dictionary = self.corpus.make_label_dictionary(self.tag_type) - - def _set_up_model(self, params: dict): - sequence_tagger_params = { - key: params[key] for key in params if key in SEQUENCE_TAGGER_PARAMETERS - } - - tagger: SequenceTagger = SequenceTagger( - tag_dictionary=self.tag_dictionary, - tag_type=self.tag_type, - **sequence_tagger_params, - ) - return tagger - - -class TextClassifierParamSelector(ParamSelector): - def __init__( - self, - corpus: Corpus, - multi_label: bool, - base_path: Union[str, Path], - document_embedding_type: str, - max_epochs: int = 50, - evaluation_metric: EvaluationMetric = EvaluationMetric.MICRO_F1_SCORE, - training_runs: int = 1, - optimization_value: OptimizationValue = OptimizationValue.DEV_LOSS, - ): - """ - :param corpus: the corpus - :param multi_label: true, if the dataset is multi label, false otherwise - :param base_path: the path to the result folder (results will be written to that folder) - :param document_embedding_type: either 'lstm', 'mean', 'min', or 'max' - :param max_epochs: number of epochs to perform on every evaluation run - :param evaluation_metric: evaluation metric used during training - :param training_runs: number of training runs per evaluation run - :param optimization_value: value to optimize - """ - super().__init__( - corpus, - base_path, - max_epochs, - evaluation_metric, - training_runs, - optimization_value, - ) - - self.multi_label = multi_label - self.document_embedding_type = document_embedding_type - - self.label_dictionary = self.corpus.make_label_dictionary() - - def _set_up_model(self, params: dict): - embdding_params = { - key: params[key] for key in params if key in DOCUMENT_EMBEDDING_PARAMETERS - } - - if self.document_embedding_type == "lstm": - document_embedding = DocumentRNNEmbeddings(**embdding_params) - else: - document_embedding = DocumentPoolEmbeddings(**embdding_params) - - text_classifier: TextClassifier = TextClassifier( - label_dictionary=self.label_dictionary, - multi_label=self.multi_label, - document_embeddings=document_embedding, - ) - - return text_classifier diff --git a/flair/hyperparameter/parameter.py b/flair/hyperparameter/parameter.py deleted file mode 100644 index 0e47aa791b..0000000000 --- a/flair/hyperparameter/parameter.py +++ /dev/null @@ -1,66 +0,0 @@ -from enum import Enum - - -class Parameter(Enum): - EMBEDDINGS = "embeddings" - HIDDEN_SIZE = "hidden_size" - USE_CRF = "use_crf" - USE_RNN = "use_rnn" - RNN_LAYERS = "rnn_layers" - DROPOUT = "dropout" - WORD_DROPOUT = "word_dropout" - LOCKED_DROPOUT = "locked_dropout" - LEARNING_RATE = "learning_rate" - MINI_BATCH_SIZE = "mini_batch_size" - ANNEAL_FACTOR = "anneal_factor" - ANNEAL_WITH_RESTARTS = "anneal_with_restarts" - PATIENCE = "patience" - REPROJECT_WORDS = "reproject_words" - REPROJECT_WORD_DIMENSION = "reproject_words_dimension" - BIDIRECTIONAL = "bidirectional" - OPTIMIZER = "optimizer" - MOMENTUM = "momentum" - DAMPENING = "dampening" - WEIGHT_DECAY = "weight_decay" - NESTEROV = "nesterov" - AMSGRAD = "amsgrad" - BETAS = "betas" - EPS = "eps" - - -TRAINING_PARAMETERS = [ - Parameter.LEARNING_RATE.value, - Parameter.MINI_BATCH_SIZE.value, - Parameter.ANNEAL_FACTOR.value, - Parameter.PATIENCE.value, - Parameter.ANNEAL_WITH_RESTARTS.value, - Parameter.MOMENTUM.value, - Parameter.DAMPENING.value, - Parameter.WEIGHT_DECAY.value, - Parameter.NESTEROV.value, - Parameter.AMSGRAD.value, - Parameter.BETAS.value, - Parameter.EPS.value, -] -SEQUENCE_TAGGER_PARAMETERS = [ - Parameter.EMBEDDINGS.value, - Parameter.HIDDEN_SIZE.value, - Parameter.RNN_LAYERS.value, - Parameter.USE_CRF.value, - Parameter.USE_RNN.value, - Parameter.DROPOUT.value, - Parameter.LOCKED_DROPOUT.value, - Parameter.WORD_DROPOUT.value, -] -MODEL_TRAINER_PARAMETERS = [Parameter.OPTIMIZER.value] -DOCUMENT_EMBEDDING_PARAMETERS = [ - Parameter.EMBEDDINGS.value, - Parameter.HIDDEN_SIZE.value, - Parameter.RNN_LAYERS.value, - Parameter.REPROJECT_WORDS.value, - Parameter.REPROJECT_WORD_DIMENSION.value, - Parameter.BIDIRECTIONAL.value, - Parameter.DROPOUT.value, - Parameter.LOCKED_DROPOUT.value, - Parameter.WORD_DROPOUT.value, -] diff --git a/resources/docs/TUTORIAL_8_MODEL_OPTIMIZATION.md b/resources/docs/TUTORIAL_8_MODEL_OPTIMIZATION.md deleted file mode 100644 index 04843f956f..0000000000 --- a/resources/docs/TUTORIAL_8_MODEL_OPTIMIZATION.md +++ /dev/null @@ -1,166 +0,0 @@ -# Tutorial 8: Model Tuning - -This is part 8 of the tutorial, in which we look into how we can improve the quality of our model by selecting -the right set of model and hyper parameters. - -## Selecting Hyper Parameters - -Flair includes a wrapper for the well-known hyper parameter selection tool -[hyperopt](https://github.com/hyperopt/hyperopt). - -First you need to load your corpus. If you want to load the [AGNews corpus](https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html) -used in the following example, you first need to download it and convert it into the correct format. Please -check [tutorial 6](/resources/docs/TUTORIAL_6_CORPUS.md) for more details. -```python -from flair.datasets import TREC_6 - -# load your corpus -corpus = TREC_6() -``` - -Second you need to define the search space of parameters. -Therefore, you can use all -[parameter expressions](https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions) defined by hyperopt. - -```python -from hyperopt import hp -from flair.hyperparameter.param_selection import SearchSpace, Parameter - -# define your search space -search_space = SearchSpace() -search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[ - [ WordEmbeddings('en') ], - [ FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward') ] -]) -search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[32, 64, 128]) -search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2]) -search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5) -search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.05, 0.1, 0.15, 0.2]) -search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 32]) -``` - -Attention: You should always add your embeddings to the search space (as shown above). If you don't want to test -different kind of embeddings, simply pass just one embedding option to the search space, which will then be used in -every test run. Here is an example: -```python -search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[ - [ FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward') ] -]) -``` - -In the last step you have to create the actual parameter selector. -Depending on the task you need either to define a `TextClassifierParamSelector` or a `SequenceTaggerParamSelector` and -start the optimization. -You can define the maximum number of evaluation runs hyperopt should perform (`max_evals`). -A evaluation run performs the specified number of epochs (`max_epochs`). -To overcome the issue of noisy evaluation scores, we take the average over the last three evaluation scores (either -`dev_score` or `dev_loss`) from the evaluation run, which represents the final score and will be passed to hyperopt. -Additionally, you can specify the number of runs per evaluation run (`training_runs`). -If you specify more than one training run, one evaluation run will be executed the specified number of times. -The final evaluation score will be the average over all those runs. - -```python -from flair.hyperparameter.param_selection import TextClassifierParamSelector, OptimizationValue - -# create the parameter selector -param_selector = TextClassifierParamSelector( - corpus, - False, - 'resources/results', - 'lstm', - max_epochs=50, - training_runs=3, - optimization_value=OptimizationValue.DEV_SCORE -) - -# start the optimization -param_selector.optimize(search_space, max_evals=100) -``` - -The parameter settings and the evaluation scores will be written to `param_selection.txt` in the result directory. -While selecting the best parameter combination we do not store any model to disk. We also do not perform a test run -during training, we just evaluate the model once after training on the test set for logging purpose. - -## Finding the best Learning Rate - -The learning rate is one of the most important hyper parameter and it fundamentally depends on the topology of the loss -landscape via the architecture of your model and the training data it consumes. An optimal learning will improve your -training speed and hopefully give more performant models. A simple technique described by Leslie Smith's -[Cyclical Learning Rates for Training](https://arxiv.org/abs/1506.01186) paper is to train your model starting with a -very low learning rate and increases the learning rate exponentially at every batch update of SGD. By plotting the loss -with respect to the learning rate we will typically observe three distinct phases: for low learning rates the loss does -not improve, an optimal learning rate range where the loss drops the steepest and the final phase where the loss -explodes as the learning rate becomes too big. With such a plot, the optimal learning rate selection is as easy as -picking the highest one from the optimal phase. - -In order to run such an experiment start with your initialized `ModelTrainer` and call `find_learning_rate()` with the -`base_path` and the file name in which to records the learning rates and losses. Then plot the generated results via the -`Plotter`'s `plot_learning_rate()` function and have a look at the `learning_rate.png` image to select the optimal -learning rate: - -```python -from flair.datasets import WNUT_17 -from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings -from flair.trainers import ModelTrainer -from typing import List - -# 1. get the corpus -corpus = WNUT_17().downsample(0.1) -print(corpus) - -# 2. what tag do we want to predict? -tag_type = 'ner' - -# 3. make the tag dictionary from the corpus -tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type) -print(tag_dictionary.idx2item) - -# 4. initialize embeddings -embedding_types: List[TokenEmbeddings] = [ - WordEmbeddings('glove'), -] - -embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types) - -# 5. initialize sequence tagger -from flair.models import SequenceTagger - -tagger: SequenceTagger = SequenceTagger(hidden_size=256, - embeddings=embeddings, - tag_dictionary=tag_dictionary, - tag_type=tag_type, - use_crf=True) - -# 6. initialize trainer -trainer: ModelTrainer = ModelTrainer(tagger, corpus) - -# 7. find learning rate -learning_rate_tsv = trainer.find_learning_rate('resources/taggers/example-ner', - 'learning_rate.tsv') - -# 8. plot the learning rate finder curve -from flair.visual.training_curves import Plotter -plotter = Plotter() -plotter.plot_learning_rate(learning_rate_tsv) -``` - -## Custom Optimizers - -You can now use any of PyTorch's optimizers for training when initializing a `ModelTrainer`. To give the optimizer any -extra options just specify it as shown with the `weight_decay` example: - -```python -from torch.optim.adam import Adam - -trainer = ModelTrainer(tagger, corpus, - optimizer=Adam) - -trainer.train( - "resources/taggers/example", - weight_decay=1e-4 -) -``` - -## Next - -The last tutorial is about [training your own embeddings](/resources/docs/TUTORIAL_9_TRAINING_LM_EMBEDDINGS.md). diff --git a/tests/test_hyperparameter.py b/tests/test_hyperparameter.py deleted file mode 100644 index 48321bc338..0000000000 --- a/tests/test_hyperparameter.py +++ /dev/null @@ -1,92 +0,0 @@ -import shutil - -import pytest -from hyperopt import hp -from torch.optim import SGD - -from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings -from flair.hyperparameter import ( - SearchSpace, - Parameter, - SequenceTaggerParamSelector, - TextClassifierParamSelector, -) -import flair.datasets - -glove_embedding: WordEmbeddings = WordEmbeddings("glove") - - -@pytest.mark.skip -def test_sequence_tagger_param_selector(results_base_path, tasks_base_path): - corpus = flair.datasets.ColumnCorpus( - data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"} - ) - - # define search space - search_space = SearchSpace() - - # sequence tagger parameter - search_space.add( - Parameter.EMBEDDINGS, - hp.choice, - options=[StackedEmbeddings([glove_embedding])], - ) - search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False]) - search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75) - search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25) - search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5) - search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128]) - search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2]) - - # model trainer parameter - search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD]) - - # training parameter - search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32]) - search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1) - search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75) - search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5]) - search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1) - - # find best parameter settings - optimizer = SequenceTaggerParamSelector( - corpus, "ner", results_base_path, max_epochs=2 - ) - optimizer.optimize(search_space, max_evals=2) - - # clean up results directory - shutil.rmtree(results_base_path) - del optimizer, search_space - - -@pytest.mark.skip -def test_text_classifier_param_selector(results_base_path, tasks_base_path): - corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb") - - search_space = SearchSpace() - - # document embeddings parameter - search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[[glove_embedding]]) - search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128, 256, 512]) - search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2]) - search_space.add(Parameter.REPROJECT_WORDS, hp.choice, options=[True, False]) - search_space.add(Parameter.REPROJECT_WORD_DIMENSION, hp.choice, options=[64, 128]) - search_space.add(Parameter.BIDIRECTIONAL, hp.choice, options=[True, False]) - search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75) - search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.25, high=0.75) - search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.25, high=0.75) - - # training parameter - search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0, high=1) - search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 16, 32]) - search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0, high=0.75) - search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5]) - - param_selector = TextClassifierParamSelector( - corpus, False, results_base_path, document_embedding_type="lstm", max_epochs=2 - ) - param_selector.optimize(search_space, max_evals=2) - - # clean up results directory - shutil.rmtree(results_base_path) - del param_selector, search_space