From f3d77775fca7a4f6ae537326716cde2ead4340ff Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 12:14:42 +0300 Subject: [PATCH 01/21] stage result --- .../optimization/utils => configs}/name.py | 6 + autointent/configs/optimization_cli.py | 2 +- autointent/context/context.py | 157 +++++++++++++++--- .../{ => context/optimization_info}/logger.py | 0 .../optimization_info/optimization_info.py | 2 +- autointent/modules/retrieval/vectordb.py | 8 +- autointent/modules/scoring/knn/knn.py | 8 +- autointent/modules/scoring/linear.py | 8 +- .../nodes/optimization/node_optimizer.py | 15 +- .../pipeline/optimization/cli_endpoint.py | 21 +-- .../optimization/pipeline_optimizer.py | 44 ----- .../pipeline/optimization/utils/__init__.py | 8 +- autointent/pipeline/optimization/utils/cli.py | 27 --- .../pipeline/optimization/utils/dump.py | 20 --- 14 files changed, 177 insertions(+), 149 deletions(-) rename autointent/{pipeline/optimization/utils => configs}/name.py (95%) rename autointent/{ => context/optimization_info}/logger.py (100%) delete mode 100644 autointent/pipeline/optimization/utils/dump.py diff --git a/autointent/pipeline/optimization/utils/name.py b/autointent/configs/name.py similarity index 95% rename from autointent/pipeline/optimization/utils/name.py rename to autointent/configs/name.py index 73e1be68..8caaed8e 100644 --- a/autointent/pipeline/optimization/utils/name.py +++ b/autointent/configs/name.py @@ -1,4 +1,5 @@ import random +from datetime import datetime adjectives = [ "adorable", @@ -342,3 +343,8 @@ def generate_name() -> str: adjective = random.choice(adjectives) noun = random.choice(nouns) return f"{adjective}_{noun}" + +def get_run_name(run_name: str | None = None) -> str: + if run_name is None: + run_name = generate_name() + return f"{run_name}_{datetime.now().strftime('%m-%d-%Y_%H-%M-%S')}" # noqa: DTZ005 diff --git a/autointent/configs/optimization_cli.py b/autointent/configs/optimization_cli.py index bcc2d600..576ba64f 100644 --- a/autointent/configs/optimization_cli.py +++ b/autointent/configs/optimization_cli.py @@ -6,7 +6,7 @@ from hydra.core.config_store import ConfigStore from omegaconf import MISSING -from autointent.pipeline.optimization.utils import generate_name +from .name import generate_name @dataclass diff --git a/autointent/context/context.py b/autointent/context/context.py index d2d46014..1ea32b2b 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -1,42 +1,77 @@ +import importlib.resources as ires +import json +import logging from dataclasses import asdict from pathlib import Path from typing import Any +import numpy as np +import yaml +from omegaconf import ListConfig + +from autointent.configs.optimization_cli import ( + AugmentationConfig, + DataConfig, + EmbedderConfig, + LoggingConfig, + VectorIndexConfig, +) + from .data_handler import DataAugmenter, DataHandler, Dataset from .optimization_info import OptimizationInfo from .vector_index_client import VectorIndex, VectorIndexClient class Context: - def __init__( # noqa: PLR0913 + data_handler: DataHandler + vector_index_client: VectorIndexClient + optimization_info: OptimizationInfo + + def __init__( self, - dataset: Dataset, - test_dataset: Dataset | None = None, - device: str = "cpu", - multilabel_generation_config: str | None = None, - regex_sampling: int = 0, seed: int = 42, - db_dir: str | Path | None = None, - dump_dir: str | Path | None = None, - force_multilabel: bool = False, - embedder_batch_size: int = 32, - embedder_max_length: int | None = None, ) -> 
None:
-        augmenter = DataAugmenter(multilabel_generation_config, regex_sampling, seed)
+        self.seed = seed
+        self._logger = logging.getLogger(__name__)
+
+    def config_logs(self, config: LoggingConfig) -> None:
+        self.logging_config = config
+        self.optimization_info = OptimizationInfo()
+
+    def config_vector_index(self, config: VectorIndexConfig, embedder_config: EmbedderConfig | None = None) -> None:
+        self.vector_index_config = config
+        if embedder_config is None:
+            embedder_config = EmbedderConfig()
+        self.embedder_config = embedder_config
+
+        self.vector_index_client = VectorIndexClient(
+            self.vector_index_config.device,
+            self.vector_index_config.db_dir,
+            self.embedder_config.batch_size,
+            self.embedder_config.max_length,
+        )
+
+    def config_data(self, config: DataConfig, augmentation_config: AugmentationConfig | None = None) -> None:
+        if augmentation_config is not None:
+            self.augmentation_config = augmentation_config
+            augmenter = DataAugmenter(
+                self.augmentation_config.multilabel_generation_config,
+                self.augmentation_config.regex_sampling,
+                self.seed,
+            )
+        else:
+            augmenter = None
+
         self.data_handler = DataHandler(
-            dataset, test_dataset, random_seed=seed, force_multilabel=force_multilabel, augmenter=augmenter
+            dataset=load_data(config.train_path),
+            test_dataset=None if config.test_path is None else load_data(config.test_path),
+            random_seed=self.seed,
+            force_multilabel=config.force_multilabel,
+            augmenter=augmenter,
         )
 
-        self.optimization_info = OptimizationInfo()
-        self.vector_index_client = VectorIndexClient(device, db_dir, embedder_batch_size, embedder_max_length)
-        self.db_dir = self.vector_index_client.db_dir
-        self.embedder_max_length = embedder_max_length
-        self.embedder_batch_size = embedder_batch_size
-        self.device = device
         self.multilabel = self.data_handler.multilabel
         self.n_classes = self.data_handler.n_classes
-        self.seed = seed
-        self.dump_dir = Path.cwd() / "modules_dumps" if dump_dir is None else Path(dump_dir)
 
     def get_best_index(self) -> VectorIndex:
         model_name = self.optimization_info.get_best_embedder()
@@ -55,3 +90,83 @@ def get_inference_config(self) -> dict[str, Any]:
             },
             "nodes_configs": nodes_configs,
         }
+
+    def dump(self) -> None:
+        self._logger.debug("dumping logs...")
+        optimization_results = self.optimization_info.dump_evaluation_results()
+
+        logs_dir = self.logging_config.dirpath
+
+        # create appropriate directory
+        logs_dir.mkdir(parents=True, exist_ok=True)
+
+        # dump search space and evaluation results
+        logs_path = logs_dir / "logs.json"
+        with logs_path.open("w") as file:
+            json.dump(optimization_results, file, indent=4, ensure_ascii=False, cls=NumpyEncoder)
+        # config_path = logs_dir / "config.yaml"
+        # with config_path.open("w") as file:
+        #     yaml.dump(self.config, file)
+
+        # self._logger.info(make_report(optimization_results, nodes=nodes))
+
+        # dump train and test data splits
+        train_data, test_data = self.data_handler.dump()
+        train_path = logs_dir / "train_data.json"
+        test_path = logs_dir / "test_data.json"
+        with train_path.open("w") as file:
+            json.dump(train_data, file, indent=4, ensure_ascii=False)
+        with test_path.open("w") as file:
+            json.dump(test_data, file, indent=4, ensure_ascii=False)
+
+        self._logger.info("logs and other assets are saved to %s", logs_dir)
+
+        # dump optimization results (config for inference)
+        inference_config = self.get_inference_config()
+        inference_config_path = logs_dir / "inference_config.yaml"
+        with inference_config_path.open("w") as file:
+            yaml.dump(inference_config, file)
+
+    def 
get_db_dir(self) -> Path: + return self.vector_index_client.db_dir + + def get_device(self) -> str: + return self.vector_index_client.device + + def get_batch_size(self) -> int: + return self.vector_index_client.embedder_batch_size + + def get_max_length(self) -> int | None: + return self.vector_index_client.embedder_max_length + + def get_dump_dir(self) -> Path: + return self.logging_config.dump_dir + +class NumpyEncoder(json.JSONEncoder): + """Helper for dumping logs. Problem explained: https://stackoverflow.com/q/50916422""" + + def default(self, obj: Any) -> str | int | float | list[Any] | Any: # noqa: ANN401 + if isinstance(obj, np.integer): + return int(obj) + if isinstance(obj, np.floating): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + if isinstance(obj, ListConfig): + return list(obj) + return super().default(obj) + + +def load_data(data_path: str | Path) -> Dataset: + """load data from the given path or load sample data which is distributed along with the autointent package""" + if data_path == "default-multiclass": + with ires.files("autointent.datafiles").joinpath("banking77.json").open() as file: + res = json.load(file) + elif data_path == "default-multilabel": + with ires.files("autointent.datafiles").joinpath("dstc3-20shot.json").open() as file: + res = json.load(file) + else: + with Path(data_path).open() as file: + res = json.load(file) + + return Dataset.model_validate(res) diff --git a/autointent/logger.py b/autointent/context/optimization_info/logger.py similarity index 100% rename from autointent/logger.py rename to autointent/context/optimization_info/logger.py diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index 36795151..3cfa2a72 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -4,9 +4,9 @@ from numpy.typing import NDArray from autointent.configs.node import InferenceNodeConfig -from autointent.logger import get_logger from .data_models import Artifact, Artifacts, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds +from .logger import get_logger class OptimizationInfo: diff --git a/autointent/modules/retrieval/vectordb.py b/autointent/modules/retrieval/vectordb.py index cae9beb6..b1e55380 100644 --- a/autointent/modules/retrieval/vectordb.py +++ b/autointent/modules/retrieval/vectordb.py @@ -51,10 +51,10 @@ def from_context( return cls( k=k, model_name=model_name, - db_dir=str(context.db_dir), - device=context.device, - batch_size=context.embedder_batch_size, - max_length=context.embedder_max_length, + db_dir=str(context.get_db_dir()), + device=context.get_device(), + batch_size=context.get_batch_size(), + max_length=context.get_max_length(), ) def fit(self, utterances: list[str], labels: list[LabelType]) -> None: diff --git a/autointent/modules/scoring/knn/knn.py b/autointent/modules/scoring/knn/knn.py index 0c53cadf..420bf604 100644 --- a/autointent/modules/scoring/knn/knn.py +++ b/autointent/modules/scoring/knn/knn.py @@ -76,10 +76,10 @@ def from_context( model_name=model_name, k=k, weights=weights, - db_dir=str(context.db_dir), - device=context.device, - batch_size=context.embedder_batch_size, - max_length=context.embedder_max_length, + db_dir=str(context.get_db_dir()), + device=context.get_device(), + batch_size=context.get_batch_size(), + max_length=context.get_max_length(), ) instance.prebuilt_index = prebuilt_index return instance diff --git 
a/autointent/modules/scoring/linear.py b/autointent/modules/scoring/linear.py index adab5b43..685eb74b 100644 --- a/autointent/modules/scoring/linear.py +++ b/autointent/modules/scoring/linear.py @@ -77,13 +77,13 @@ def from_context( instance = cls( model_name=model_name, - device=context.device, + device=context.get_device(), seed=context.seed, - batch_size=context.embedder_batch_size, - max_length=context.embedder_max_length, + batch_size=context.get_batch_size(), + max_length=context.get_max_length(), ) instance.precomputed_embeddings = precomputed_embeddings - instance.db_dir = str(context.db_dir) + instance.db_dir = str(context.get_db_dir()) return instance def fit( diff --git a/autointent/nodes/optimization/node_optimizer.py b/autointent/nodes/optimization/node_optimizer.py index 72139644..77c94652 100644 --- a/autointent/nodes/optimization/node_optimizer.py +++ b/autointent/nodes/optimization/node_optimizer.py @@ -46,7 +46,7 @@ def fit(self, context: Context) -> None: metric_value = module.score(context, self.node_info.metrics_available[self.metric_name]) assets = module.get_assets() - module_dump_dir = self.get_module_dump_dir(context.dump_dir, module_type, j_combination) + module_dump_dir = self.get_module_dump_dir(context.get_dump_dir(), module_type, j_combination) module.dump(module_dump_dir) context.optimization_info.log_module_optimization( @@ -88,3 +88,16 @@ def module_fit(self, module: Module, context: Context) -> None: self._logger.error(msg) raise ValueError(msg) module.fit(*args) # type: ignore[arg-type] + + # @overload + # def fit( + # self, + # utterances: list[str], + # labels: list[LabelType], + # tags: list[Tag] | None = None, + # label_descriptions: list[str] | None = None, + # ) -> None: + # # create context object from given data + # ... 
+ + # # call fit(context) diff --git a/autointent/pipeline/optimization/cli_endpoint.py b/autointent/pipeline/optimization/cli_endpoint.py index eb390804..806ebb45 100644 --- a/autointent/pipeline/optimization/cli_endpoint.py +++ b/autointent/pipeline/optimization/cli_endpoint.py @@ -6,7 +6,7 @@ from autointent.configs.optimization_cli import OptimizationConfig from .pipeline_optimizer import PipelineOptimizer -from .utils import load_config, load_data +from .utils import load_config @hydra.main(config_name="optimization_config", config_path=".", version_base=None) @@ -18,19 +18,10 @@ def main(cfg: OptimizationConfig) -> None: logger.debug("Vector index path: %s", cfg.vector_index.db_dir) # create shared objects for a whole pipeline - context = Context( - load_data(cfg.data.train_path), - None if cfg.data.test_path is None else load_data(cfg.data.test_path), - cfg.vector_index.device, - cfg.augmentation.multilabel_generation_config, - cfg.augmentation.regex_sampling, - cfg.seed, - cfg.vector_index.db_dir, - cfg.logs.dump_dir, - cfg.data.force_multilabel, - cfg.embedder.batch_size, - cfg.embedder.max_length, - ) + context = Context(cfg.seed) + context.config_logs(cfg.logs) + context.config_vector_index(cfg.vector_index, cfg.embedder) + context.config_data(cfg.data, cfg.augmentation) # run optimization search_space_config = load_config(cfg.task.search_space_path, context.multilabel, logger) @@ -38,4 +29,4 @@ def main(cfg: OptimizationConfig) -> None: pipeline.optimize(context) # save results - pipeline.dump(cfg.logs.dirpath) + context.dump() diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index d0532164..5125b8c5 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -1,18 +1,14 @@ import json import logging -from pathlib import Path from typing import Any import numpy as np -import yaml from hydra.utils import instantiate from autointent import Context from autointent.configs.pipeline_optimizer import PipelineOptimizerConfig from autointent.nodes import NodeOptimizer -from .utils import NumpyEncoder - class PipelineOptimizer: def __init__(self, nodes: list[NodeOptimizer]) -> None: @@ -29,46 +25,6 @@ def optimize(self, context: Context) -> None: for node_optimizer in self.nodes: node_optimizer.fit(context) - def dump(self, logs_dir: str | Path | None) -> None: - self._logger.debug("dumping logs...") - optimization_results = self.context.optimization_info.dump_evaluation_results() - - if logs_dir is None: - logs_dir = Path.cwd() / "pipeline_optimize" - if isinstance(logs_dir, str): - logs_dir = Path(logs_dir) - - # create appropriate directory - logs_dir.mkdir(parents=True, exist_ok=True) - - # dump search space and evaluation results - logs_path = logs_dir / "logs.json" - with logs_path.open("w") as file: - json.dump(optimization_results, file, indent=4, ensure_ascii=False, cls=NumpyEncoder) - # config_path = logs_dir / "config.yaml" - # with config_path.open("w") as file: - # yaml.dump(self.config, file) - - nodes = [node_config.node_info.node_type for node_config in self.nodes] - self._logger.info(make_report(optimization_results, nodes=nodes)) - - # dump train and test data splits - train_data, test_data = self.context.data_handler.dump() - train_path = logs_dir / "train_data.json" - test_path = logs_dir / "test_data.json" - with train_path.open("w") as file: - json.dump(train_data, file, indent=4, ensure_ascii=False) - with 
test_path.open("w") as file: - json.dump(test_data, file, indent=4, ensure_ascii=False) - - self._logger.info("logs and other assets are saved to %s", logs_dir) - - # dump optimization results (config for inference) - inference_config = self.context.get_inference_config() - inference_config_path = logs_dir / "inference_config.yaml" - with inference_config_path.open("w") as file: - yaml.dump(inference_config, file) - def make_report(logs: dict[str, Any], nodes: list[str]) -> str: ids = [np.argmax(logs["metrics"][node]) for node in nodes] diff --git a/autointent/pipeline/optimization/utils/__init__.py b/autointent/pipeline/optimization/utils/__init__.py index 4bd0f741..e0aa17b6 100644 --- a/autointent/pipeline/optimization/utils/__init__.py +++ b/autointent/pipeline/optimization/utils/__init__.py @@ -1,12 +1,6 @@ -from .cli import get_logs_dir, get_run_name, load_config, load_data -from .dump import NumpyEncoder -from .name import generate_name +from .cli import get_logs_dir, load_config __all__ = [ - "NumpyEncoder", - "generate_name", - "get_run_name", "load_config", - "load_data", "get_logs_dir", ] diff --git a/autointent/pipeline/optimization/utils/cli.py b/autointent/pipeline/optimization/utils/cli.py index e49d0624..2cd4e911 100644 --- a/autointent/pipeline/optimization/utils/cli.py +++ b/autointent/pipeline/optimization/utils/cli.py @@ -1,37 +1,10 @@ import importlib.resources as ires -import json -from datetime import datetime from logging import Logger from pathlib import Path from typing import Any import yaml -from autointent.context.data_handler import Dataset - -from .name import generate_name - - -def load_data(data_path: str | Path) -> Dataset: - """load data from the given path or load sample data which is distributed along with the autointent package""" - if data_path == "default-multiclass": - with ires.files("autointent.datafiles").joinpath("banking77.json").open() as file: - res = json.load(file) - elif data_path == "default-multilabel": - with ires.files("autointent.datafiles").joinpath("dstc3-20shot.json").open() as file: - res = json.load(file) - else: - with Path(data_path).open() as file: - res = json.load(file) - - return Dataset.model_validate(res) - - -def get_run_name(run_name: str | None = None) -> str: - if run_name is None: - run_name = generate_name() - return f"{run_name}_{datetime.now().strftime('%m-%d-%Y_%H-%M-%S')}" # noqa: DTZ005 - def get_logs_dir(run_name: str, logs_dir: Path | None = None) -> Path: if logs_dir is None: diff --git a/autointent/pipeline/optimization/utils/dump.py b/autointent/pipeline/optimization/utils/dump.py deleted file mode 100644 index 95167a5e..00000000 --- a/autointent/pipeline/optimization/utils/dump.py +++ /dev/null @@ -1,20 +0,0 @@ -import json -from typing import Any - -import numpy as np -from omegaconf import ListConfig - - -class NumpyEncoder(json.JSONEncoder): - """Helper for dumping logs. 
Problem explained: https://stackoverflow.com/q/50916422""" - - def default(self, obj: Any) -> str | int | float | list[Any] | Any: # noqa: ANN401 - if isinstance(obj, np.integer): - return int(obj) - if isinstance(obj, np.floating): - return float(obj) - if isinstance(obj, np.ndarray): - return obj.tolist() - if isinstance(obj, ListConfig): - return list(obj) - return super().default(obj) From b192dc8505216a010038ebef5ea31c97429c6ed6 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 12:27:04 +0300 Subject: [PATCH 02/21] decompose `Context.__init__()` and implement `get_` methods --- autointent/configs/name.py | 1 + autointent/context/context.py | 16 ++++++++++------ autointent/modules/prediction/base.py | 4 +++- autointent/modules/prediction/threshold.py | 4 ++-- autointent/modules/scoring/dnnc/dnnc.py | 8 ++++---- autointent/pipeline/optimization/cli_endpoint.py | 2 +- 6 files changed, 21 insertions(+), 14 deletions(-) diff --git a/autointent/configs/name.py b/autointent/configs/name.py index 8caaed8e..5d639217 100644 --- a/autointent/configs/name.py +++ b/autointent/configs/name.py @@ -344,6 +344,7 @@ def generate_name() -> str: noun = random.choice(nouns) return f"{adjective}_{noun}" + def get_run_name(run_name: str | None = None) -> str: if run_name is None: run_name = generate_name() diff --git a/autointent/context/context.py b/autointent/context/context.py index 1ea32b2b..ec2ebac1 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -70,9 +70,6 @@ def config_data(self, config: DataConfig, augmentation_config: AugmentationConfi augmenter=augmenter, ) - self.multilabel = self.data_handler.multilabel - self.n_classes = self.data_handler.n_classes - def get_best_index(self) -> VectorIndex: model_name = self.optimization_info.get_best_embedder() return self.vector_index_client.get_index(model_name) @@ -83,9 +80,9 @@ def get_inference_config(self) -> dict[str, Any]: cfg.pop("_target_") return { "metadata": { - "device": self.device, - "multilabel": self.multilabel, - "n_classes": self.n_classes, + "device": self.get_device(), + "multilabel": self.is_multilabel(), + "n_classes": self.get_n_classes(), "seed": self.seed, }, "nodes_configs": nodes_configs, @@ -142,6 +139,13 @@ def get_max_length(self) -> int | None: def get_dump_dir(self) -> Path: return self.logging_config.dump_dir + def is_multilabel(self) -> bool: + return self.data_handler.multilabel + + def get_n_classes(self) -> int: + return self.data_handler.n_classes + + class NumpyEncoder(json.JSONEncoder): """Helper for dumping logs. 
Problem explained: https://stackoverflow.com/q/50916422""" diff --git a/autointent/modules/prediction/base.py b/autointent/modules/prediction/base.py index 06cb2c14..17b8de50 100644 --- a/autointent/modules/prediction/base.py +++ b/autointent/modules/prediction/base.py @@ -51,7 +51,9 @@ def get_prediction_evaluation_data( oos_scores = context.optimization_info.get_best_oos_scores() return_scores = scores if oos_scores is not None: - oos_labels = [[0] * context.n_classes] * len(oos_scores) if context.multilabel else [-1] * len(oos_scores) # type: ignore[list-item] + oos_labels = ( + [[0] * context.get_n_classes()] * len(oos_scores) if context.is_multilabel() else [-1] * len(oos_scores) + ) # type: ignore[list-item] labels = np.concatenate([labels, np.array(oos_labels)]) return_scores = np.concatenate([scores, oos_scores]) diff --git a/autointent/modules/prediction/threshold.py b/autointent/modules/prediction/threshold.py index 5e43533f..c25c3270 100644 --- a/autointent/modules/prediction/threshold.py +++ b/autointent/modules/prediction/threshold.py @@ -44,8 +44,8 @@ def __init__( def from_context(cls, context: Context, thresh: float | npt.NDArray[Any] = 0.5) -> Self: return cls( thresh=thresh, - multilabel=context.multilabel, - n_classes=context.n_classes, + multilabel=context.is_multilabel(), + n_classes=context.get_n_classes(), ) def fit( diff --git a/autointent/modules/scoring/dnnc/dnnc.py b/autointent/modules/scoring/dnnc/dnnc.py index 901056d3..2b20d8ce 100644 --- a/autointent/modules/scoring/dnnc/dnnc.py +++ b/autointent/modules/scoring/dnnc/dnnc.py @@ -82,10 +82,10 @@ def from_context( search_model_name=search_model_name, k=k, train_head=train_head, - device=context.device, - db_dir=str(context.db_dir), - batch_size=context.embedder_batch_size, - max_length=context.embedder_max_length, + device=context.get_device(), + db_dir=str(context.get_db_dir()), + batch_size=context.get_batch_size(), + max_length=context.get_max_length(), ) instance.prebuilt_index = prebuilt_index return instance diff --git a/autointent/pipeline/optimization/cli_endpoint.py b/autointent/pipeline/optimization/cli_endpoint.py index 806ebb45..a8bfa975 100644 --- a/autointent/pipeline/optimization/cli_endpoint.py +++ b/autointent/pipeline/optimization/cli_endpoint.py @@ -24,7 +24,7 @@ def main(cfg: OptimizationConfig) -> None: context.config_data(cfg.data, cfg.augmentation) # run optimization - search_space_config = load_config(cfg.task.search_space_path, context.multilabel, logger) + search_space_config = load_config(cfg.task.search_space_path, context.is_multilabel(), logger) pipeline = PipelineOptimizer.from_dict_config(search_space_config) pipeline.optimize(context) From 0f6f568d1f6632b20f379cb8aa947d38c286b7e9 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 12:59:59 +0300 Subject: [PATCH 03/21] fix tests --- autointent/context/context.py | 36 +---------------- autointent/context/utils.py | 39 +++++++++++++++++++ .../scoring/description/description.py | 4 +- autointent/modules/scoring/mlknn/mlknn.py | 8 ++-- tests/conftest.py | 16 ++++---- .../datahandler/test_multilabel_generation.py | 13 ++++--- tests/context/test_vector_index.py | 13 ++++--- tests/modules/prediction/test_treshold.py | 19 ++++----- tests/modules/scoring/test_description.py | 7 ++-- tests/modules/scoring/test_dnnc.py | 5 ++- tests/modules/scoring/test_knn.py | 7 ++-- tests/modules/scoring/test_linear.py | 5 ++- tests/modules/scoring/test_mlknn.py | 7 ++-- tests/modules/test_regex.py | 7 ++-- tests/nodes/conftest.py | 12 ++++-- 
tests/pipeline/test_optimization.py | 25 ++---------- 16 files changed, 113 insertions(+), 110 deletions(-) create mode 100644 autointent/context/utils.py diff --git a/autointent/context/context.py b/autointent/context/context.py index ec2ebac1..23186518 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -1,13 +1,10 @@ -import importlib.resources as ires import json import logging from dataclasses import asdict from pathlib import Path from typing import Any -import numpy as np import yaml -from omegaconf import ListConfig from autointent.configs.optimization_cli import ( AugmentationConfig, @@ -17,8 +14,9 @@ VectorIndexConfig, ) -from .data_handler import DataAugmenter, DataHandler, Dataset +from .data_handler import DataAugmenter, DataHandler from .optimization_info import OptimizationInfo +from .utils import NumpyEncoder, load_data from .vector_index_client import VectorIndex, VectorIndexClient @@ -144,33 +142,3 @@ def is_multilabel(self) -> bool: def get_n_classes(self) -> int: return self.data_handler.n_classes - - -class NumpyEncoder(json.JSONEncoder): - """Helper for dumping logs. Problem explained: https://stackoverflow.com/q/50916422""" - - def default(self, obj: Any) -> str | int | float | list[Any] | Any: # noqa: ANN401 - if isinstance(obj, np.integer): - return int(obj) - if isinstance(obj, np.floating): - return float(obj) - if isinstance(obj, np.ndarray): - return obj.tolist() - if isinstance(obj, ListConfig): - return list(obj) - return super().default(obj) - - -def load_data(data_path: str | Path) -> Dataset: - """load data from the given path or load sample data which is distributed along with the autointent package""" - if data_path == "default-multiclass": - with ires.files("autointent.datafiles").joinpath("banking77.json").open() as file: - res = json.load(file) - elif data_path == "default-multilabel": - with ires.files("autointent.datafiles").joinpath("dstc3-20shot.json").open() as file: - res = json.load(file) - else: - with Path(data_path).open() as file: - res = json.load(file) - - return Dataset.model_validate(res) diff --git a/autointent/context/utils.py b/autointent/context/utils.py new file mode 100644 index 00000000..efad3884 --- /dev/null +++ b/autointent/context/utils.py @@ -0,0 +1,39 @@ +import importlib.resources as ires +import json +from pathlib import Path +from typing import Any + +import numpy as np +from omegaconf import ListConfig + +from .data_handler import Dataset + + +class NumpyEncoder(json.JSONEncoder): + """Helper for dumping logs. 
Problem explained: https://stackoverflow.com/q/50916422""" + + def default(self, obj: Any) -> str | int | float | list[Any] | Any: # noqa: ANN401 + if isinstance(obj, np.integer): + return int(obj) + if isinstance(obj, np.floating): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + if isinstance(obj, ListConfig): + return list(obj) + return super().default(obj) + + +def load_data(data_path: str | Path) -> Dataset: + """load data from the given path or load sample data which is distributed along with the autointent package""" + if data_path == "default-multiclass": + with ires.files("autointent.datafiles").joinpath("banking77.json").open() as file: + res = json.load(file) + elif data_path == "default-multilabel": + with ires.files("autointent.datafiles").joinpath("dstc3-20shot.json").open() as file: + res = json.load(file) + else: + with Path(data_path).open() as file: + res = json.load(file) + + return Dataset.model_validate(res) diff --git a/autointent/modules/scoring/description/description.py b/autointent/modules/scoring/description/description.py index 97faf7ac..51d1a990 100644 --- a/autointent/modules/scoring/description/description.py +++ b/autointent/modules/scoring/description/description.py @@ -63,8 +63,8 @@ def from_context( instance = cls( temperature=temperature, - device=context.device, - db_dir=context.db_dir, + device=context.get_device(), + db_dir=context.get_db_dir(), model_name=model_name, ) instance.precomputed_embeddings = precomputed_embeddings diff --git a/autointent/modules/scoring/mlknn/mlknn.py b/autointent/modules/scoring/mlknn/mlknn.py index e51a223b..fc06eaee 100644 --- a/autointent/modules/scoring/mlknn/mlknn.py +++ b/autointent/modules/scoring/mlknn/mlknn.py @@ -80,10 +80,10 @@ def from_context( model_name=model_name, s=s, ignore_first_neighbours=ignore_first_neighbours, - db_dir=str(context.db_dir), - device=context.device, - batch_size=context.embedder_batch_size, - max_length=context.embedder_max_length, + db_dir=str(context.get_db_dir()), + device=context.get_device(), + batch_size=context.get_batch_size(), + max_length=context.get_max_length(), ) instance.prebuilt_index = prebuilt_index return instance diff --git a/tests/conftest.py b/tests/conftest.py index 83832e6e..08f95300 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,21 +3,21 @@ import pytest -from autointent.pipeline.optimization.utils import load_data +from autointent.context.utils import load_data -@pytest.fixture def setup_environment() -> tuple[str, str]: logs_dir = ires.files("tests").joinpath("logs") + db_dir = logs_dir / "db" / str(uuid4()) + dump_dir = logs_dir / "modules_dump" + return db_dir, dump_dir, logs_dir - def get_db_dir(): - return logs_dir / "db" / str(uuid4()) - dump_dir = logs_dir / "modules_dump" - return get_db_dir, dump_dir, logs_dir +@pytest.fixture +def dataset_path(): + return ires.files("tests.assets.data").joinpath("clinc_subset.json") @pytest.fixture -def dataset(): - dataset_path = ires.files("tests.assets.data").joinpath("clinc_subset.json") +def dataset(dataset_path): return load_data(dataset_path) diff --git a/tests/context/datahandler/test_multilabel_generation.py b/tests/context/datahandler/test_multilabel_generation.py index 6840f29d..9a0bb76b 100644 --- a/tests/context/datahandler/test_multilabel_generation.py +++ b/tests/context/datahandler/test_multilabel_generation.py @@ -4,6 +4,7 @@ from autointent.context.data_handler import DataHandler from autointent.context.vector_index_client import VectorIndexClient +from 
tests.conftest import setup_environment @pytest.fixture @@ -15,12 +16,12 @@ def mock_data_handler(): @pytest.fixture -def vector_index(setup_environment): - db_dir, dump_dir, logs_dir = setup_environment - return VectorIndexClient(device="cpu", multilabel=False, n_classes=2, db_dir=db_dir()) +def vector_index(): + db_dir, dump_dir, logs_dir = setup_environment() + return VectorIndexClient(device="cpu", multilabel=False, n_classes=2, db_dir=db_dir) -def test_vector_index_initialization(setup_environment): - db_dir, dump_dir, logs_dir = setup_environment - index = VectorIndexClient(device="cpu", db_dir=db_dir()) +def test_vector_index_initialization(): + db_dir, dump_dir, logs_dir = setup_environment() + index = VectorIndexClient(device="cpu", db_dir=db_dir) assert index.device == "cpu" diff --git a/tests/context/test_vector_index.py b/tests/context/test_vector_index.py index 70ca27a3..0505d9a1 100644 --- a/tests/context/test_vector_index.py +++ b/tests/context/test_vector_index.py @@ -1,6 +1,7 @@ import pytest from autointent.context.vector_index_client import VectorIndexClient +from tests.conftest import setup_environment @pytest.fixture @@ -12,15 +13,15 @@ class MockDataHandler: return MockDataHandler() -def test_vector_index_initialization(setup_environment): - db_dir, dump_dir, logs_dir = setup_environment - vector_index_client = VectorIndexClient("cpu", db_dir()) +def test_vector_index_initialization(): + db_dir, dump_dir, logs_dir = setup_environment() + vector_index_client = VectorIndexClient("cpu", db_dir) assert vector_index_client.device == "cpu" -def test_create_collection(data_handler, setup_environment): - db_dir, dump_dir, logs_dir = setup_environment - vector_index_client = VectorIndexClient("cpu", db_dir()) +def test_create_collection(data_handler): + db_dir, dump_dir, logs_dir = setup_environment() + vector_index_client = VectorIndexClient("cpu", db_dir) vector_index = vector_index_client.create_index( "bert-base-uncased", data_handler.utterances_train, data_handler.labels_train ) diff --git a/tests/modules/prediction/test_treshold.py b/tests/modules/prediction/test_treshold.py index f4052553..3c602aae 100644 --- a/tests/modules/prediction/test_treshold.py +++ b/tests/modules/prediction/test_treshold.py @@ -2,6 +2,7 @@ from autointent.context.data_handler import DataHandler from autointent.modules import KNNScorer, ThresholdPredictor +from tests.conftest import setup_environment def get_fit_data(db_dir, dataset): @@ -21,31 +22,31 @@ def get_fit_data(db_dir, dataset): return scores, labels -def test_predict_returns_correct_indices(setup_environment, dataset): - get_db_dir, dump_dir, logs_dir = setup_environment +def test_predict_returns_correct_indices(dataset): + db_dir, dump_dir, logs_dir = setup_environment() predictor = ThresholdPredictor(0.5) - predictor.fit(*get_fit_data(get_db_dir(), dataset)) + predictor.fit(*get_fit_data(db_dir, dataset)) scores = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]]) predictions = predictor.predict(scores) np.testing.assert_array_equal(predictions, np.array([1, 0, 1])) -def test_predict_returns_list(setup_environment, dataset): - get_db_dir, dump_dir, logs_dir = setup_environment +def test_predict_returns_list(dataset): + db_dir, dump_dir, logs_dir = setup_environment() predictor = ThresholdPredictor(np.array([0.5, 0.5, 0.5]), n_classes=3) - predictor.fit(*get_fit_data(get_db_dir(), dataset)) + predictor.fit(*get_fit_data(db_dir, dataset)) scores = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]]) predictions = predictor.predict(scores) 
np.testing.assert_array_equal(predictions, np.array([1, 0, 1])) -def test_predict_handles_single_class(setup_environment, dataset): - get_db_dir, dump_dir, logs_dir = setup_environment +def test_predict_handles_single_class(dataset): + db_dir, dump_dir, logs_dir = setup_environment() predictor = ThresholdPredictor(0.5) - predictor.fit(*get_fit_data(get_db_dir(), dataset)) + predictor.fit(*get_fit_data(db_dir, dataset)) scores = np.array([[0.5], [0.5], [0.5]]) predictions = predictor.predict(scores) np.testing.assert_array_equal(predictions, np.array([0, 0, 0])) diff --git a/tests/modules/scoring/test_description.py b/tests/modules/scoring/test_description.py index dbc4b099..645019b6 100644 --- a/tests/modules/scoring/test_description.py +++ b/tests/modules/scoring/test_description.py @@ -3,6 +3,7 @@ from autointent.context.data_handler import DataHandler from autointent.modules import DescriptionScorer +from tests.conftest import setup_environment @pytest.mark.parametrize( @@ -12,11 +13,11 @@ ([[0.2, 0.3, 0.2], [0.2, 0.3, 0.2]], False), ], ) -def test_description_scorer(setup_environment, dataset, expected_prediction, multilabel): - db_dir, dump_dir, logs_dir = setup_environment +def test_description_scorer(dataset, expected_prediction, multilabel): + db_dir, dump_dir, logs_dir = setup_environment() data_handler = DataHandler(dataset, force_multilabel=multilabel) - scorer = DescriptionScorer(model_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir(), temperature=0.3) + scorer = DescriptionScorer(model_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir, temperature=0.3) scorer.fit(data_handler.utterances_train, data_handler.labels_train, data_handler.label_description) assert scorer.description_vectors.shape[0] == len(data_handler.label_description) diff --git a/tests/modules/scoring/test_dnnc.py b/tests/modules/scoring/test_dnnc.py index d01d3bba..efadda20 100644 --- a/tests/modules/scoring/test_dnnc.py +++ b/tests/modules/scoring/test_dnnc.py @@ -3,12 +3,13 @@ from autointent.context.data_handler import DataHandler from autointent.modules import DNNCScorer +from tests.conftest import setup_environment @pytest.mark.xfail(reason="This test is failing on windows, because have different score") @pytest.mark.parametrize(("train_head", "pred_score"), [(True, 1), (False, 0.5)]) -def test_base_dnnc(setup_environment, dataset, train_head, pred_score): - db_dir, dump_dir, logs_dir = setup_environment +def test_base_dnnc(dataset, train_head, pred_score): + db_dir, dump_dir, logs_dir = setup_environment() data_handler = DataHandler(dataset) diff --git a/tests/modules/scoring/test_knn.py b/tests/modules/scoring/test_knn.py index 02797a3d..f2be73ab 100644 --- a/tests/modules/scoring/test_knn.py +++ b/tests/modules/scoring/test_knn.py @@ -2,14 +2,15 @@ from autointent.context.data_handler import DataHandler from autointent.modules import KNNScorer +from tests.conftest import setup_environment -def test_base_knn(setup_environment, dataset): - db_dir, dump_dir, logs_dir = setup_environment +def test_base_knn(dataset): + db_dir, dump_dir, logs_dir = setup_environment() data_handler = DataHandler(dataset) - scorer = KNNScorer(k=3, weights="distance", model_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir()) + scorer = KNNScorer(k=3, weights="distance", model_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir) scorer.fit(data_handler.utterances_train, data_handler.labels_train) predictions = scorer.predict( diff --git a/tests/modules/scoring/test_linear.py b/tests/modules/scoring/test_linear.py index 
8a7fb9be..9ac62182 100644 --- a/tests/modules/scoring/test_linear.py +++ b/tests/modules/scoring/test_linear.py @@ -2,10 +2,11 @@ from autointent.context.data_handler import DataHandler from autointent.modules import LinearScorer +from tests.conftest import setup_environment -def test_base_linear(setup_environment, dataset): - get_db_dir, dump_dir, logs_dir = setup_environment +def test_base_linear(dataset): + get_db_dir, dump_dir, logs_dir = setup_environment() data_handler = DataHandler(dataset) diff --git a/tests/modules/scoring/test_mlknn.py b/tests/modules/scoring/test_mlknn.py index e74de00a..c76318f1 100644 --- a/tests/modules/scoring/test_mlknn.py +++ b/tests/modules/scoring/test_mlknn.py @@ -2,10 +2,11 @@ from autointent.context.data_handler import DataHandler, Dataset from autointent.modules.scoring.mlknn.mlknn import MLKnnScorer +from tests.conftest import setup_environment -def test_base_mlknn(setup_environment, dataset): - db_dir, dump_dir, logs_dir = setup_environment +def test_base_mlknn(dataset): + db_dir, dump_dir, logs_dir = setup_environment() test_dataset = Dataset.model_validate( { @@ -23,7 +24,7 @@ def test_base_mlknn(setup_environment, dataset): ) data_handler = DataHandler(dataset, test_dataset, force_multilabel=True) - scorer = MLKnnScorer(db_dir=db_dir(), k=3, model_name="sergeyzh/rubert-tiny-turbo") + scorer = MLKnnScorer(db_dir=db_dir, k=3, model_name="sergeyzh/rubert-tiny-turbo") scorer.fit(data_handler.utterances_train, data_handler.labels_train) predictions = scorer.predict_labels( diff --git a/tests/modules/test_regex.py b/tests/modules/test_regex.py index 8138651c..e73501cb 100644 --- a/tests/modules/test_regex.py +++ b/tests/modules/test_regex.py @@ -4,11 +4,12 @@ from autointent.context.data_handler import Dataset from autointent.metrics import retrieval_hit_rate, scoring_roc_auc from autointent.modules import RegExp, VectorDBModule +from tests.conftest import setup_environment @pytest.mark.xfail(reason="Issues with intent_id") -def test_base_regex(setup_environment): - db_dir, dump_dir, logs_dir = setup_environment +def test_base_regex(): + db_dir, dump_dir, logs_dir = setup_environment() data = { "utterances": [ @@ -72,7 +73,7 @@ def test_base_regex(setup_environment): context = Context( dataset=Dataset.model_validate(data), dump_dir=dump_dir, - db_dir=db_dir(), + db_dir=db_dir, ) retrieval_params = {"k": 3, "model_name": "sergeyzh/rubert-tiny-turbo"} diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index cdcbdd4b..62b75c2e 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -1,7 +1,9 @@ import pytest from autointent import Context +from autointent.configs.optimization_cli import DataConfig, LoggingConfig, VectorIndexConfig from autointent.nodes.optimization import NodeOptimizer +from tests.conftest import setup_environment @pytest.fixture @@ -68,10 +70,14 @@ def scoring_optimizer_multilabel(context, retrieval_optimizer_multilabel): @pytest.fixture -def context(setup_environment, dataset): - db_dir, dump_dir, logs_dir = setup_environment +def context(dataset_path): + db_dir, dump_dir, logs_dir = setup_environment() def _context(multilabel: bool): - return Context(dataset=dataset, db_dir=db_dir(), dump_dir=dump_dir, force_multilabel=multilabel) + res = Context() + res.config_data(DataConfig(dataset_path, force_multilabel=multilabel)) + res.config_logs(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir)) + res.config_vector_index(VectorIndexConfig(db_dir=db_dir)) + return res return _context diff --git 
a/tests/pipeline/test_optimization.py b/tests/pipeline/test_optimization.py index 6824a5a2..5e0908a5 100644 --- a/tests/pipeline/test_optimization.py +++ b/tests/pipeline/test_optimization.py @@ -3,7 +3,6 @@ import pytest -from autointent import Context from autointent.configs.optimization_cli import ( DataConfig, LoggingConfig, @@ -11,9 +10,9 @@ TaskConfig, VectorIndexConfig, ) -from autointent.pipeline import PipelineOptimizer from autointent.pipeline.optimization.cli_endpoint import main as optimize_pipeline from autointent.pipeline.optimization.utils import load_config +from tests.conftest import setup_environment ConfigType = Literal["multiclass", "multilabel"] @@ -27,30 +26,12 @@ def _get_config(config_type: ConfigType): return _get_config -@pytest.mark.parametrize( - "config_type", - ["multiclass", "multilabel"], -) -def test_full_pipeline(setup_environment, get_config, dataset, config_type: ConfigType): - db_dir, dump_dir, logs_dir = setup_environment - - context = Context(dataset=dataset, db_dir=db_dir(), dump_dir=dump_dir, force_multilabel=config_type == "multilabel") - - # run optimization - search_space_config = get_config(config_type) - pipeline = PipelineOptimizer.from_dict_config(search_space_config) - pipeline.optimize(context) - - # save results - pipeline.dump(logs_dir=logs_dir) - - @pytest.mark.parametrize( "dataset_type", ["multiclass", "multilabel", "description"], ) -def test_optimization_pipeline_cli(dataset_type, setup_environment): - db_dir, dump_dir, logs_dir = setup_environment +def test_optimization_pipeline_cli(dataset_type): + db_dir, dump_dir, logs_dir = setup_environment() config = OptimizationConfig( data=DataConfig( train_path=ires.files("tests.assets.data").joinpath("clinc_subset.json"), From a118e27a868c1adcfe0d1dd1edb04e2b16b6fc2d Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 13:04:39 +0300 Subject: [PATCH 04/21] fix typing --- autointent/context/context.py | 9 ++++++++- autointent/modules/prediction/base.py | 4 ++-- autointent/pipeline/inference/cli_endpoint.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/autointent/context/context.py b/autointent/context/context.py index 23186518..1164bf32 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -91,6 +91,9 @@ def dump(self) -> None: optimization_results = self.optimization_info.dump_evaluation_results() logs_dir = self.logging_config.dirpath + if logs_dir is None: + msg = "something's wrong with LoggingConfig" + raise ValueError(msg) # create appropriate directory logs_dir.mkdir(parents=True, exist_ok=True) @@ -135,7 +138,11 @@ def get_max_length(self) -> int | None: return self.vector_index_client.embedder_max_length def get_dump_dir(self) -> Path: - return self.logging_config.dump_dir + res = self.logging_config.dump_dir + if res is None: + msg = "something's wrong with LoggingConfig" + raise ValueError(msg) + return res def is_multilabel(self) -> bool: return self.data_handler.multilabel diff --git a/autointent/modules/prediction/base.py b/autointent/modules/prediction/base.py index 17b8de50..6681c0fe 100644 --- a/autointent/modules/prediction/base.py +++ b/autointent/modules/prediction/base.py @@ -52,8 +52,8 @@ def get_prediction_evaluation_data( return_scores = scores if oos_scores is not None: oos_labels = ( - [[0] * context.get_n_classes()] * len(oos_scores) if context.is_multilabel() else [-1] * len(oos_scores) - ) # type: ignore[list-item] + [[0] * context.get_n_classes()] * len(oos_scores) if context.is_multilabel() else [-1] * 
len(oos_scores) # type: ignore[list-item] + ) labels = np.concatenate([labels, np.array(oos_labels)]) return_scores = np.concatenate([scores, oos_scores]) diff --git a/autointent/pipeline/inference/cli_endpoint.py b/autointent/pipeline/inference/cli_endpoint.py index 257d5f98..690fe8a5 100644 --- a/autointent/pipeline/inference/cli_endpoint.py +++ b/autointent/pipeline/inference/cli_endpoint.py @@ -7,7 +7,7 @@ import yaml from autointent.configs.inference_cli import InferenceConfig -from autointent.pipeline.optimization.utils import NumpyEncoder +from autointent.context.utils import NumpyEncoder from .inference_pipeline import InferencePipeline From 0e3fe2a7e4c96228a6f64968ed2da1f3cb0e0e22 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 14:17:19 +0300 Subject: [PATCH 05/21] add `Context.set_datasets` and allow not dumping modules --- autointent/configs/optimization_cli.py | 1 + autointent/context/context.py | 15 +++++----- .../context/optimization_info/data_models.py | 2 +- .../nodes/optimization/node_optimizer.py | 30 +++++++++---------- 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/autointent/configs/optimization_cli.py b/autointent/configs/optimization_cli.py index 576ba64f..ca1d9471 100644 --- a/autointent/configs/optimization_cli.py +++ b/autointent/configs/optimization_cli.py @@ -28,6 +28,7 @@ class LoggingConfig: run_name: str | None = None dirpath: Path | None = None dump_dir: Path | None = None + dump_modules: bool = True def __post_init__(self) -> None: self.define_run_name() diff --git a/autointent/context/context.py b/autointent/context/context.py index 1164bf32..652751d1 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -14,7 +14,7 @@ VectorIndexConfig, ) -from .data_handler import DataAugmenter, DataHandler +from .data_handler import DataAugmenter, DataHandler, Dataset from .optimization_info import OptimizationInfo from .utils import NumpyEncoder, load_data from .vector_index_client import VectorIndex, VectorIndexClient @@ -68,6 +68,9 @@ def config_data(self, config: DataConfig, augmentation_config: AugmentationConfi augmenter=augmenter, ) + def set_datasets(self, train_data: Dataset, val_data: Dataset | None = None) -> None: + self.data_handler = DataHandler(dataset=train_data, test_dataset=val_data, random_seed=self.seed) + def get_best_index(self) -> VectorIndex: model_name = self.optimization_info.get_best_embedder() return self.vector_index_client.get_index(model_name) @@ -137,12 +140,10 @@ def get_batch_size(self) -> int: def get_max_length(self) -> int | None: return self.vector_index_client.embedder_max_length - def get_dump_dir(self) -> Path: - res = self.logging_config.dump_dir - if res is None: - msg = "something's wrong with LoggingConfig" - raise ValueError(msg) - return res + def get_dump_dir(self) -> Path | None: + if self.logging_config.dump_modules: + return self.logging_config.dump_dir + return None def is_multilabel(self) -> bool: return self.data_handler.multilabel diff --git a/autointent/context/optimization_info/data_models.py b/autointent/context/optimization_info/data_models.py index c74dc089..8a0e0248 100644 --- a/autointent/context/optimization_info/data_models.py +++ b/autointent/context/optimization_info/data_models.py @@ -77,7 +77,7 @@ class Trial(BaseModel): module_params: dict[str, Any] metric_name: str metric_value: float - module_dump_dir: str + module_dump_dir: str | None class Trials(BaseModel): diff --git a/autointent/nodes/optimization/node_optimizer.py 
b/autointent/nodes/optimization/node_optimizer.py
index 77c94652..26bd2db8 100644
--- a/autointent/nodes/optimization/node_optimizer.py
+++ b/autointent/nodes/optimization/node_optimizer.py
@@ -10,7 +10,9 @@
 from typing_extensions import Self
 
 from autointent.configs.node import NodeOptimizerConfig
+from autointent.configs.optimization_cli import LoggingConfig
 from autointent.context import Context
+from autointent.context.data_handler import Dataset
 from autointent.modules import Module
 from autointent.modules.prediction.base import get_prediction_evaluation_data
 from autointent.nodes.nodes_info import NODES_INFO
@@ -46,8 +48,12 @@ def fit(self, context: Context) -> None:
             metric_value = module.score(context, self.node_info.metrics_available[self.metric_name])
 
             assets = module.get_assets()
-            module_dump_dir = self.get_module_dump_dir(context.get_dump_dir(), module_type, j_combination)
-            module.dump(module_dump_dir)
+
+            dump_dir = context.get_dump_dir()
+
+            if dump_dir is not None:
+                dump_dir = self.get_module_dump_dir(dump_dir, module_type, j_combination)
+                module.dump(dump_dir)
 
             context.optimization_info.log_module_optimization(
                 self.node_info.node_type,
@@ -56,7 +62,7 @@ def fit(self, context: Context) -> None:
                 metric_value,
                 self.metric_name,
                 assets,  # retriever name / scores / predictions
-                module_dump_dir,
+                dump_dir,
             )
             module.clear_cache()
@@ -89,15 +95,9 @@ def module_fit(self, module: Module, context: Context) -> None:
             raise ValueError(msg)
         module.fit(*args)  # type: ignore[arg-type]
 
-    # @overload
-    # def fit(
-    #     self,
-    #     utterances: list[str],
-    #     labels: list[LabelType],
-    #     tags: list[Tag] | None = None,
-    #     label_descriptions: list[str] | None = None,
-    # ) -> None:
-    #     # create context object from given data
-    #     ...
-
-    #     # call fit(context)
+    def fit_from_dataset(self, train_data: Dataset, val_data: Dataset | None = None) -> None:
+        context = Context()
+        context.set_datasets(train_data, val_data)
+        context.config_logs(LoggingConfig(dump_dir=None))
+
+        self.fit(context)

From c4e15d905216a010038ebef5ea31c97429c6ed6 Mon Sep 17 00:00:00 2001
From: voorhs
Date: Tue, 5 Nov 2024 17:35:46 +0300
Subject: [PATCH 06/21] implement `PipelineOptimizer.optimize_from_dataset`

---
 .../nodes/optimization/node_optimizer.py      |    9 -
 .../optimization/pipeline_optimizer.py        |   12 +
 .../data/test_data.json                       |  366 ++++++
 .../data/train_data.json                      | 1055 +++++++++++++++++
 .../python-node-optimization/testbed.ipynb    |  109 ++
 5 files changed, 1542 insertions(+), 9 deletions(-)
 create mode 100644 experiments/python-node-optimization/data/test_data.json
 create mode 100644 experiments/python-node-optimization/data/train_data.json
 create mode 100644 experiments/python-node-optimization/testbed.ipynb

diff --git a/autointent/nodes/optimization/node_optimizer.py b/autointent/nodes/optimization/node_optimizer.py
index 26bd2db8..f542fac1 100644
--- a/autointent/nodes/optimization/node_optimizer.py
+++ b/autointent/nodes/optimization/node_optimizer.py
@@ -10,9 +10,7 @@
 from typing_extensions import Self
 
 from autointent.configs.node import NodeOptimizerConfig
-from autointent.configs.optimization_cli import LoggingConfig
 from autointent.context import Context
-from autointent.context.data_handler import Dataset
 from autointent.modules import Module
 from autointent.modules.prediction.base import get_prediction_evaluation_data
 from autointent.nodes.nodes_info import NODES_INFO
@@ -94,10 +92,3 @@ def module_fit(self, module: Module, context: Context) -> None:
             self._logger.error(msg)
             raise ValueError(msg)
         module.fit(*args)  # type: 
ignore[arg-type] - - def fit_from_dataset(self, train_data: Dataset, val_data: Dataset | None = None) -> None: - context = Context() - context.set_datasets(train_data, val_data) - context.config_logs(LoggingConfig(dump_dir=None)) - - self.fit(context) diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index 5125b8c5..54d9aad1 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -6,7 +6,9 @@ from hydra.utils import instantiate from autointent import Context +from autointent.configs.optimization_cli import EmbedderConfig, LoggingConfig, VectorIndexConfig from autointent.configs.pipeline_optimizer import PipelineOptimizerConfig +from autointent.context.data_handler import Dataset from autointent.nodes import NodeOptimizer @@ -25,6 +27,16 @@ def optimize(self, context: Context) -> None: for node_optimizer in self.nodes: node_optimizer.fit(context) + def optimize_from_dataset(self, train_data: Dataset, val_data: Dataset | None = None) -> Context: + context = Context() + context.set_datasets(train_data, val_data) + context.config_logs(LoggingConfig(dump_dir=None)) + context.config_vector_index(VectorIndexConfig(), EmbedderConfig()) + + self.optimize(context) + self.inference_config = context.optimization_info.get_inference_nodes_config() + return context + def make_report(logs: dict[str, Any], nodes: list[str]) -> str: ids = [np.argmax(logs["metrics"][node]) for node in nodes] diff --git a/experiments/python-node-optimization/data/test_data.json b/experiments/python-node-optimization/data/test_data.json new file mode 100644 index 00000000..615da45c --- /dev/null +++ b/experiments/python-node-optimization/data/test_data.json @@ -0,0 +1,366 @@ +{ + "utterances": [ + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "can you give me a moderately priced restaurant", + "label": [ + 6 + ] + }, + { + "text": "what area is it in", + "label": [ + 10 + ] + }, + { + "text": "thank you and good bye", + "label": [ + 2 + ] + }, + { + "text": "yes im looking for a traditional restaurant in the expensive price range", + "label": [ + 1, + 6 + ] + }, + { + "text": "im trying to find a vegetarian restaurant and i dont care regarding the price range", + "label": [ + 6 + ] + }, + { + "text": "chesterton", + "label": [ + 6 + ] + }, + { + "text": "does it have a television", + "label": [ + 10 + ] + }, + { + "text": "and what is the address and phone number", + "label": [ + 10 + ] + }, + { + "text": "thank you goodbye", + "label": [ + 2 + ] + }, + { + "text": "number", + "label": [ + 10 + ] + }, + { + "text": "im looking for a pub with and internet connection", + "label": [ + 6 + ] + }, + { + "text": "price", + "label": [ + 10 + ] + }, + { + "text": "no no", + "label": [ + 7 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "end of system audio no im looking for a seafood restaurant", + "label": [ + 6, + 7 + ] + }, + { + "text": "ok what is the address phone number and price", + "label": [ + 0, + 10 + ] + }, + { + "text": "yeah lets have you got anything in the mediterranean food in the area", + "label": [ + 1, + 6 + ] + }, + { + "text": "what about any other area", + "label": [ + 9 + ] + }, + { + "text": "whats the uh thank you and goodbye", + "label": [ + 2, + 12 + ] + }, + { + "text": "next type of food cherry hinton area", + "label": [ + 6, + 9 + ] + }, + { + 
"text": "yes", + "label": [ + 1 + ] + }, + { + "text": "can you select me another venue", + "label": [ + 9 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "ok thank you goodbye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "noise ah hi i am looking for an", + "label": [ + 5 + ] + }, + { + "text": "okay and uh", + "label": [ + 0 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "hi im looking for a pub having internet connection and have a tv", + "label": [ + 5, + 6 + ] + }, + { + "text": "ok thank you", + "label": [ + 0, + 12 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "what about mediterranean", + "label": [ + 6, + 9 + ] + }, + { + "text": "hi im looking for a mediterranean restaurant in the rosemary area", + "label": [ + 5, + 6 + ] + }, + { + "text": "alright thank you good bye", + "label": [ + 2, + 12 + ] + }, + { + "text": "ok i need the phone number and the area", + "label": [ + 0, + 10 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "is it in the kings hedge area", + "label": [ + 3 + ] + }, + { + "text": "what about contemporary restaurant", + "label": [ + 6, + 9 + ] + }, + { + "text": "hi im looking for a contemporary restaurant and is it should be free", + "label": [ + 3 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "is it in the city center jesus christ this is ridiculous", + "label": [ + 3 + ] + }, + { + "text": "is it burger gourmet", + "label": [ + 3 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "um is it free", + "label": [ + 3 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "hello i am looking for cheap restaurant in addenbrookes area", + "label": [ + 5, + 6 + ] + }, + { + "text": "can we start again", + "label": [ + 11 + ] + }, + { + "text": "hi im looking for cafe", + "label": [ + 5, + 6 + ] + }, + { + "text": "not spanish food not spanish food fast", + "label": [ + 4 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "say again", + "label": [ + 8 + ] + }, + { + "text": "start over", + "label": [ + 11 + ] + }, + { + "text": "uh cheap or moderate", + "label": [ + 4 + ] + }, + { + "text": "id like to start over please", + "label": [ + 11 + ] + }, + { + "text": "i need a pub not bakers", + "label": [ + 4, + 6 + ] + } + ] +} \ No newline at end of file diff --git a/experiments/python-node-optimization/data/train_data.json b/experiments/python-node-optimization/data/train_data.json new file mode 100644 index 00000000..2c9ac4cc --- /dev/null +++ b/experiments/python-node-optimization/data/train_data.json @@ -0,0 +1,1055 @@ +{ + "utterances": [ + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "vegetarian", + "label": [ + 6 + ] + }, + { + "text": "what is the eagles address", + "label": [ + 6, + 10 + ] + }, + { + "text": "telephone", + "label": [ + 10 + ] + }, + { + "text": "post code", + "label": [ + 10 + ] + }, + { + "text": "i dont care", + "label": [ + 6 + ] + }, + { + "text": "hi im looking for a pub", + "label": [ + 5, + 6 + ] + }, + { + "text": "what is the address phone number and postcode", + "label": [ + 10 + ] + }, + { + "text": "may i have the address and prices", + "label": [ + 10 + ] + }, + { + "text": "price", + "label": [ + 10 + ] + }, + { + "text": "restaurant", + "label": [ + 6 + ] + }, + { + "text": "and whats the post code", + "label": [ + 10 + ] + }, + { + "text": "i 
want to find a restaurant in kings hedges", + "label": [ + 6 + ] + }, + { + "text": "i would like japanese food", + "label": [ + 6 + ] + }, + { + "text": "yes i would like to know about a restaurant", + "label": [ + 1, + 6 + ] + }, + { + "text": "any price range", + "label": [ + 6 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "a restaurant in kings hedges", + "label": [ + 6 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "uh what are some other eareas", + "label": [ + 9 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "i dont care", + "label": [ + 6 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "castle hill area", + "label": [ + 6 + ] + }, + { + "text": "moderate", + "label": [ + 6 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "ya any part of the avenue is fine", + "label": [ + 6 + ] + }, + { + "text": "im looking for a japanese restaurant", + "label": [ + 6 + ] + }, + { + "text": "and what is the price of the venue", + "label": [ + 10 + ] + }, + { + "text": "what is the type of food", + "label": [ + 10 + ] + }, + { + "text": "ok what does it have a television", + "label": [ + 10 + ] + }, + { + "text": "whats the phone number", + "label": [ + 10 + ] + }, + { + "text": "uhm address", + "label": [ + 10 + ] + }, + { + "text": "does it have internet connection", + "label": [ + 10 + ] + }, + { + "text": "does it have a television", + "label": [ + 10 + ] + }, + { + "text": "and the phone number", + "label": [ + 10 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "yes please", + "label": [ + 1 + ] + }, + { + "text": "no particular venue", + "label": [ + 6, + 7 + ] + }, + { + "text": "goodbye", + "label": [ + 2 + ] + }, + { + "text": "yeah hi uh i want to find an internet connection and must have a tv", + "label": [ + 1, + 6 + ] + }, + { + "text": "breathing ok thank you goodbye", + "label": [ + 2, + 12 + ] + }, + { + "text": "thank you goodbye", + "label": [ + 2 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "next choice", + "label": [ + 9 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "thank you goodbye", + "label": [ + 2 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "how about indian", + "label": [ + 6, + 9 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "yes im looking for a moderately priced restaurant and it should be in the addenbrookes area", + "label": [ + 1, + 6 + ] + }, + { + "text": "no any part cheap food", + "label": [ + 6, + 7 + ] + }, + { + "text": "okay thank you goodbye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "thank you", + "label": [ + 12 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "no thank you goodbye", + "label": [ + 2, + 7, + 12 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "ah hi ah i am looking for a thia restaurant", + "label": [ + 5, + 6 + ] + }, + { + "text": "thank you", + "label": [ + 12 + ] + }, + { + "text": "yes i am", + "label": [ + 1 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "okay what is the", 
+ "label": [ + 0 + ] + }, + { + "text": "ok thank you what is the phone number and post code", + "label": [ + 0, + 10, + 12 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "ok unintelligible", + "label": [ + 0 + ] + }, + { + "text": "ok can i get an address phone number and post code please", + "label": [ + 0, + 10 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "ok and a", + "label": [ + 0 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "yes i am looking for a restaurant", + "label": [ + 1, + 6 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "no i want the postcode", + "label": [ + 7, + 10 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "ok whats the address phone number and post code", + "label": [ + 0, + 10 + ] + }, + { + "text": "breathing thank you good bye", + "label": [ + 2, + 12 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "yes i would like to know about a restaurant", + "label": [ + 1, + 6 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "what else do you have", + "label": [ + 9 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "ok thank you goodbye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "is it in the cheap price range", + "label": [ + 3 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "ok can you get me the phone number and the what type of food unintelligible", + "label": [ + 0, + 10 + ] + }, + { + "text": "any other pubs", + "label": [ + 6, + 9 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "clicking thank you good bye", + "label": [ + 2, + 12 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "ok and whats the phone number", + "label": [ + 0, + 10 + ] + }, + { + "text": "hi i am looking for an ex", + "label": [ + 5 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "is it located in addenbrookes area", + "label": [ + 3 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "is it a chinese takeaway restaurant", + "label": [ + 3 + ] + }, + { + "text": "say again", + "label": [ + 8 + ] + }, + { + "text": "is it cheap price range", + "label": [ + 3 + ] + }, + { + "text": "ya hi ah i want to find a fusion restaurant", + "label": [ + 5, + 6 + ] + }, + { + "text": "does the bakers have a television", + "label": [ + 3, + 10 + ] + }, + { + "text": "are there any other pubs", + "label": [ + 6, + 9 + ] + }, + { + "text": "ah does it have a moderate price range", + "label": [ + 3 + ] + }, + { + "text": "pick a different area", + "label": [ + 9 + ] + }, + { + "text": "breathing do you have anything in newnham area", + "label": [ + 6, + 9 + ] + }, + { + "text": "is it located in the castlehill area", + "label": [ + 3 + ] + }, + { + "text": "start over", + "label": [ + 11 + ] + }, + { + "text": "is it a thia restaurant", + "label": [ + 3 + ] + }, + { + "text": "are there any other options", + "label": [ + 9 + ] + }, + { + "text": "is it in the addenbrookes area", + "label": [ + 3 + ] + }, + { + "text": "how about castle hill", + "label": [ + 6, + 9 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + 
"text": "hi im looking for a fusion restaurant", + "label": [ + 5, + 6 + ] + }, + { + "text": "is it in the city center", + "label": [ + 3 + ] + }, + { + "text": "no is it in fenditton", + "label": [ + 3, + 7 + ] + }, + { + "text": "does it serve vegetarian food", + "label": [ + 3 + ] + }, + { + "text": "ah what about asian food", + "label": [ + 6, + 9 + ] + }, + { + "text": "is it in the cheap price range", + "label": [ + 3 + ] + }, + { + "text": "what else do you have", + "label": [ + 9 + ] + }, + { + "text": "do you have any others", + "label": [ + 9 + ] + }, + { + "text": "is it in the trumington area", + "label": [ + 3 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "noise is it in the moderate price range", + "label": [ + 3 + ] + }, + { + "text": "hi im looking for a fusion restaurant", + "label": [ + 5, + 6 + ] + }, + { + "text": "how about the fenditton area", + "label": [ + 6, + 9 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "hi im looking for a cheap restaurant in the fenditton area", + "label": [ + 5, + 6 + ] + }, + { + "text": "hi im looking for a restaurant in fenditton", + "label": [ + 5, + 6 + ] + }, + { + "text": "hi im looking for a cheap restaurant in the girton area", + "label": [ + 5, + 6 + ] + }, + { + "text": "hello", + "label": [ + 5 + ] + }, + { + "text": "any kind of food what about any kind of food", + "label": [ + 6, + 9 + ] + }, + { + "text": "hi im looking for an english restaurant", + "label": [ + 5, + 6 + ] + }, + { + "text": "hello", + "label": [ + 5 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "could you repeat", + "label": [ + 8 + ] + }, + { + "text": "hi im looking for a restaurant in fen ditton", + "label": [ + 5, + 6 + ] + }, + { + "text": "hi im looking for a fusion restaurant", + "label": [ + 5, + 6 + ] + }, + { + "text": "hi im looking for a restaurnt in the barnwell area", + "label": [ + 5, + 6 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "im sorry could you repeat that", + "label": [ + 8 + ] + }, + { + "text": "could you repeat that please", + "label": [ + 8 + ] + }, + { + "text": "start over", + "label": [ + 11 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "search again", + "label": [ + 8 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "i didnt ask for a moderate price range i need a pub with internet and tv", + "label": [ + 4, + 6 + ] + }, + { + "text": "may i restart", + "label": [ + 11 + ] + }, + { + "text": "start over", + "label": [ + 11 + ] + }, + { + "text": "lets start over please", + "label": [ + 11 + ] + }, + { + "text": "not so expensive price range", + "label": [ + 4 + ] + }, + { + "text": "less expensive", + "label": [ + 4 + ] + }, + { + "text": "um cheap cheap venue not indian", + "label": [ + 4, + 6 + ] + }, + { + "text": "can i start over", + "label": [ + 11 + ] + }, + { + "text": "lets start over", + "label": [ + 11 + ] + }, + { + "text": "lets not go to japan", + "label": [ + 4 + ] + }, + { + "text": "start over", + "label": [ + 11 + ] + }, + { + "text": "less expensive", + "label": [ + 4 + ] + }, + { + "text": "not free moderate", + "label": [ + 4, + 6 + ] + } + ] +} \ No newline at end of file diff --git a/experiments/python-node-optimization/testbed.ipynb b/experiments/python-node-optimization/testbed.ipynb new file mode 100644 index 00000000..b445de59 --- /dev/null +++ 
b/experiments/python-node-optimization/testbed.ipynb @@ -0,0 +1,109 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(165, 57)" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from autointent.context.data_handler import Dataset\n", + "from autointent.context.utils import load_data\n", + "\n", + "scoring_dataset = load_data(\"./data/train_data.json\")\n", + "prediction_dataset = load_data(\"./data/test_data.json\")\n", + "len(scoring_dataset.utterances), len(prediction_dataset.utterances)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from autointent.pipeline.optimization import PipelineOptimizer\n", + "\n", + "config = {\n", + " \"nodes\": [\n", + " {\n", + " \"node_type\": \"scoring\",\n", + " \"metric\": \"scoring_roc_auc\",\n", + " \"search_space\": [\n", + " {\"module_type\": \"knn\", \"k\": [5, 10], \"weights\": [\"uniform\", \"distance\", \"closest\"], \"model_name\": [\"avsolatorio/GIST-small-Embedding-v0\"]},\n", + " {\"module_type\": \"linear\", \"model_name\": [\"avsolatorio/GIST-small-Embedding-v0\"]},\n", + " # {\n", + " # \"module_type\": \"dnnc\",\n", + " # \"cross_encoder_name\": [\"cross-encoder/ms-marco-MiniLM-L-6-v2\", \"avsolatorio/GIST-small-Embedding-v0\"],\n", + " # \"search_model_name\": [\"avsolatorio/GIST-small-Embedding-v0\"],\n", + " # \"k\": [1, 3],\n", + " # \"train_head\": [False, True],\n", + " # },\n", + " ],\n", + " },\n", + " {\n", + " \"node_type\": \"prediction\",\n", + " \"metric\": \"prediction_accuracy\",\n", + " \"search_space\": [\n", + " {\"module_type\": \"threshold\", \"thresh\": [0.5]},\n", + " {\"module_type\": \"tunable\"},\n", + " # {\"module_type\": \"argmax\"},\n", + " # {\"module_type\": \"jinoos\"},\n", + " ],\n", + " },\n", + " ]\n", + "}\n", + "\n", + "pipeline_optimizer = PipelineOptimizer.from_dict_config(config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context = pipeline_optimizer.optimize_from_dataset(scoring_dataset, prediction_dataset) # data with partitions: train_1, train_2, val_1, val_2, test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context.dump()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "autointent-D7M6VOhJ-py3.12", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 109d47e784b89182139f3ff685078e4b8d32d487 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 18:11:42 +0300 Subject: [PATCH 07/21] enable configuration for python api --- .../optimization/pipeline_optimizer.py | 27 ++++++- .../python-node-optimization/testbed.ipynb | 70 ++++++++++++++++++- 2 files changed, 91 insertions(+), 6 deletions(-) diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index 54d9aad1..f3fb7726 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -13,10 +13,31 @@ class PipelineOptimizer: - def 
__init__(self, nodes: list[NodeOptimizer]) -> None: + def __init__( + self, + nodes: list[NodeOptimizer], + ) -> None: self._logger = logging.getLogger(__name__) self.nodes = nodes + self.logging_config = LoggingConfig(dump_dir=None) + self.vector_index_config = VectorIndexConfig() + self.embedder_config = EmbedderConfig() + + def set_config( + self, + config: LoggingConfig | VectorIndexConfig | EmbedderConfig + ) -> None: + if isinstance(config, LoggingConfig): + self.logging_config = config + elif isinstance(config, VectorIndexConfig): + self.vector_index_config = config + elif isinstance(config, EmbedderConfig): + self.embedder_config = config + else: + msg = "unknown config type" + raise TypeError(msg) + @classmethod def from_dict_config(cls, config: dict[str, Any]) -> "PipelineOptimizer": return instantiate(PipelineOptimizerConfig, **config) # type: ignore[no-any-return] @@ -30,8 +51,8 @@ def optimize(self, context: Context) -> None: def optimize_from_dataset(self, train_data: Dataset, val_data: Dataset | None = None) -> Context: context = Context() context.set_datasets(train_data, val_data) - context.config_logs(LoggingConfig(dump_dir=None)) - context.config_vector_index(VectorIndexConfig(), EmbedderConfig()) + context.config_logs(self.logging_config) + context.config_vector_index(self.vector_index_config, self.embedder_config) self.optimize(context) self.inference_config = context.optimization_info.get_inference_nodes_config() diff --git a/experiments/python-node-optimization/testbed.ipynb b/experiments/python-node-optimization/testbed.ipynb index b445de59..12962095 100644 --- a/experiments/python-node-optimization/testbed.ipynb +++ b/experiments/python-node-optimization/testbed.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Simpler Pipeline Optimization Demo" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load datasets" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -25,9 +39,16 @@ "len(scoring_dataset.utterances), len(prediction_dataset.utterances)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Search Space" + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -66,18 +87,61 @@ "pipeline_optimizer = PipelineOptimizer.from_dict_config(config)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Optional] Configure Your Run" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], + "source": [ + "from autointent.configs.optimization_cli import LoggingConfig, VectorIndexConfig, EmbedderConfig\n", + "from pathlib import Path\n", + "\n", + "pipeline_optimizer.set_config(LoggingConfig(run_name=\"sweet_cucumber\", dirpath=Path(\".\").resolve(), dump_modules=False))\n", + "pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(\"./my_vector_db\").resolve(), device=\"cuda\"))\n", + "pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[I 2024-11-05 18:08:17,123] A new study created in memory with name: no-name-5066322d-4fcd-4a17-8699-c3670e71e698\n" + ] + } + ], "source": [ "context = 
pipeline_optimizer.optimize_from_dataset(scoring_dataset, prediction_dataset) # data with partitions: train_1, train_2, val_1, val_2, test" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Logs" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ From 2b0d371e615a85e350ab24c0353fc8457de6e2e3 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 18:14:41 +0300 Subject: [PATCH 08/21] fix typing --- autointent/configs/node.py | 2 +- autointent/context/optimization_info/optimization_info.py | 2 +- autointent/nodes/optimization/node_optimizer.py | 8 +++++--- autointent/pipeline/optimization/pipeline_optimizer.py | 5 +---- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/autointent/configs/node.py b/autointent/configs/node.py index f3c41b99..e811fea6 100644 --- a/autointent/configs/node.py +++ b/autointent/configs/node.py @@ -9,7 +9,7 @@ class InferenceNodeConfig: node_type: str = MISSING module_type: str = MISSING module_config: dict[str, Any] = MISSING - load_path: str = MISSING + load_path: str | None = None _target_: str = "autointent.nodes.InferenceNode" diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index 3cfa2a72..7bee5ea7 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -29,7 +29,7 @@ def log_module_optimization( metric_value: float, metric_name: str, artifact: Artifact, - module_dump_dir: str, + module_dump_dir: str | None, ) -> None: """ Purposes: diff --git a/autointent/nodes/optimization/node_optimizer.py b/autointent/nodes/optimization/node_optimizer.py index f542fac1..897cbf6d 100644 --- a/autointent/nodes/optimization/node_optimizer.py +++ b/autointent/nodes/optimization/node_optimizer.py @@ -50,8 +50,10 @@ def fit(self, context: Context) -> None: dump_dir = context.get_dump_dir() if dump_dir is not None: - dump_dir = self.get_module_dump_dir(dump_dir, module_type, j_combination) - module.dump(dump_dir) + module_dump_dir = self.get_module_dump_dir(dump_dir, module_type, j_combination) + module.dump(module_dump_dir) + else: + module_dump_dir = None context.optimization_info.log_module_optimization( self.node_info.node_type, @@ -60,7 +62,7 @@ def fit(self, context: Context) -> None: metric_value, self.metric_name, assets, # retriever name / scores / predictions - dump_dir, + module_dump_dir, ) module.clear_cache() diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index f3fb7726..d0a521c1 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -24,10 +24,7 @@ def __init__( self.vector_index_config = VectorIndexConfig() self.embedder_config = EmbedderConfig() - def set_config( - self, - config: LoggingConfig | VectorIndexConfig | EmbedderConfig - ) -> None: + def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig) -> None: if isinstance(config, LoggingConfig): self.logging_config = config elif isinstance(config, VectorIndexConfig): From d7c4066760440d5420aacd6e1d935eeed426b05c Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 18:45:12 +0300 Subject: [PATCH 09/21] fix tests --- autointent/context/context.py | 8 +++- .../optimization/pipeline_optimizer.py | 6 ++- .../python-node-optimization/testbed.ipynb | 2 +- 
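To make the Python-level API introduced in patches 07 to 09 concrete, here is a minimal usage sketch assembled from the diffs and the demo notebook in this series. The search space is trimmed to two nodes, and the dataset paths are illustrative placeholders rather than part of the codebase:

```python
from pathlib import Path

from autointent.configs.optimization_cli import EmbedderConfig, LoggingConfig, VectorIndexConfig
from autointent.context.utils import load_data
from autointent.pipeline.optimization import PipelineOptimizer

# Search space in the same shape as the notebook's `config` dict (trimmed here).
search_space = {
    "nodes": [
        {
            "node_type": "scoring",
            "metric": "scoring_roc_auc",
            "search_space": [
                {"module_type": "linear", "model_name": ["avsolatorio/GIST-small-Embedding-v0"]},
            ],
        },
        {
            "node_type": "prediction",
            "metric": "prediction_accuracy",
            "search_space": [{"module_type": "threshold", "thresh": [0.5]}],
        },
    ]
}

optimizer = PipelineOptimizer.from_dict_config(search_space)

# Each set_config call dispatches on the config object's type (see patch 07).
optimizer.set_config(LoggingConfig(dirpath=Path.cwd(), dump_modules=False))
optimizer.set_config(VectorIndexConfig(db_dir=Path("./my_vector_db").resolve(), device="cpu"))
optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))

# Datasets are loaded the same way the notebook does; the paths are placeholders.
train_dataset = load_data("./data/train_data.json")
val_dataset = load_data("./data/test_data.json")
context = optimizer.optimize_from_dataset(train_dataset, val_dataset)
```

Dispatching `set_config` on the config object's type keeps a single method name for all run-level settings while the config dataclasses remain the single source of defaults.
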
tests/pipeline/test_optimization.py | 45 ++++++++++++++----- 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/autointent/context/context.py b/autointent/context/context.py index 652751d1..9dabf70e 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -68,8 +68,12 @@ def config_data(self, config: DataConfig, augmentation_config: AugmentationConfi augmenter=augmenter, ) - def set_datasets(self, train_data: Dataset, val_data: Dataset | None = None) -> None: - self.data_handler = DataHandler(dataset=train_data, test_dataset=val_data, random_seed=self.seed) + def set_datasets( + self, train_data: Dataset, val_data: Dataset | None = None, force_multilabel: bool = False + ) -> None: + self.data_handler = DataHandler( + dataset=train_data, test_dataset=val_data, random_seed=self.seed, force_multilabel=force_multilabel + ) def get_best_index(self) -> VectorIndex: model_name = self.optimization_info.get_best_embedder() diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index d0a521c1..3552dad0 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -45,9 +45,11 @@ def optimize(self, context: Context) -> None: for node_optimizer in self.nodes: node_optimizer.fit(context) - def optimize_from_dataset(self, train_data: Dataset, val_data: Dataset | None = None) -> Context: + def optimize_from_dataset( + self, train_data: Dataset, val_data: Dataset | None = None, force_multilabel: bool = False + ) -> Context: context = Context() - context.set_datasets(train_data, val_data) + context.set_datasets(train_data, val_data, force_multilabel) context.config_logs(self.logging_config) context.config_vector_index(self.vector_index_config, self.embedder_config) diff --git a/experiments/python-node-optimization/testbed.ipynb b/experiments/python-node-optimization/testbed.ipynb index 12962095..5c0d6842 100644 --- a/experiments/python-node-optimization/testbed.ipynb +++ b/experiments/python-node-optimization/testbed.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [ { diff --git a/tests/pipeline/test_optimization.py b/tests/pipeline/test_optimization.py index 5e0908a5..4e22ccf9 100644 --- a/tests/pipeline/test_optimization.py +++ b/tests/pipeline/test_optimization.py @@ -1,50 +1,71 @@ import importlib.resources as ires +from pathlib import Path from typing import Literal import pytest from autointent.configs.optimization_cli import ( DataConfig, + EmbedderConfig, LoggingConfig, OptimizationConfig, TaskConfig, VectorIndexConfig, ) +from autointent.pipeline.optimization import PipelineOptimizer from autointent.pipeline.optimization.cli_endpoint import main as optimize_pipeline from autointent.pipeline.optimization.utils import load_config from tests.conftest import setup_environment -ConfigType = Literal["multiclass", "multilabel"] +TaskType = Literal["multiclass", "multilabel", "description"] -@pytest.fixture -def get_config(): - def _get_config(config_type: ConfigType): - config_path = ires.files("tests.assets.configs").joinpath(f"{config_type}.yaml") - return load_config(str(config_path), multilabel=config_type == "multilabel") +def get_search_space_path(task_type: TaskType): + return ires.files("tests.assets.configs").joinpath(f"{task_type}.yaml") - return _get_config + +def get_search_space(task_type: TaskType): + path = 
get_search_space_path(task_type) + return load_config(str(path), multilabel=task_type == "multilabel") + + +@pytest.mark.parametrize( + "task_type", + ["multiclass", "multilabel", "description"], +) +def test_no_context_optimization(dataset, task_type): + db_dir, dump_dir, logs_dir = setup_environment() + search_space = get_search_space(task_type) + + pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space) + + pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=False)) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cpu")) + pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) + + context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) + context.dump() @pytest.mark.parametrize( - "dataset_type", + "task_type", ["multiclass", "multilabel", "description"], ) -def test_optimization_pipeline_cli(dataset_type): +def test_optimization_pipeline_cli(task_type): db_dir, dump_dir, logs_dir = setup_environment() config = OptimizationConfig( data=DataConfig( train_path=ires.files("tests.assets.data").joinpath("clinc_subset.json"), - force_multilabel=(dataset_type == "multilabel"), + force_multilabel=(task_type == "multilabel"), ), task=TaskConfig( - search_space_path=ires.files("tests.assets.configs").joinpath(f"{dataset_type}.yaml"), + search_space_path=get_search_space_path(task_type), ), vector_index=VectorIndexConfig( device="cpu", ), logs=LoggingConfig( - dirpath=logs_dir, + dirpath=Path(logs_dir), ), ) optimize_pipeline(config) From d305bb5dd574d238868bab25d1a2326484d37224 Mon Sep 17 00:00:00 2001 From: voorhs Date: Wed, 6 Nov 2024 11:56:11 +0300 Subject: [PATCH 10/21] add `clear_ram` option --- autointent/configs/optimization_cli.py | 3 ++- autointent/context/context.py | 3 +++ autointent/context/optimization_info/data_models.py | 12 ++++++++++++ .../context/optimization_info/optimization_info.py | 9 ++++++++- autointent/nodes/optimization/node_optimizer.py | 8 +++++--- 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/autointent/configs/optimization_cli.py b/autointent/configs/optimization_cli.py index ca1d9471..2d3ea290 100644 --- a/autointent/configs/optimization_cli.py +++ b/autointent/configs/optimization_cli.py @@ -28,7 +28,8 @@ class LoggingConfig: run_name: str | None = None dirpath: Path | None = None dump_dir: Path | None = None - dump_modules: bool = True + dump_modules: bool = False + clear_ram: bool = True def __post_init__(self) -> None: self.define_run_name() diff --git a/autointent/context/context.py b/autointent/context/context.py index 9dabf70e..dcd46e46 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -154,3 +154,6 @@ def is_multilabel(self) -> bool: def get_n_classes(self) -> int: return self.data_handler.n_classes + + def is_ram_to_clear(self) -> bool: + return self.logging_config.clear_ram diff --git a/autointent/context/optimization_info/data_models.py b/autointent/context/optimization_info/data_models.py index 8a0e0248..83e10931 100644 --- a/autointent/context/optimization_info/data_models.py +++ b/autointent/context/optimization_info/data_models.py @@ -4,6 +4,8 @@ from numpy.typing import NDArray from pydantic import BaseModel, ConfigDict, Field +# from autointent.modules.base import str + class Artifact(BaseModel): ... 
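The `clear_ram` flag added in this patch inverts the optimizer's previous always-clear behaviour: when it is true, each trial is logged without a module reference and the fitted module's caches are freed; when it is false, the module object itself is retained on the optimization info so later stages can reuse it without reloading from disk. A simplified sketch of the branch this patch adds to `NodeOptimizer.fit`; the `finish_trial` wrapper is an illustration, only the calls inside it come from the diff:

```python
import gc

import torch


def finish_trial(context, module, log_trial) -> None:
    # Keep a live reference to the fitted module only if the user
    # opted out of clearing RAM.
    keep_in_ram = not context.is_ram_to_clear()
    log_trial(module=module if keep_in_ram else None)

    if keep_in_ram:
        return

    # Otherwise free everything the trial allocated, as the optimizer
    # previously did unconditionally after every trial.
    module.clear_cache()
    gc.collect()
    torch.cuda.empty_cache()
```
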
@@ -115,3 +117,13 @@ def get_best_trial_idx(self, node_type: str) -> int | None: def set_best_trial_idx(self, node_type: str, idx: int) -> None: setattr(self, validate_node_name(node_type), idx) + + +class ModulesList(BaseModel): + regexp: list[str] = [] + retrieval: list[str] = [] + scoring: list[str] = [] + prediction: list[str] = [] + + def get(self, node_type: str) -> list[str]: + return getattr(self, node_type) diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index 7bee5ea7..027cc2de 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -4,8 +4,9 @@ from numpy.typing import NDArray from autointent.configs.node import InferenceNodeConfig +# from autointent.modules.base import Module -from .data_models import Artifact, Artifacts, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds +from .data_models import Artifact, Artifacts, ModulesList, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds from .logger import get_logger @@ -20,6 +21,7 @@ def __init__(self) -> None: self.artifacts = Artifacts() self.trials = Trials() self._trials_best_ids = TrialsIds() + self.modules = ModulesList() def log_module_optimization( self, @@ -30,6 +32,7 @@ def log_module_optimization( metric_name: str, artifact: Artifact, module_dump_dir: str | None, + module = None, ) -> None: """ Purposes: @@ -48,6 +51,10 @@ def log_module_optimization( self.trials.add_trial(node_type, trial) self._logger.info(trial.model_dump()) + # save module + if module is not None: + self.modules.get(node_type).append(module) + # save artifact self.artifacts.add_artifact(node_type, artifact) diff --git a/autointent/nodes/optimization/node_optimizer.py b/autointent/nodes/optimization/node_optimizer.py index 897cbf6d..b281e01b 100644 --- a/autointent/nodes/optimization/node_optimizer.py +++ b/autointent/nodes/optimization/node_optimizer.py @@ -63,11 +63,13 @@ def fit(self, context: Context) -> None: self.metric_name, assets, # retriever name / scores / predictions module_dump_dir, + module=module if not context.is_ram_to_clear() else None ) - module.clear_cache() - gc.collect() - torch.cuda.empty_cache() + if context.is_ram_to_clear(): + module.clear_cache() + gc.collect() + torch.cuda.empty_cache() self._logger.info("%s node optimization is finished!", self.node_info.node_type) From d648849efb406870af80c0d13b1f76875ec2500b Mon Sep 17 00:00:00 2001 From: voorhs Date: Wed, 6 Nov 2024 13:14:39 +0300 Subject: [PATCH 11/21] infering modules from ram after optimization --- autointent/context/context.py | 4 + .../optimization_info/optimization_info.py | 13 +- autointent/nodes/inference/inference_node.py | 19 ++- .../pipeline/inference/inference_pipeline.py | 25 ++- .../python-node-optimization/testbed.ipynb | 161 ++++++++++++++++-- 5 files changed, 191 insertions(+), 31 deletions(-) diff --git a/autointent/context/context.py b/autointent/context/context.py index dcd46e46..4e67a614 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -157,3 +157,7 @@ def get_n_classes(self) -> int: def is_ram_to_clear(self) -> bool: return self.logging_config.clear_ram + + def has_saved_modules(self) -> bool: + node_types = ["regexp", "retrieval", "scoring", "prediction"] + return any(len(self.optimization_info.modules.get(nt)) > 0 for nt in node_types) diff --git a/autointent/context/optimization_info/optimization_info.py 
b/autointent/context/optimization_info/optimization_info.py index 027cc2de..b96d831b 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -4,8 +4,8 @@ from numpy.typing import NDArray from autointent.configs.node import InferenceNodeConfig -# from autointent.modules.base import Module +# from autointent.modules.base import Module from .data_models import Artifact, Artifacts, ModulesList, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds from .logger import get_logger @@ -116,3 +116,14 @@ def get_inference_nodes_config(self) -> list[InferenceNodeConfig]: ) ) return res + + def _get_best_module(self, node_type: str): + idx = self._get_best_trial_idx(node_type) + if idx is not None: + return self.modules.get(node_type)[idx] + return None + + def get_best_modules(self): + node_types = ["regexp", "retrieval", "scoring", "prediction"] + res = {nt: self._get_best_module(nt) for nt in node_types} + return {nt: m for nt, m in res.items() if m is not None} diff --git a/autointent/nodes/inference/inference_node.py b/autointent/nodes/inference/inference_node.py index 9549f576..361fd92a 100644 --- a/autointent/nodes/inference/inference_node.py +++ b/autointent/nodes/inference/inference_node.py @@ -1,22 +1,25 @@ import gc -from typing import Any import torch -from hydra.utils import instantiate from autointent.configs.node import InferenceNodeConfig +from autointent.modules.base import Module from autointent.nodes.nodes_info import NODES_INFO class InferenceNode: - def __init__(self, node_type: str, module_type: str, module_config: dict[str, Any], load_path: str) -> None: - self.node_info = NODES_INFO[node_type] - self.module = self.node_info.modules_available[module_type](**module_config) - self.module.load(load_path) + def __init__(self, module: Module, node_type: str) -> None: + self.module = module + self.node_type = node_type @classmethod - def from_dict_config(cls, config: dict[str, Any]) -> "InferenceNode": - return instantiate(InferenceNodeConfig, **config) # type: ignore[no-any-return] + def from_config( + cls, config: InferenceNodeConfig + ) -> "InferenceNode": + node_info = NODES_INFO[config.node_type] + module = node_info.modules_available[config.module_type](**config.module_config) + module.load(config.load_path) + return cls(module, config.node_type) def clear_cache(self) -> None: self.module.clear_cache() diff --git a/autointent/pipeline/inference/inference_pipeline.py b/autointent/pipeline/inference/inference_pipeline.py index abcd2fda..372ae729 100644 --- a/autointent/pipeline/inference/inference_pipeline.py +++ b/autointent/pipeline/inference/inference_pipeline.py @@ -1,19 +1,17 @@ -from typing import Any - -from hydra.utils import instantiate - -from autointent.configs.inference_pipeline import InferencePipelineConfig +from autointent.configs.node import InferenceNodeConfig +from autointent.context import Context from autointent.custom_types import LabelType from autointent.nodes.inference import InferenceNode class InferencePipeline: def __init__(self, nodes: list[InferenceNode]) -> None: - self.nodes = {node.node_info.node_type: node for node in nodes} + self.nodes = {n.node_type: n for n in nodes} @classmethod - def from_dict_config(cls, config: dict[str, Any]) -> "InferencePipeline": - return instantiate(InferencePipelineConfig, **config) # type: ignore[no-any-return] + def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> None: + nodes = [InferenceNode.from_config(cfg) for 
cfg in nodes_configs] + return cls(nodes) def predict(self, utterances: list[str]) -> list[LabelType]: scores = self.nodes["scoring"].module.predict(utterances) @@ -21,3 +19,14 @@ def predict(self, utterances: list[str]) -> list[LabelType]: def fit(self, utterances: list[str], labels: list[LabelType]) -> None: pass + + @classmethod + def from_context(cls, context: Context) -> "InferencePipeline": + if not context.has_saved_modules(): + config = context.optimization_info.get_inference_nodes_config() + return cls.from_config(config) + nodes = [ + InferenceNode(module, node_type) + for node_type, module in context.optimization_info.get_best_modules().items() + ] + return InferencePipeline(nodes) diff --git a/experiments/python-node-optimization/testbed.ipynb b/experiments/python-node-optimization/testbed.ipynb index 5c0d6842..95cc6649 100644 --- a/experiments/python-node-optimization/testbed.ipynb +++ b/experiments/python-node-optimization/testbed.ipynb @@ -1,5 +1,15 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -11,12 +21,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Load datasets" + "## Replicate full-fledged optimization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load datasets" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -25,7 +42,7 @@ "(165, 57)" ] }, - "execution_count": 1, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -43,12 +60,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Define Search Space" + "### Define Search Space" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -91,19 +108,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## [Optional] Configure Your Run" + "### [Optional] Configure Your Run" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from autointent.configs.optimization_cli import LoggingConfig, VectorIndexConfig, EmbedderConfig\n", "from pathlib import Path\n", "\n", - "pipeline_optimizer.set_config(LoggingConfig(run_name=\"sweet_cucumber\", dirpath=Path(\".\").resolve(), dump_modules=False))\n", + "pipeline_optimizer.set_config(LoggingConfig(run_name=\"sweet_cucumber\", dirpath=Path.cwd(), dump_modules=True, clear_ram=True))\n", "pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(\"./my_vector_db\").resolve(), device=\"cuda\"))\n", "pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))" ] @@ -112,41 +129,157 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Run Optimization" + "### Run Optimization" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[I 2024-11-05 18:08:17,123] A new study created in memory with name: no-name-5066322d-4fcd-4a17-8699-c3670e71e698\n" + "[I 2024-11-06 13:10:03,339] A new study created in memory with name: no-name-85c71fe7-cc94-448b-a9a0-46470688fb6b\n" ] } ], "source": [ - "context = pipeline_optimizer.optimize_from_dataset(scoring_dataset, prediction_dataset) # data with partitions: train_1, train_2, val_1, val_2, test" + "context = pipeline_optimizer.optimize_from_dataset(scoring_dataset, 
prediction_dataset)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Save Logs" + "### Save Logs" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "context.dump()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from autointent.pipeline.inference import InferencePipeline\n", + "\n", + "inference_pipeline = InferencePipeline.from_context(context)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inference_pipeline.predict([\"hello world\", \"what is the eagles address\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## No modules dumping" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + } + ], + "source": [ + "! rm -rf sweet_cucumber*" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_optimizer.set_config(LoggingConfig(dump_modules=False, clear_ram=False))\n", + "pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(\"./my_vector_db\").resolve(), device=\"cuda\"))\n", + "pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "context = pipeline_optimizer.optimize_from_dataset(scoring_dataset, prediction_dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "inference_pipeline = InferencePipeline.from_context(context)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inference_pipeline.predict([\"hello world\", \"what is the eagles address\"])" + ] } ], "metadata": { From e7d0fbd86942116dcfe467ba0eaa1733c4756700 Mon Sep 17 00:00:00 2001 From: voorhs Date: Wed, 6 Nov 2024 13:34:09 +0300 Subject: [PATCH 12/21] minor change --- .../pipeline/inference/inference_pipeline.py | 2 +- .../python-node-optimization/testbed.ipynb | 54 +++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/autointent/pipeline/inference/inference_pipeline.py b/autointent/pipeline/inference/inference_pipeline.py index 372ae729..c6926426 100644 --- a/autointent/pipeline/inference/inference_pipeline.py +++ b/autointent/pipeline/inference/inference_pipeline.py @@ -9,7 +9,7 @@ def __init__(self, nodes: list[InferenceNode]) -> 
None: self.nodes = {n.node_type: n for n in nodes} @classmethod - def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> None: + def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> "InferencePipeline": nodes = [InferenceNode.from_config(cfg) for cfg in nodes_configs] return cls(nodes) diff --git a/experiments/python-node-optimization/testbed.ipynb b/experiments/python-node-optimization/testbed.ipynb index 95cc6649..746eb49f 100644 --- a/experiments/python-node-optimization/testbed.ipynb +++ b/experiments/python-node-optimization/testbed.ipynb @@ -141,7 +141,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[I 2024-11-06 13:10:03,339] A new study created in memory with name: no-name-85c71fe7-cc94-448b-a9a0-46470688fb6b\n" + "[I 2024-11-06 13:31:52,764] A new study created in memory with name: no-name-4b9b4940-4128-414d-8733-3236f7935735\n" ] } ], @@ -165,16 +165,64 @@ "context.dump()" ] }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "inference_config = context.optimization_info.get_inference_nodes_config()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run Inference from file system" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from autointent.pipeline.inference import InferencePipeline\n", + "\n", + "inference_pipeline = InferencePipeline.from_config(inference_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inference_pipeline.predict([\"hello world\", \"what is the eagles address\"])" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Run Inference" + "## Run Inference from context [but from file system]" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ From 975c8dfa50ede43c81367e094b2ee063cfd16877 Mon Sep 17 00:00:00 2001 From: voorhs Date: Wed, 6 Nov 2024 13:46:27 +0300 Subject: [PATCH 13/21] fix unintended `runs` directory creation --- autointent/configs/optimization_cli.py | 1 - autointent/context/optimization_info/optimization_info.py | 3 +-- autointent/nodes/inference/inference_node.py | 4 +--- autointent/nodes/optimization/node_optimizer.py | 2 +- autointent/pipeline/optimization/utils/__init__.py | 3 +-- autointent/pipeline/optimization/utils/cli.py | 8 -------- 6 files changed, 4 insertions(+), 17 deletions(-) diff --git a/autointent/configs/optimization_cli.py b/autointent/configs/optimization_cli.py index 2d3ea290..080b602c 100644 --- a/autointent/configs/optimization_cli.py +++ b/autointent/configs/optimization_cli.py @@ -46,7 +46,6 @@ def define_dirpath(self) -> None: if self.run_name is None: raise ValueError self.dirpath = dirpath / self.run_name - self.dirpath.mkdir(parents=True) def define_dump_dir(self) -> None: if self.dump_dir is None: diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index b96d831b..bce20945 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -5,7 +5,6 @@ from autointent.configs.node import 
InferenceNodeConfig -# from autointent.modules.base import Module from .data_models import Artifact, Artifacts, ModulesList, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds from .logger import get_logger @@ -32,7 +31,7 @@ def log_module_optimization( metric_name: str, artifact: Artifact, module_dump_dir: str | None, - module = None, + module=None, ) -> None: """ Purposes: diff --git a/autointent/nodes/inference/inference_node.py b/autointent/nodes/inference/inference_node.py index 361fd92a..705fb526 100644 --- a/autointent/nodes/inference/inference_node.py +++ b/autointent/nodes/inference/inference_node.py @@ -13,9 +13,7 @@ def __init__(self, module: Module, node_type: str) -> None: self.node_type = node_type @classmethod - def from_config( - cls, config: InferenceNodeConfig - ) -> "InferenceNode": + def from_config(cls, config: InferenceNodeConfig) -> "InferenceNode": node_info = NODES_INFO[config.node_type] module = node_info.modules_available[config.module_type](**config.module_config) module.load(config.load_path) diff --git a/autointent/nodes/optimization/node_optimizer.py b/autointent/nodes/optimization/node_optimizer.py index b281e01b..2a483556 100644 --- a/autointent/nodes/optimization/node_optimizer.py +++ b/autointent/nodes/optimization/node_optimizer.py @@ -63,7 +63,7 @@ def fit(self, context: Context) -> None: self.metric_name, assets, # retriever name / scores / predictions module_dump_dir, - module=module if not context.is_ram_to_clear() else None + module=module if not context.is_ram_to_clear() else None, ) if context.is_ram_to_clear(): diff --git a/autointent/pipeline/optimization/utils/__init__.py b/autointent/pipeline/optimization/utils/__init__.py index e0aa17b6..2a948bd8 100644 --- a/autointent/pipeline/optimization/utils/__init__.py +++ b/autointent/pipeline/optimization/utils/__init__.py @@ -1,6 +1,5 @@ -from .cli import get_logs_dir, load_config +from .cli import load_config __all__ = [ "load_config", - "get_logs_dir", ] diff --git a/autointent/pipeline/optimization/utils/cli.py b/autointent/pipeline/optimization/utils/cli.py index 2cd4e911..7ceb8967 100644 --- a/autointent/pipeline/optimization/utils/cli.py +++ b/autointent/pipeline/optimization/utils/cli.py @@ -6,14 +6,6 @@ import yaml -def get_logs_dir(run_name: str, logs_dir: Path | None = None) -> Path: - if logs_dir is None: - logs_dir = Path.cwd() - res = logs_dir / run_name - res.mkdir(parents=True) - return res - - def load_config(config_path: str | Path | None, multilabel: bool, logger: Logger | None = None) -> dict[str, Any]: """load config from the given path or load default config which is distributed along with the autointent package""" if config_path is not None: From 378e582c977b778ca869d192c3f974929b9fcabb Mon Sep 17 00:00:00 2001 From: voorhs Date: Wed, 6 Nov 2024 14:03:23 +0300 Subject: [PATCH 14/21] add `save_db` option --- autointent/configs/optimization_cli.py | 1 + .../context/vector_index_client/cache.py | 97 +------------------ .../vector_index_client.py | 3 + .../optimization/pipeline_optimizer.py | 2 + 4 files changed, 8 insertions(+), 95 deletions(-) diff --git a/autointent/configs/optimization_cli.py b/autointent/configs/optimization_cli.py index 080b602c..e9f74cfd 100644 --- a/autointent/configs/optimization_cli.py +++ b/autointent/configs/optimization_cli.py @@ -58,6 +58,7 @@ def define_dump_dir(self) -> None: class VectorIndexConfig: db_dir: Path | None = None device: str = "cpu" + save_db: bool = False @dataclass diff --git a/autointent/context/vector_index_client/cache.py 
b/autointent/context/vector_index_client/cache.py index 843d2de2..663dc79e 100644 --- a/autointent/context/vector_index_client/cache.py +++ b/autointent/context/vector_index_client/cache.py @@ -1,71 +1,6 @@ -import json -import logging -import shutil -from dataclasses import asdict, dataclass, field from pathlib import Path from uuid import uuid4 -from appdirs import user_cache_dir, user_config_dir - - -def get_logger() -> logging.Logger: - logger = logging.getLogger("my_logger") - - logger.setLevel(logging.INFO) - - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - - formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") - ch.setFormatter(formatter) - - logger.addHandler(ch) - - return logger - - -@dataclass -class ChromaConfig: - cache_directories: list[str] = field(default_factory=list) - - -def get_chroma_cache_dir() -> Path: - """Get system's default cache dir.""" - cache_dir = user_cache_dir("autointent") - return Path(cache_dir) / "chroma" - - -def get_chroma_config_path() -> Path: - """Get system's default config dir.""" - config_dir = user_config_dir("autointent") - return Path(config_dir) / "chromadb.json" - - -def read_chroma_config() -> ChromaConfig: - path = get_chroma_config_path() - if not path.exists(): - return ChromaConfig() - with path.open() as file: - return ChromaConfig(**json.load(file)) - - -def write_chroma_config(config: ChromaConfig) -> None: - path = get_chroma_config_path() - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("w") as file: - json.dump(asdict(config), file, ensure_ascii=False, indent=4) - - -def add_cache_directory(directory: str) -> None: - """Save path into chroma config in order to remove it from cache later.""" - chroma_config = read_chroma_config() - - directories = set(chroma_config.cache_directories) - directories.add(directory) - chroma_config.cache_directories = sorted(directories) - - write_chroma_config(chroma_config) - def get_db_dir(db_dir: str | Path | None = None) -> Path: """ @@ -74,36 +9,8 @@ def get_db_dir(db_dir: str | Path | None = None) -> Path: Save path into user config in order to remove it from cache later. 
""" - db_dir = get_chroma_cache_dir() / str(uuid4()) if db_dir is None else Path(db_dir) + root = Path(db_dir) if db_dir is not None else Path.cwd() + db_dir = root / "vector_db" / str(uuid4()) if db_dir is None else Path(db_dir) db_dir.mkdir(parents=True, exist_ok=True) - add_cache_directory(str(db_dir.resolve())) return db_dir - - -def clear_chroma_cache() -> None: - # TODO: test on all platforms - logger = get_logger() - chroma_config = read_chroma_config() - for cache_dirs in chroma_config.cache_directories: - if Path(cache_dirs).exists(): - shutil.rmtree(cache_dirs) - logger.info("cleared vector index at %s", cache_dirs) - else: - logger.error("vector index at %s not found", cache_dirs) - chroma_config.cache_directories.remove(cache_dirs) - write_chroma_config(chroma_config) - - -def clear_specific_cache(directory: str) -> None: - """TODO test this code""" - chroma_config = read_chroma_config() - if directory in chroma_config.cache_directories: - try: - shutil.rmtree(directory) - chroma_config.cache_directories.remove(directory) - write_chroma_config(chroma_config) - except OSError: - pass - else: - pass diff --git a/autointent/context/vector_index_client/vector_index_client.py b/autointent/context/vector_index_client/vector_index_client.py index 19a79816..1cb83801 100644 --- a/autointent/context/vector_index_client/vector_index_client.py +++ b/autointent/context/vector_index_client/vector_index_client.py @@ -108,6 +108,9 @@ def get_index(self, model_name: str) -> VectorIndex: def exists(self, model_name: str) -> bool: return self._get_index_dirpath(model_name) is not None + def delete_db(self) -> None: + shutil.rmtree(self.db_dir) + class NonExistingIndexError(Exception): def __init__(self, message: str = "non-existent index was requested") -> None: diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index 3552dad0..f391e77d 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -44,6 +44,8 @@ def optimize(self, context: Context) -> None: self._logger.info("starting pipeline optimization...") for node_optimizer in self.nodes: node_optimizer.fit(context) + if not context.vector_index_config.save_db: + context.vector_index_client.delete_db() def optimize_from_dataset( self, train_data: Dataset, val_data: Dataset | None = None, force_multilabel: bool = False From 8c2eaff902601341aacf3097936bb604dd8bc508 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 6 Nov 2024 22:41:50 +0300 Subject: [PATCH 15/21] fix circular imports --- .../context/optimization_info/data_models.py | 10 ------ .../optimization_info/optimization_info.py | 35 +++++++++++++------ autointent/modules/prediction/argmax.py | 1 - autointent/nodes/inference/inference_node.py | 3 +- .../pipeline/inference/inference_pipeline.py | 10 ++---- 5 files changed, 29 insertions(+), 30 deletions(-) diff --git a/autointent/context/optimization_info/data_models.py b/autointent/context/optimization_info/data_models.py index 0f513770..611387a4 100644 --- a/autointent/context/optimization_info/data_models.py +++ b/autointent/context/optimization_info/data_models.py @@ -117,13 +117,3 @@ def get_best_trial_idx(self, node_type: str) -> int | None: def set_best_trial_idx(self, node_type: str, idx: int) -> None: setattr(self, validate_node_name(node_type), idx) - - -class ModulesList(BaseModel): - regexp: list[str] = [] - retrieval: list[str] = [] 
- scoring: list[str] = [] - prediction: list[str] = [] - - def get(self, node_type: str) -> list[str]: - return getattr(self, node_type) diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index 744b7e3d..467ada4f 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -1,15 +1,31 @@ -from typing import Any +from typing import TYPE_CHECKING, Any import numpy as np from numpy.typing import NDArray +from pydantic import BaseModel from autointent.configs.node import InferenceNodeConfig from autointent.custom_types import NODE_TYPES, NodeType -from autointent.logger import get_logger -from .data_models import Artifact, Artifacts, ModulesList, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds +from .data_models import Artifact, Artifacts, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds from .logger import get_logger +if TYPE_CHECKING: + from autointent.modules import Module + + +class ModulesList(BaseModel): + regexp: list["Module"] = [] + retrieval: list["Module"] = [] + scoring: list["Module"] = [] + prediction: list["Module"] = [] + + def get(self, node_type: str) -> list["Module"]: + return getattr(self, node_type) # type: ignore[no-any-return] + + def add_module(self, node_type: str, module: "Module") -> None: + self.get(node_type).append(module) + class OptimizationInfo: """TODO continous IO with file system (to be able to restore the state of optimization)""" @@ -33,7 +49,7 @@ def log_module_optimization( metric_name: str, artifact: Artifact, module_dump_dir: str | None, - module=None, + module: "Module | None" = None, ) -> None: """ Purposes: @@ -53,8 +69,8 @@ def log_module_optimization( self._logger.info(trial.model_dump()) # save module - if module is not None: - self.modules.get(node_type).append(module) + if module: + self.modules.add_module(node_type, module) # save artifact self.artifacts.add_artifact(node_type, artifact) @@ -114,13 +130,12 @@ def get_inference_nodes_config(self) -> list[InferenceNodeConfig]: ) return res - def _get_best_module(self, node_type: str): + def _get_best_module(self, node_type: str) -> "Module | None": idx = self._get_best_trial_idx(node_type) if idx is not None: return self.modules.get(node_type)[idx] return None - def get_best_modules(self): - node_types = ["regexp", "retrieval", "scoring", "prediction"] - res = {nt: self._get_best_module(nt) for nt in node_types} + def get_best_modules(self) -> dict[str, "Module"]: + res = {nt: self._get_best_module(nt) for nt in NODE_TYPES} return {nt: m for nt, m in res.items() if m is not None} diff --git a/autointent/modules/prediction/argmax.py b/autointent/modules/prediction/argmax.py index 5b76a665..57c6baf4 100644 --- a/autointent/modules/prediction/argmax.py +++ b/autointent/modules/prediction/argmax.py @@ -18,7 +18,6 @@ class ArgmaxPredictor(PredictionModule): def __init__(self) -> None: pass - @classmethod def from_context(cls, context: Context) -> Self: return cls() diff --git a/autointent/nodes/inference/inference_node.py b/autointent/nodes/inference/inference_node.py index 705fb526..f73da944 100644 --- a/autointent/nodes/inference/inference_node.py +++ b/autointent/nodes/inference/inference_node.py @@ -16,7 +16,8 @@ def __init__(self, module: Module, node_type: str) -> None: def from_config(cls, config: InferenceNodeConfig) -> "InferenceNode": node_info = NODES_INFO[config.node_type] module = 
node_info.modules_available[config.module_type](**config.module_config) - module.load(config.load_path) + if config.load_path is not None: + module.load(config.load_path) return cls(module, config.node_type) def clear_cache(self) -> None: diff --git a/autointent/pipeline/inference/inference_pipeline.py b/autointent/pipeline/inference/inference_pipeline.py index 5af1d995..9b593345 100644 --- a/autointent/pipeline/inference/inference_pipeline.py +++ b/autointent/pipeline/inference/inference_pipeline.py @@ -1,18 +1,12 @@ -from typing import Any - -from hydra.utils import instantiate - -from autointent.configs.inference_pipeline import InferencePipelineConfig -from autointent.custom_types import LabelType, NodeType from autointent.configs.node import InferenceNodeConfig from autointent.context import Context -from autointent.custom_types import LabelType +from autointent.custom_types import LabelType, NodeType from autointent.nodes.inference import InferenceNode class InferencePipeline: def __init__(self, nodes: list[InferenceNode]) -> None: - self.nodes = {node.node_info.node_type: node for node in nodes} + self.nodes = {node.node_type: node for node in nodes} @classmethod def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> "InferencePipeline": From 322340bec35624b0350ea693e1eff49e21575992 Mon Sep 17 00:00:00 2001 From: voorhs Date: Fri, 8 Nov 2024 23:11:19 +0300 Subject: [PATCH 16/21] fix tests --- .../optimization_info/optimization_info.py | 13 +++++---- autointent/pipeline/inference/cli_endpoint.py | 3 +- tests/nodes/conftest.py | 2 +- tests/nodes/test_predicton.py | 29 ++++++++++--------- tests/nodes/test_retrieval.py | 29 ++++++++++--------- tests/nodes/test_scoring.py | 29 ++++++++++--------- 6 files changed, 54 insertions(+), 51 deletions(-) diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index 467ada4f..9040d376 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -1,8 +1,8 @@ +from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any import numpy as np from numpy.typing import NDArray -from pydantic import BaseModel from autointent.configs.node import InferenceNodeConfig from autointent.custom_types import NODE_TYPES, NodeType @@ -14,11 +14,12 @@ from autointent.modules import Module -class ModulesList(BaseModel): - regexp: list["Module"] = [] - retrieval: list["Module"] = [] - scoring: list["Module"] = [] - prediction: list["Module"] = [] +@dataclass +class ModulesList: + regexp: list["Module"] = field(default_factory=list) + retrieval: list["Module"] = field(default_factory=list) + scoring: list["Module"] = field(default_factory=list) + prediction: list["Module"] = field(default_factory=list) def get(self, node_type: str) -> list["Module"]: return getattr(self, node_type) # type: ignore[no-any-return] diff --git a/autointent/pipeline/inference/cli_endpoint.py b/autointent/pipeline/inference/cli_endpoint.py index 690fe8a5..ad833517 100644 --- a/autointent/pipeline/inference/cli_endpoint.py +++ b/autointent/pipeline/inference/cli_endpoint.py @@ -29,8 +29,7 @@ def main(cfg: InferenceConfig) -> None: logger.debug("Inference config loaded") # instantiate pipeline - pipeline_config = {"nodes": inference_config["nodes_configs"]} - pipeline = InferencePipeline.from_dict_config(pipeline_config) + pipeline = InferencePipeline.from_config(inference_config["nodes_configs"]) # send data to 
pipeline labels: list[LabelType] = pipeline.predict(data) diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index 62b75c2e..da221e2e 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -76,7 +76,7 @@ def context(dataset_path): def _context(multilabel: bool): res = Context() res.config_data(DataConfig(dataset_path, force_multilabel=multilabel)) - res.config_logs(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir)) + res.config_logs(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir, dump_modules=True)) res.config_vector_index(VectorIndexConfig(db_dir=db_dir)) return res diff --git a/tests/nodes/test_predicton.py b/tests/nodes/test_predicton.py index 97434448..9c675434 100644 --- a/tests/nodes/test_predicton.py +++ b/tests/nodes/test_predicton.py @@ -4,6 +4,7 @@ import numpy as np import torch +from autointent.configs.node import InferenceNodeConfig from autointent.nodes import InferenceNode from autointent.nodes.optimization import NodeOptimizer @@ -34,13 +35,13 @@ def test_prediction_multiclass(scoring_optimizer_multiclass): prediction_optimizer.fit(context) for trial in context.optimization_info.trials.prediction: - config = { - "node_type": "prediction", - "module_type": trial.module_type, - "module_config": trial.module_params, - "load_path": trial.module_dump_dir, - } - node = InferenceNode(**config) + config = InferenceNodeConfig( + node_type="prediction", + module_type=trial.module_type, + module_config=trial.module_params, + load_path=trial.module_dump_dir, + ) + node = InferenceNode.from_config(config) node.module.predict(np.array([[0.27486506, 0.31681463, 0.37459106], [0.2769358, 0.31536099, 0.37366978]])) node.module.clear_cache() gc.collect() @@ -65,13 +66,13 @@ def test_prediction_multilabel(scoring_optimizer_multilabel): prediction_optimizer.fit(context) for trial in context.optimization_info.trials.prediction: - config = { - "node_type": "prediction", - "module_type": trial.module_type, - "module_config": trial.module_params, - "load_path": trial.module_dump_dir, - } - node = InferenceNode(**config) + config = InferenceNodeConfig( + node_type="prediction", + module_type=trial.module_type, + module_config=trial.module_params, + load_path=trial.module_dump_dir, + ) + node = InferenceNode.from_config(config) node.module.predict(np.array([[0.27486506, 0.31681463, 0.37459106], [0.2769358, 0.31536099, 0.37366978]])) node.module.clear_cache() gc.collect() diff --git a/tests/nodes/test_retrieval.py b/tests/nodes/test_retrieval.py index 92417eb8..5d625692 100644 --- a/tests/nodes/test_retrieval.py +++ b/tests/nodes/test_retrieval.py @@ -3,6 +3,7 @@ import torch +from autointent.configs.node import InferenceNodeConfig from autointent.nodes import InferenceNode, NodeOptimizer logger = logging.getLogger(__name__) @@ -14,13 +15,13 @@ def test_retrieval_multiclass(context): retrieval_optimizer.fit(context) for trial in context.optimization_info.trials.retrieval: - config = { - "node_type": "retrieval", - "module_type": trial.module_type, - "module_config": trial.module_params, - "load_path": trial.module_dump_dir, - } - node = InferenceNode(**config) + config = InferenceNodeConfig( + node_type="retrieval", + module_type=trial.module_type, + module_config=trial.module_params, + load_path=trial.module_dump_dir, + ) + node = InferenceNode.from_config(config) labels, distances, texts = node.module.predict(["hello", "card"]) node.module.clear_cache() gc.collect() @@ -33,13 +34,13 @@ def test_retrieval_multilabel(context): retrieval_optimizer.fit(context) for trial 
in context.optimization_info.trials.retrieval: - config = { - "node_type": "retrieval", - "module_type": trial.module_type, - "module_config": trial.module_params, - "load_path": trial.module_dump_dir, - } - node = InferenceNode(**config) + config = InferenceNodeConfig( + node_type="retrieval", + module_type=trial.module_type, + module_config=trial.module_params, + load_path=trial.module_dump_dir, + ) + node = InferenceNode.from_config(config) labels, distances, texts = node.module.predict(["hello", "card"]) node.module.clear_cache() gc.collect() diff --git a/tests/nodes/test_scoring.py b/tests/nodes/test_scoring.py index 79a58ba4..8d037df2 100644 --- a/tests/nodes/test_scoring.py +++ b/tests/nodes/test_scoring.py @@ -3,6 +3,7 @@ import torch +from autointent.configs.node import InferenceNodeConfig from autointent.nodes import InferenceNode from autointent.nodes.optimization import NodeOptimizer @@ -47,13 +48,13 @@ def test_scoring_multiclass(context, retrieval_optimizer_multiclass): scoring_optimizer.fit(context) for trial in context.optimization_info.trials.scoring: - config = { - "node_type": "scoring", - "module_type": trial.module_type, - "module_config": trial.module_params, - "load_path": trial.module_dump_dir, - } - node = InferenceNode(**config) + config = InferenceNodeConfig( + node_type="scoring", + module_type=trial.module_type, + module_config=trial.module_params, + load_path=trial.module_dump_dir, + ) + node = InferenceNode.from_config(config) scores = node.module.predict(["hello", "world"]) # noqa: F841 node.module.clear_cache() gc.collect() @@ -87,13 +88,13 @@ def test_scoring_multilabel(context, retrieval_optimizer_multilabel): scoring_optimizer.fit(context) for trial in context.optimization_info.trials.scoring: - config = { - "node_type": "scoring", - "module_type": trial.module_type, - "module_config": trial.module_params, - "load_path": trial.module_dump_dir, - } - node = InferenceNode(**config) + config = InferenceNodeConfig( + node_type="scoring", + module_type=trial.module_type, + module_config=trial.module_params, + load_path=trial.module_dump_dir, + ) + node = InferenceNode.from_config(config) scores = node.module.predict(["hello", "world"]) # noqa: F841 node.module.clear_cache() gc.collect() From c4873633b3b705932c625e20ed232e6f94ce4ab8 Mon Sep 17 00:00:00 2001 From: Darinka <39233990+Darinochka@users.noreply.github.com> Date: Sat, 9 Nov 2024 10:47:42 +0300 Subject: [PATCH 17/21] Test/pipeline simpler fitting (#39) * tess: added inference_test * test: added inference pipeline cli * test: fixed device * test: added optimization tests * fix `inference_config.yaml` not found error --------- Co-authored-by: voorhs --- .../optimization_info/optimization_info.py | 2 +- autointent/pipeline/inference/cli_endpoint.py | 2 +- .../pipeline/inference/inference_pipeline.py | 8 ++ tests/conftest.py | 4 +- tests/pipeline/test_inference.py | 109 ++++++++++++++++++ tests/pipeline/test_optimization.py | 41 +++++++ 6 files changed, 162 insertions(+), 4 deletions(-) create mode 100644 tests/pipeline/test_inference.py diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index 9040d376..48e22e8c 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -123,7 +123,7 @@ def get_inference_nodes_config(self) -> list[InferenceNodeConfig]: trial = self.trials.get_trial(node_type, idx) res.append( InferenceNodeConfig( - 
node_type=node_type, + node_type=node_type.value, module_type=trial.module_type, module_config=trial.module_params, load_path=trial.module_dump_dir, diff --git a/autointent/pipeline/inference/cli_endpoint.py b/autointent/pipeline/inference/cli_endpoint.py index ad833517..40f5d6b6 100644 --- a/autointent/pipeline/inference/cli_endpoint.py +++ b/autointent/pipeline/inference/cli_endpoint.py @@ -29,7 +29,7 @@ def main(cfg: InferenceConfig) -> None: logger.debug("Inference config loaded") # instantiate pipeline - pipeline = InferencePipeline.from_config(inference_config["nodes_configs"]) + pipeline = InferencePipeline.from_dict_config(inference_config["nodes_configs"]) # send data to pipeline labels: list[LabelType] = pipeline.predict(data) diff --git a/autointent/pipeline/inference/inference_pipeline.py b/autointent/pipeline/inference/inference_pipeline.py index 9b593345..39c9d73a 100644 --- a/autointent/pipeline/inference/inference_pipeline.py +++ b/autointent/pipeline/inference/inference_pipeline.py @@ -1,3 +1,5 @@ +from typing import Any + from autointent.configs.node import InferenceNodeConfig from autointent.context import Context from autointent.custom_types import LabelType, NodeType @@ -8,6 +10,12 @@ class InferencePipeline: def __init__(self, nodes: list[InferenceNode]) -> None: self.nodes = {node.node_type: node for node in nodes} + @classmethod + def from_dict_config(cls, nodes_configs: list[dict[str, Any]]) -> "InferencePipeline": + nodes_configs_ = [InferenceNodeConfig(**cfg) for cfg in nodes_configs] + nodes = [InferenceNode.from_config(cfg) for cfg in nodes_configs_] + return cls(nodes) + @classmethod def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> "InferencePipeline": nodes = [InferenceNode.from_config(cfg) for cfg in nodes_configs] diff --git a/tests/conftest.py b/tests/conftest.py index 08f95300..af217b98 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,8 +7,8 @@ def setup_environment() -> tuple[str, str]: - logs_dir = ires.files("tests").joinpath("logs") - db_dir = logs_dir / "db" / str(uuid4()) + logs_dir = ires.files("tests").joinpath("logs") / str(uuid4()) + db_dir = logs_dir / "db" dump_dir = logs_dir / "modules_dump" return db_dir, dump_dir, logs_dir diff --git a/tests/pipeline/test_inference.py b/tests/pipeline/test_inference.py new file mode 100644 index 00000000..a04a69c1 --- /dev/null +++ b/tests/pipeline/test_inference.py @@ -0,0 +1,109 @@ +import importlib.resources as ires +from pathlib import Path +from typing import Literal + +import pytest + +from autointent.configs.inference_cli import InferenceConfig +from autointent.configs.optimization_cli import ( + EmbedderConfig, + LoggingConfig, + VectorIndexConfig, +) +from autointent.pipeline.inference import InferencePipeline +from autointent.pipeline.inference.cli_endpoint import main as inference_pipeline +from autointent.pipeline.optimization import PipelineOptimizer +from autointent.pipeline.optimization.utils import load_config +from tests.conftest import setup_environment + +TaskType = Literal["multiclass", "multilabel", "description"] + + +def get_search_space_path(task_type: TaskType): + return ires.files("tests.assets.configs").joinpath(f"{task_type}.yaml") + + +def get_search_space(task_type: TaskType): + path = get_search_space_path(task_type) + return load_config(str(path), multilabel=task_type == "multilabel") + + +@pytest.mark.parametrize( + "task_type", + ["multiclass", "multilabel", "description"], +) +def test_inference_config(dataset, task_type): + db_dir, dump_dir, 
logs_dir = setup_environment() + search_space = get_search_space(task_type) + + pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space) + + pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=True)) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cpu", save_db=True)) + pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) + + context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) + inference_config = context.optimization_info.get_inference_nodes_config() + + inference_pipeline = InferencePipeline.from_config(inference_config) + prediction = inference_pipeline.predict(["123", "hello world"]) + if task_type == "multilabel": + assert prediction.shape == (2, len(dataset.intents)) + else: + assert prediction.shape == (2,) + + context.dump() + + +@pytest.mark.parametrize( + "task_type", + ["multiclass", "multilabel", "description"], +) +def test_inference_context(dataset, task_type): + db_dir, dump_dir, logs_dir = setup_environment() + search_space = get_search_space(task_type) + + pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space) + + pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=True)) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cpu", save_db=True)) + pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) + + context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) + inference_pipeline = InferencePipeline.from_context(context) + prediction = inference_pipeline.predict(["123", "hello world"]) + + if task_type == "multilabel": + assert prediction.shape == (2, len(dataset.intents)) + else: + assert prediction.shape == (2,) + + context.dump() + + +@pytest.mark.parametrize( + "task_type", + ["multiclass", "multilabel", "description"], +) +def test_inference_pipeline_cli(dataset, task_type): + db_dir, dump_dir, logs_dir = setup_environment() + search_space = get_search_space(task_type) + + pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space) + + pipeline_optimizer.set_config( + logging_config := LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_dir=dump_dir, dump_modules=True) + ) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cuda", save_db=True)) + pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) + context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) + + context.dump() + + config = InferenceConfig( + data_path=ires.files("tests.assets.data").joinpath("clinc_subset.json"), + source_dir=logging_config.dirpath, + output_path=logging_config.dump_dir, + log_level="CRITICAL", + ) + inference_pipeline(config) diff --git a/tests/pipeline/test_optimization.py b/tests/pipeline/test_optimization.py index 4e22ccf9..d34c2e29 100644 --- a/tests/pipeline/test_optimization.py +++ b/tests/pipeline/test_optimization.py @@ -1,4 +1,5 @@ import importlib.resources as ires +import os from pathlib import Path from typing import Literal @@ -47,6 +48,46 @@ def test_no_context_optimization(dataset, task_type): context.dump() +@pytest.mark.parametrize( + "task_type", + ["multiclass", "multilabel", "description"], +) +def test_save_db(dataset, task_type): + db_dir, dump_dir, logs_dir = setup_environment() + search_space = 
get_search_space(task_type) + + pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space) + + pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=False)) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), save_db=True, device="cpu")) + pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) + + context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) + context.dump() + + assert os.listdir(db_dir) + + +@pytest.mark.parametrize( + "task_type", + ["multiclass", "multilabel", "description"], +) +def test_dump_modules(dataset, task_type): + db_dir, dump_dir, logs_dir = setup_environment() + search_space = get_search_space(task_type) + + pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space) + + pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_dir=dump_dir, dump_modules=True)) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cpu")) + pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) + + context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) + context.dump() + + assert os.listdir(dump_dir) + + @pytest.mark.parametrize( "task_type", ["multiclass", "multilabel", "description"], From a2e4deac11442f69cc33d31c135b80aaadf4585d Mon Sep 17 00:00:00 2001 From: voorhs Date: Sat, 9 Nov 2024 11:25:45 +0300 Subject: [PATCH 18/21] refactor github actions --- .github/workflows/test-inference.yaml | 40 +++++++++++++++++++ .../workflows/{tests.yaml => test-nodes.yaml} | 4 +- .github/workflows/test-optimization.yaml | 40 +++++++++++++++++++ .github/workflows/unit-tests.yaml | 40 +++++++++++++++++++ .../optimization/pipeline_optimizer.py | 1 + tests/pipeline/test_inference.py | 5 ++- 6 files changed, 127 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/test-inference.yaml rename .github/workflows/{tests.yaml => test-nodes.yaml} (93%) create mode 100644 .github/workflows/test-optimization.yaml create mode 100644 .github/workflows/unit-tests.yaml diff --git a/.github/workflows/test-inference.yaml b/.github/workflows/test-inference.yaml new file mode 100644 index 00000000..a0172287 --- /dev/null +++ b/.github/workflows/test-inference.yaml @@ -0,0 +1,40 @@ +name: integration tests + +on: + push: + branches: + - dev + pull_request: + branches: + - dev + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.10", "3.11", "3.12" ] + include: + - os: windows-latest + python-version: "3.10" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install dependencies + run: | + pip install . 
+ pip install pytest pytest-asyncio + + - name: Run tests + run: | + pytest tests/pipeline/test_inference.py diff --git a/.github/workflows/tests.yaml b/.github/workflows/test-nodes.yaml similarity index 93% rename from .github/workflows/tests.yaml rename to .github/workflows/test-nodes.yaml index ce0a804d..5304057e 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/test-nodes.yaml @@ -1,4 +1,4 @@ -name: Run Tests +name: integration tests on: push: @@ -37,4 +37,4 @@ jobs: - name: Run tests run: | - pytest + pytest tests/nodes diff --git a/.github/workflows/test-optimization.yaml b/.github/workflows/test-optimization.yaml new file mode 100644 index 00000000..43a4e6ed --- /dev/null +++ b/.github/workflows/test-optimization.yaml @@ -0,0 +1,40 @@ +name: integration tests + +on: + push: + branches: + - dev + pull_request: + branches: + - dev + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.10", "3.11", "3.12" ] + include: + - os: windows-latest + python-version: "3.10" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install dependencies + run: | + pip install . + pip install pytest pytest-asyncio + + - name: Run tests + run: | + pytest tests/pipeline/test_optimization.py diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml new file mode 100644 index 00000000..3612d561 --- /dev/null +++ b/.github/workflows/unit-tests.yaml @@ -0,0 +1,40 @@ +name: unit tests + +on: + push: + branches: + - dev + pull_request: + branches: + - dev + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.10", "3.11", "3.12" ] + include: + - os: windows-latest + python-version: "3.10" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install dependencies + run: | + pip install . 
+ pip install pytest pytest-asyncio + + - name: Run tests + run: | + pytest --ignore=tests/nodes --ignore=tests/pipeline diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index 2b53a99b..e4629aee 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -46,6 +46,7 @@ def optimize(self, context: Context) -> None: for node_optimizer in self.nodes: node_optimizer.fit(context) if not context.vector_index_config.save_db: + self._logger.info("removing vector database from file system...") context.vector_index_client.delete_db() def optimize_from_dataset( diff --git a/tests/pipeline/test_inference.py b/tests/pipeline/test_inference.py index a04a69c1..273c205b 100644 --- a/tests/pipeline/test_inference.py +++ b/tests/pipeline/test_inference.py @@ -53,6 +53,7 @@ def test_inference_config(dataset, task_type): assert prediction.shape == (2,) context.dump() + context.vector_index_client.delete_db() @pytest.mark.parametrize( @@ -79,6 +80,7 @@ def test_inference_context(dataset, task_type): assert prediction.shape == (2,) context.dump() + context.vector_index_client.delete_db() @pytest.mark.parametrize( @@ -94,7 +96,7 @@ def test_inference_pipeline_cli(dataset, task_type): pipeline_optimizer.set_config( logging_config := LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_dir=dump_dir, dump_modules=True) ) - pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cuda", save_db=True)) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cpu", save_db=True)) pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) @@ -107,3 +109,4 @@ def test_inference_pipeline_cli(dataset, task_type): log_level="CRITICAL", ) inference_pipeline(config) + context.vector_index_client.delete_db() From c349f18a307e1484c2b64d8b6fc05fb61009d9de Mon Sep 17 00:00:00 2001 From: voorhs Date: Sat, 9 Nov 2024 11:27:42 +0300 Subject: [PATCH 19/21] rename actions --- .github/workflows/test-inference.yaml | 2 +- .github/workflows/test-nodes.yaml | 2 +- .github/workflows/test-optimization.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-inference.yaml b/.github/workflows/test-inference.yaml index a0172287..ca4c45ff 100644 --- a/.github/workflows/test-inference.yaml +++ b/.github/workflows/test-inference.yaml @@ -1,4 +1,4 @@ -name: integration tests +name: test inference on: push: diff --git a/.github/workflows/test-nodes.yaml b/.github/workflows/test-nodes.yaml index 5304057e..99507571 100644 --- a/.github/workflows/test-nodes.yaml +++ b/.github/workflows/test-nodes.yaml @@ -1,4 +1,4 @@ -name: integration tests +name: test nodes on: push: diff --git a/.github/workflows/test-optimization.yaml b/.github/workflows/test-optimization.yaml index 43a4e6ed..ea1cf861 100644 --- a/.github/workflows/test-optimization.yaml +++ b/.github/workflows/test-optimization.yaml @@ -1,4 +1,4 @@ -name: integration tests +name: test optimization on: push: From b26a878238fdc50aaecd9d948a28220ce64315e1 Mon Sep 17 00:00:00 2001 From: voorhs Date: Sat, 9 Nov 2024 12:55:02 +0300 Subject: [PATCH 20/21] fix `model_name` issue --- .../vector_index_client.py | 2 - .../datafiles/default-multiclass-config.yaml | 2 +- autointent/modules/base.py | 3 + 
autointent/modules/retrieval/vectordb.py | 14 ++--- .../scoring/description/description.py | 24 ++++--- autointent/modules/scoring/dnnc/dnnc.py | 20 +++--- autointent/modules/scoring/knn/knn.py | 23 ++++--- autointent/modules/scoring/linear.py | 24 ++++--- autointent/modules/scoring/mlknn/mlknn.py | 23 ++++--- .../nodes/optimization/node_optimizer.py | 4 ++ tests/assets/configs/description.yaml | 2 +- tests/assets/configs/multiclass.yaml | 4 +- tests/assets/configs/multilabel.yaml | 2 +- tests/assets/data/utterances.json | 62 +++++++++++++++++++ tests/modules/prediction/test_treshold.py | 2 +- tests/modules/retrieval/test_vectordb.py | 2 +- tests/modules/scoring/test_description.py | 2 +- tests/modules/scoring/test_knn.py | 2 +- tests/modules/scoring/test_mlknn.py | 2 +- tests/modules/test_regex.py | 2 +- tests/nodes/conftest.py | 2 +- tests/nodes/test_retrieval.py | 2 +- tests/nodes/test_scoring.py | 14 ++--- tests/pipeline/test_inference.py | 4 +- 24 files changed, 163 insertions(+), 80 deletions(-) create mode 100644 tests/assets/data/utterances.json diff --git a/autointent/context/vector_index_client/vector_index_client.py b/autointent/context/vector_index_client/vector_index_client.py index 1cb83801..98551eb2 100644 --- a/autointent/context/vector_index_client/vector_index_client.py +++ b/autointent/context/vector_index_client/vector_index_client.py @@ -12,8 +12,6 @@ class VectorIndexClient: - model_name: str - def __init__( self, device: str, diff --git a/autointent/datafiles/default-multiclass-config.yaml b/autointent/datafiles/default-multiclass-config.yaml index ac26d523..34ca64b3 100644 --- a/autointent/datafiles/default-multiclass-config.yaml +++ b/autointent/datafiles/default-multiclass-config.yaml @@ -5,7 +5,7 @@ nodes: search_space: - module_type: vector_db k: [10] - model_name: + embedder_name: - avsolatorio/GIST-small-Embedding-v0 - infgrad/stella-base-en-v2 - node_type: scoring diff --git a/autointent/modules/base.py b/autointent/modules/base.py index 1b41504b..1aaa7eac 100644 --- a/autointent/modules/base.py +++ b/autointent/modules/base.py @@ -52,3 +52,6 @@ def predict(self, *args: list[str] | npt.NDArray[Any], **kwargs: dict[str, Any]) @abstractmethod def from_context(cls, context: Context, **kwargs: dict[str, Any]) -> Self: pass + + def get_embedder_name(self) -> str | None: + return None diff --git a/autointent/modules/retrieval/vectordb.py b/autointent/modules/retrieval/vectordb.py index 92d641b9..b3b30313 100644 --- a/autointent/modules/retrieval/vectordb.py +++ b/autointent/modules/retrieval/vectordb.py @@ -26,7 +26,7 @@ class VectorDBModule(RetrievalModule): def __init__( self, k: int, - model_name: str, + embedder_name: str, db_dir: str | None = None, device: str = "cpu", batch_size: int = 32, @@ -34,7 +34,7 @@ def __init__( ) -> None: if db_dir is None: db_dir = str(get_db_dir()) - self.model_name = model_name + self.embedder_name = embedder_name self.device = device self.db_dir = db_dir self.batch_size = batch_size @@ -47,11 +47,11 @@ def from_context( cls, context: Context, k: int, - model_name: str, + embedder_name: str, ) -> Self: return cls( k=k, - model_name=model_name, + embedder_name=embedder_name, db_dir=str(context.get_db_dir()), device=context.get_device(), batch_size=context.get_batch_size(), @@ -63,7 +63,7 @@ def fit(self, utterances: list[str], labels: list[LabelType]) -> None: self.device, self.db_dir, embedder_batch_size=self.batch_size, embedder_max_length=self.max_length ) - self.vector_index = vector_index_client.create_index(self.model_name, 
utterances, labels)
+        self.vector_index = vector_index_client.create_index(self.embedder_name, utterances, labels)
 
     def score(self, context: Context, metric_fn: RetrievalMetricFn) -> float:
         labels_pred, _, _ = self.vector_index.query(
@@ -73,7 +73,7 @@ def score(self, context: Context, metric_fn: RetrievalMetricFn) -> float:
         return metric_fn(context.data_handler.labels_test, labels_pred)
 
     def get_assets(self) -> RetrieverArtifact:
-        return RetrieverArtifact(embedder_name=self.model_name)
+        return RetrieverArtifact(embedder_name=self.embedder_name)
 
     def clear_cache(self) -> None:
         self.vector_index.delete()
@@ -101,7 +101,7 @@ def load(self, path: str) -> None:
             embedder_batch_size=self.metadata["batch_size"],
             embedder_max_length=self.metadata["max_length"],
         )
-        self.vector_index = vector_index_client.get_index(self.model_name)
+        self.vector_index = vector_index_client.get_index(self.embedder_name)
 
     def predict(self, utterances: list[str]) -> tuple[list[list[int | list[int]]], list[list[float]], list[list[str]]]:
         """
diff --git a/autointent/modules/scoring/description/description.py b/autointent/modules/scoring/description/description.py
index eba80a8d..903a97c8 100644
--- a/autointent/modules/scoring/description/description.py
+++ b/autointent/modules/scoring/description/description.py
@@ -34,7 +34,7 @@ class DescriptionScorer(ScoringModule):
 
     def __init__(
         self,
-        model_name: str,
+        embedder_name: str,
         db_dir: Path | None = None,
         temperature: float = 1.0,
         device: str = "cpu",
@@ -46,7 +46,7 @@ def __init__(
         self.temperature = temperature
         self.device = device
         self.db_dir = db_dir
-        self.model_name = model_name
+        self.embedder_name = embedder_name
         self.batch_size = batch_size
         self.max_length = max_length
 
@@ -55,23 +55,26 @@ def from_context(
         cls,
         context: Context,
         temperature: float,
-        model_name: str | None = None,
+        embedder_name: str | None = None,
     ) -> Self:
-        if model_name is None:
-            model_name = context.optimization_info.get_best_embedder()
+        if embedder_name is None:
+            embedder_name = context.optimization_info.get_best_embedder()
             precomputed_embeddings = True
         else:
-            precomputed_embeddings = context.vector_index_client.exists(model_name)
+            precomputed_embeddings = context.vector_index_client.exists(embedder_name)
 
         instance = cls(
             temperature=temperature,
             device=context.get_device(),
             db_dir=context.get_db_dir(),
-            model_name=model_name,
+            embedder_name=embedder_name,
         )
         instance.precomputed_embeddings = precomputed_embeddings
         return instance
 
+    def get_embedder_name(self) -> str:
+        return self.embedder_name
+
     def fit(
         self,
         utterances: list[str],
@@ -88,7 +91,7 @@ def fit(
         if self.precomputed_embeddings:
             # this happens only when DescriptionScorer is within Pipeline optimization after RetrievalNode optimization
             vector_index_client = VectorIndexClient(self.device, self.db_dir, self.batch_size, self.max_length)
-            vector_index = vector_index_client.get_index(self.model_name)
+            vector_index = vector_index_client.get_index(self.embedder_name)
             features = vector_index.get_all_embeddings()
             if len(features) != len(utterances):
                 msg = "Vector index mismatches provided utterances"
@@ -96,7 +99,10 @@ def fit(
             embedder = vector_index.embedder
         else:
             embedder = Embedder(
-                device=self.device, model_name=self.model_name, batch_size=self.batch_size, max_length=self.max_length
+                device=self.device,
+                model_name=self.embedder_name,
+                batch_size=self.batch_size,
+                max_length=self.max_length,
             )
             features = embedder.embed(utterances)
 
diff --git a/autointent/modules/scoring/dnnc/dnnc.py b/autointent/modules/scoring/dnnc/dnnc.py
index 950c3247..d6395c0f 100644
--- a/autointent/modules/scoring/dnnc/dnnc.py
+++ b/autointent/modules/scoring/dnnc/dnnc.py
@@ -44,7 +44,7 @@ class DNNCScorer(ScoringModule):
     def __init__(
         self,
         cross_encoder_name: str,
-        search_model_name: str,
+        embedder_name: str,
         k: int,
         db_dir: str | None = None,
         device: str = "cpu",
@@ -56,7 +56,7 @@ def __init__(
             db_dir = str(get_db_dir())
 
         self.cross_encoder_name = cross_encoder_name
-        self.search_model_name = search_model_name
+        self.embedder_name = embedder_name
         self.k = k
         self.train_head = train_head
         self.device = device
@@ -70,18 +70,18 @@ def from_context(
         context: Context,
         cross_encoder_name: str,
         k: int,
-        search_model_name: str | None = None,
+        embedder_name: str | None = None,
         train_head: bool = False,
     ) -> Self:
-        if search_model_name is None:
-            search_model_name = context.optimization_info.get_best_embedder()
+        if embedder_name is None:
+            embedder_name = context.optimization_info.get_best_embedder()
             prebuilt_index = True
         else:
-            prebuilt_index = context.vector_index_client.exists(search_model_name)
+            prebuilt_index = context.vector_index_client.exists(embedder_name)
 
         instance = cls(
             cross_encoder_name=cross_encoder_name,
-            search_model_name=search_model_name,
+            embedder_name=embedder_name,
             k=k,
             train_head=train_head,
             device=context.get_device(),
@@ -101,12 +101,12 @@ def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
 
         if self.prebuilt_index:
             # this happens only when DNNCScorer is within Pipeline optimization after RetrievalNode optimization
-            self.vector_index = vector_index_client.get_index(self.search_model_name)
+            self.vector_index = vector_index_client.get_index(self.embedder_name)
             if len(utterances) != len(self.vector_index.texts):
                 msg = "Vector index mismatches provided utterances"
                 raise ValueError(msg)
         else:
-            self.vector_index = vector_index_client.create_index(self.search_model_name, utterances, labels)
+            self.vector_index = vector_index_client.create_index(self.embedder_name, utterances, labels)
 
         if self.train_head:
             model = CrossEncoderWithLogreg(self.model)
@@ -207,7 +207,7 @@ def load(self, path: str) -> None:
             embedder_batch_size=self.metadata["batch_size"],
             embedder_max_length=self.metadata["max_length"],
         )
-        self.vector_index = vector_index_client.get_index(self.search_model_name)
+        self.vector_index = vector_index_client.get_index(self.embedder_name)
 
         crossencoder_dir = str(dump_dir / self.crossencoder_subdir)
         if self.train_head:
diff --git a/autointent/modules/scoring/knn/knn.py b/autointent/modules/scoring/knn/knn.py
index d6fcd236..6d99ce54 100644
--- a/autointent/modules/scoring/knn/knn.py
+++ b/autointent/modules/scoring/knn/knn.py
@@ -31,7 +31,7 @@ class KNNScorer(ScoringModule):
 
     def __init__(
         self,
-        model_name: str,
+        embedder_name: str,
         k: int,
         weights: WEIGHT_TYPES,
         db_dir: str | None = None,
@@ -51,7 +51,7 @@ def __init__(
         """
         if db_dir is None:
             db_dir = str(get_db_dir())
-        self.model_name = model_name
+        self.embedder_name = embedder_name
         self.k = k
         self.weights = weights
         self.db_dir = db_dir
@@ -65,16 +65,16 @@ def from_context(
         context: Context,
         k: int,
         weights: WEIGHT_TYPES,
-        model_name: str | None = None,
+        embedder_name: str | None = None,
     ) -> Self:
-        if model_name is None:
-            model_name = context.optimization_info.get_best_embedder()
+        if embedder_name is None:
+            embedder_name = context.optimization_info.get_best_embedder()
             prebuilt_index = True
         else:
-            prebuilt_index = context.vector_index_client.exists(model_name)
+            prebuilt_index = context.vector_index_client.exists(embedder_name)
 
         instance = cls(
-            model_name=model_name,
+            embedder_name=embedder_name,
             k=k,
             weights=weights,
             db_dir=str(context.get_db_dir()),
@@ -85,6 +85,9 @@ def from_context(
         instance.prebuilt_index = prebuilt_index
         return instance
 
+    def get_embedder_name(self) -> str:
+        return self.embedder_name
+
     def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
         if isinstance(labels[0], list):
             self.n_classes = len(labels[0])
@@ -96,12 +99,12 @@ def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
 
         if self.prebuilt_index:
             # this happens only after RetrievalNode optimization
-            self._vector_index = vector_index_client.get_index(self.model_name)
+            self._vector_index = vector_index_client.get_index(self.embedder_name)
             if len(utterances) != len(self._vector_index.texts):
                 msg = "Vector index mismatches provided utterances"
                 raise ValueError(msg)
         else:
-            self._vector_index = vector_index_client.create_index(self.model_name, utterances, labels)
+            self._vector_index = vector_index_client.create_index(self.embedder_name, utterances, labels)
 
     def predict(self, utterances: list[str]) -> npt.NDArray[Any]:
         labels, distances, _ = self._vector_index.query(utterances, self.k)
@@ -141,4 +144,4 @@ def load(self, path: str) -> None:
             embedder_batch_size=self.metadata["batch_size"],
             embedder_max_length=self.metadata["max_length"],
         )
-        self._vector_index = vector_index_client.get_index(self.model_name)
+        self._vector_index = vector_index_client.get_index(self.embedder_name)
diff --git a/autointent/modules/scoring/linear.py b/autointent/modules/scoring/linear.py
index 30364d20..d40420b2 100644
--- a/autointent/modules/scoring/linear.py
+++ b/autointent/modules/scoring/linear.py
@@ -48,7 +48,7 @@ class LinearScorer(ScoringModule):
 
     def __init__(
         self,
-        model_name: str,
+        embedder_name: str,
         cv: int = 3,
         n_jobs: int = -1,
         device: str = "cpu",
@@ -60,7 +60,7 @@ def __init__(
         self.n_jobs = n_jobs
         self.device = device
         self.seed = seed
-        self.model_name = model_name
+        self.embedder_name = embedder_name
         self.batch_size = batch_size
         self.max_length = max_length
 
@@ -68,16 +68,16 @@ def __init__(
     def from_context(
         cls,
         context: Context,
-        model_name: str | None = None,
+        embedder_name: str | None = None,
    ) -> Self:
-        if model_name is None:
-            model_name = context.optimization_info.get_best_embedder()
+        if embedder_name is None:
+            embedder_name = context.optimization_info.get_best_embedder()
             precomputed_embeddings = True
         else:
-            precomputed_embeddings = context.vector_index_client.exists(model_name)
+            precomputed_embeddings = context.vector_index_client.exists(embedder_name)
 
         instance = cls(
-            model_name=model_name,
+            embedder_name=embedder_name,
             device=context.get_device(),
             seed=context.seed,
             batch_size=context.get_batch_size(),
@@ -87,6 +87,9 @@ def from_context(
         instance.db_dir = str(context.get_db_dir())
         return instance
 
+    def get_embedder_name(self) -> str:
+        return self.embedder_name
+
     def fit(
         self,
         utterances: list[str],
@@ -97,7 +100,7 @@ def fit(
         if self.precomputed_embeddings:
             # this happens only when LinearScorer is within Pipeline optimization after RetrievalNode optimization
             vector_index_client = VectorIndexClient(self.device, self.db_dir, self.batch_size, self.max_length)
-            vector_index = vector_index_client.get_index(self.model_name)
+            vector_index = vector_index_client.get_index(self.embedder_name)
             features = vector_index.get_all_embeddings()
             if len(features) != len(utterances):
                 msg = "Vector index mismatches provided utterances"
@@ -105,7 +108,7 @@ def fit(
             embedder = vector_index.embedder
         else:
             embedder = Embedder(
-                device=self.device, model_name=self.model_name, batch_size=self.batch_size, max_length=self.max_length
+                device=self.device,
+                model_name=self.embedder_name,
+                batch_size=self.batch_size,
+                max_length=self.max_length,
             )
             features = embedder.embed(utterances)
 
diff --git a/autointent/modules/scoring/mlknn/mlknn.py b/autointent/modules/scoring/mlknn/mlknn.py
index 495a676c..8b4a3025 100644
--- a/autointent/modules/scoring/mlknn/mlknn.py
+++ b/autointent/modules/scoring/mlknn/mlknn.py
@@ -42,7 +42,7 @@ class MLKnnScorer(ScoringModule):
     def __init__(
         self,
         k: int,
-        model_name: str,
+        embedder_name: str,
         db_dir: str | None = None,
         s: float = 1.0,
         ignore_first_neighbours: int = 0,
@@ -53,7 +53,7 @@ def __init__(
         if db_dir is None:
             db_dir = str(get_db_dir())
         self.k = k
-        self.model_name = model_name
+        self.embedder_name = embedder_name
         self.s = s
         self.ignore_first_neighbours = ignore_first_neighbours
         self.db_dir = db_dir
@@ -68,17 +68,17 @@ def from_context(
         k: int,
         s: float = 1.0,
         ignore_first_neighbours: int = 0,
-        model_name: str | None = None,
+        embedder_name: str | None = None,
     ) -> Self:
-        if model_name is None:
-            model_name = context.optimization_info.get_best_embedder()
+        if embedder_name is None:
+            embedder_name = context.optimization_info.get_best_embedder()
             prebuilt_index = True
         else:
-            prebuilt_index = context.vector_index_client.exists(model_name)
+            prebuilt_index = context.vector_index_client.exists(embedder_name)
 
         instance = cls(
             k=k,
-            model_name=model_name,
+            embedder_name=embedder_name,
             s=s,
             ignore_first_neighbours=ignore_first_neighbours,
             db_dir=str(context.get_db_dir()),
@@ -89,6 +89,9 @@ def from_context(
         instance.prebuilt_index = prebuilt_index
         return instance
 
+    def get_embedder_name(self) -> str:
+        return self.embedder_name
+
     def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
         if not isinstance(labels[0], list):
             msg = "mlknn scorer supports only multilabel input"
@@ -100,12 +103,12 @@ def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
 
         if self.prebuilt_index:
             # this happens only when MLKnnScorer is within Pipeline optimization after RetrievalNode optimization
-            self.vector_index = vector_index_client.get_index(self.model_name)
+            self.vector_index = vector_index_client.get_index(self.embedder_name)
             if len(utterances) != len(self.vector_index.texts):
                 msg = "Vector index mismatches provided utterances"
                 raise ValueError(msg)
         else:
-            self.vector_index = vector_index_client.create_index(self.model_name, utterances, labels)
+            self.vector_index = vector_index_client.create_index(self.embedder_name, utterances, labels)
 
         self.features = (
             self.vector_index.embedder.embed(utterances)
@@ -218,4 +221,4 @@ def load(self, path: str) -> None:
             embedder_batch_size=self.metadata["batch_size"],
             embedder_max_length=self.metadata["max_length"],
         )
-        self.vector_index = vector_index_client.get_index(self.model_name)
+        self.vector_index = vector_index_client.get_index(self.embedder_name)
diff --git a/autointent/nodes/optimization/node_optimizer.py b/autointent/nodes/optimization/node_optimizer.py
index 2a483556..40ad41d3 100644
--- a/autointent/nodes/optimization/node_optimizer.py
+++ b/autointent/nodes/optimization/node_optimizer.py
@@ -39,6 +39,10 @@ def fit(self, context: Context) -> None:
             self._logger.debug("initializing %s module...", module_type)
             module = self.node_info.modules_available[module_type].from_context(context, **module_kwargs)
 
+            embedder_name = module.get_embedder_name()
+            if 
embedder_name is not None: + module_kwargs["embedder_name"] = embedder_name + self._logger.debug("optimizing %s module...", module_type) self.module_fit(module, context) diff --git a/tests/assets/configs/description.yaml b/tests/assets/configs/description.yaml index 3cec5470..749b646f 100644 --- a/tests/assets/configs/description.yaml +++ b/tests/assets/configs/description.yaml @@ -4,7 +4,7 @@ nodes: search_space: - module_type: vector_db k: [10] - model_name: + embedder_name: - sentence-transformers/all-MiniLM-L6-v2 - node_type: scoring metric: scoring_roc_auc diff --git a/tests/assets/configs/multiclass.yaml b/tests/assets/configs/multiclass.yaml index b61f1a62..84dc7539 100644 --- a/tests/assets/configs/multiclass.yaml +++ b/tests/assets/configs/multiclass.yaml @@ -4,7 +4,7 @@ nodes: search_space: - module_type: vector_db k: [10] - model_name: + embedder_name: - sentence-transformers/all-MiniLM-L6-v2 - avsolatorio/GIST-small-Embedding-v0 - node_type: scoring @@ -18,8 +18,6 @@ nodes: cross_encoder_name: - cross-encoder/ms-marco-MiniLM-L-6-v2 - avsolatorio/GIST-small-Embedding-v0 - search_model_name: - - sergeyzh/rubert-tiny-turbo k: [1, 3] train_head: [false, true] - node_type: prediction diff --git a/tests/assets/configs/multilabel.yaml b/tests/assets/configs/multilabel.yaml index 7a0358f8..e9d439da 100644 --- a/tests/assets/configs/multilabel.yaml +++ b/tests/assets/configs/multilabel.yaml @@ -4,7 +4,7 @@ nodes: search_space: - module_type: vector_db k: [10] - model_name: + embedder_name: - sentence-transformers/all-MiniLM-L6-v2 - avsolatorio/GIST-small-Embedding-v0 - node_type: scoring diff --git a/tests/assets/data/utterances.json b/tests/assets/data/utterances.json new file mode 100644 index 00000000..4d22a851 --- /dev/null +++ b/tests/assets/data/utterances.json @@ -0,0 +1,62 @@ +[ + "yes", + "can you give me a moderately priced restaurant", + "thank you good bye", + "vegetarian", + "what is the eagles address", + "telephone", + "what area is it in", + "post code", + "thank you and good bye", + "yes im looking for a traditional restaurant in the expensive price range", + "im trying to find a vegetarian restaurant and i dont care regarding the price range", + "does it have a television", + "hi im looking for a pub", + "thank you goodbye", + "yes i would like to know about a restaurant", + "thank you good bye", + "no", + "no no", + "uh what are some other eareas", + "no", + "thank you good bye", + "end of system audio no im looking for a seafood restaurant", + "ok what is the address phone number and price", + "yes please", + "yeah lets have you got anything in the mediterranean food in the area", + "what about any other area", + "no particular venue", + "whats the uh thank you and goodbye", + "next type of food cherry hinton area", + "breathing ok thank you goodbye", + "can you select me another venue", + "next choice", + "ok thank you good bye", + "ok thank you goodbye", + "noise ah hi i am looking for an", + "okay and uh", + "okay thank you goodbye", + "hi im looking for a pub having internet connection and have a tv", + "ah hi ah i am looking for a thia restaurant", + "hi im looking for a mediterranean restaurant in the rosemary area", + "is it in the cheap price range", + "is it in the kings hedge area", + "hi im looking for a contemporary restaurant and is it should be free", + "repeat", + "is it located in addenbrookes area", + "is it in the city center jesus christ this is ridiculous", + "repeat", + "say again", + "repeat", + "repeat", + "can we start again", + "start over", 
+ "not spanish food not spanish food fast", + "start over", + "uh cheap or moderate", + "start over", + "id like to start over please", + "i need a pub not bakers", + "i didnt ask for a moderate price range i need a pub with internet and tv", + "not so expensive price range" +] \ No newline at end of file diff --git a/tests/modules/prediction/test_treshold.py b/tests/modules/prediction/test_treshold.py index 3c602aae..824d3a57 100644 --- a/tests/modules/prediction/test_treshold.py +++ b/tests/modules/prediction/test_treshold.py @@ -11,7 +11,7 @@ def get_fit_data(db_dir, dataset): knn_params = { "k": 3, "weights": "distance", - "model_name": "sergeyzh/rubert-tiny-turbo", + "embedder_name": "sergeyzh/rubert-tiny-turbo", "db_dir": db_dir, } scorer = KNNScorer(**knn_params) diff --git a/tests/modules/retrieval/test_vectordb.py b/tests/modules/retrieval/test_vectordb.py index d3002078..2d72753e 100644 --- a/tests/modules/retrieval/test_vectordb.py +++ b/tests/modules/retrieval/test_vectordb.py @@ -2,6 +2,6 @@ def test_get_assets_returns_correct_artifact(tmp_path): - module = VectorDBModule(k=5, model_name="sergeyzh/rubert-tiny-turbo", db_dir=str(tmp_path)) + module = VectorDBModule(k=5, embedder_name="sergeyzh/rubert-tiny-turbo", db_dir=str(tmp_path)) artifact = module.get_assets() assert artifact.embedder_name == "sergeyzh/rubert-tiny-turbo" diff --git a/tests/modules/scoring/test_description.py b/tests/modules/scoring/test_description.py index 645019b6..c50e0a2e 100644 --- a/tests/modules/scoring/test_description.py +++ b/tests/modules/scoring/test_description.py @@ -17,7 +17,7 @@ def test_description_scorer(dataset, expected_prediction, multilabel): db_dir, dump_dir, logs_dir = setup_environment() data_handler = DataHandler(dataset, force_multilabel=multilabel) - scorer = DescriptionScorer(model_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir, temperature=0.3) + scorer = DescriptionScorer(embedder_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir, temperature=0.3) scorer.fit(data_handler.utterances_train, data_handler.labels_train, data_handler.label_description) assert scorer.description_vectors.shape[0] == len(data_handler.label_description) diff --git a/tests/modules/scoring/test_knn.py b/tests/modules/scoring/test_knn.py index f2be73ab..735adc1e 100644 --- a/tests/modules/scoring/test_knn.py +++ b/tests/modules/scoring/test_knn.py @@ -10,7 +10,7 @@ def test_base_knn(dataset): data_handler = DataHandler(dataset) - scorer = KNNScorer(k=3, weights="distance", model_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir) + scorer = KNNScorer(k=3, weights="distance", embedder_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir) scorer.fit(data_handler.utterances_train, data_handler.labels_train) predictions = scorer.predict( diff --git a/tests/modules/scoring/test_mlknn.py b/tests/modules/scoring/test_mlknn.py index c76318f1..6cf44386 100644 --- a/tests/modules/scoring/test_mlknn.py +++ b/tests/modules/scoring/test_mlknn.py @@ -24,7 +24,7 @@ def test_base_mlknn(dataset): ) data_handler = DataHandler(dataset, test_dataset, force_multilabel=True) - scorer = MLKnnScorer(db_dir=db_dir, k=3, model_name="sergeyzh/rubert-tiny-turbo") + scorer = MLKnnScorer(db_dir=db_dir, k=3, embedder_name="sergeyzh/rubert-tiny-turbo") scorer.fit(data_handler.utterances_train, data_handler.labels_train) predictions = scorer.predict_labels( diff --git a/tests/modules/test_regex.py b/tests/modules/test_regex.py index e73501cb..bd834d89 100644 --- a/tests/modules/test_regex.py +++ b/tests/modules/test_regex.py @@ -76,7 +76,7 @@ 
def test_base_regex(): db_dir=db_dir, ) - retrieval_params = {"k": 3, "model_name": "sergeyzh/rubert-tiny-turbo"} + retrieval_params = {"k": 3, "embedder_name": "sergeyzh/rubert-tiny-turbo"} vector_db = VectorDBModule(**retrieval_params) vector_db.fit(context) metric_value = vector_db.score(context, retrieval_hit_rate) diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index da221e2e..376d6182 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -26,7 +26,7 @@ def get_retrieval_optimizer(multilabel: bool): "search_space": [ { "k": [10], - "model_name": [ + "embedder_name": [ "sentence-transformers/all-MiniLM-L6-v2", ], "module_type": "vector_db", diff --git a/tests/nodes/test_retrieval.py b/tests/nodes/test_retrieval.py index 5d625692..683ca59d 100644 --- a/tests/nodes/test_retrieval.py +++ b/tests/nodes/test_retrieval.py @@ -57,7 +57,7 @@ def get_retrieval_optimizer(multilabel: bool): "search_space": [ { "k": [10], - "model_name": [ + "embedder_name": [ "sentence-transformers/all-MiniLM-L6-v2", "avsolatorio/GIST-small-Embedding-v0", ], diff --git a/tests/nodes/test_scoring.py b/tests/nodes/test_scoring.py index 8d037df2..70eddf91 100644 --- a/tests/nodes/test_scoring.py +++ b/tests/nodes/test_scoring.py @@ -22,23 +22,23 @@ def test_scoring_multiclass(context, retrieval_optimizer_multiclass): "module_type": "knn", "k": [3], "weights": ["uniform", "distance", "closest"], - "model_name": ["sergeyzh/rubert-tiny-turbo"], + "embedder_name": ["sergeyzh/rubert-tiny-turbo"], }, { "module_type": "linear", - "model_name": ["sergeyzh/rubert-tiny-turbo"], + "embedder_name": ["sergeyzh/rubert-tiny-turbo"], }, { "module_type": "dnnc", "cross_encoder_name": ["cross-encoder/ms-marco-MiniLM-L-6-v2"], - "search_model_name": ["sergeyzh/rubert-tiny-turbo"], + "embedder_name": ["sergeyzh/rubert-tiny-turbo"], "k": [3], "train_head": [False, True], }, { "module_type": "description", "temperature": [1.0, 0.5, 0.1, 0.05], - "model_name": ["sergeyzh/rubert-tiny-turbo"], + "embedder_name": ["sergeyzh/rubert-tiny-turbo"], }, ], } @@ -73,13 +73,13 @@ def test_scoring_multilabel(context, retrieval_optimizer_multilabel): "module_type": "knn", "weights": ["uniform", "distance", "closest"], "k": [3], - "model_name": ["sergeyzh/rubert-tiny-turbo"], + "embedder_name": ["sergeyzh/rubert-tiny-turbo"], }, { "module_type": "linear", - "model_name": ["sergeyzh/rubert-tiny-turbo"], + "embedder_name": ["sergeyzh/rubert-tiny-turbo"], }, - {"module_type": "mlknn", "k": [5], "model_name": ["sergeyzh/rubert-tiny-turbo"]}, + {"module_type": "mlknn", "k": [5], "embedder_name": ["sergeyzh/rubert-tiny-turbo"]}, ], } diff --git a/tests/pipeline/test_inference.py b/tests/pipeline/test_inference.py index 273c205b..d7facbea 100644 --- a/tests/pipeline/test_inference.py +++ b/tests/pipeline/test_inference.py @@ -103,9 +103,9 @@ def test_inference_pipeline_cli(dataset, task_type): context.dump() config = InferenceConfig( - data_path=ires.files("tests.assets.data").joinpath("clinc_subset.json"), + data_path=ires.files("tests.assets.data").joinpath("utterances.json"), source_dir=logging_config.dirpath, - output_path=logging_config.dump_dir, + output_path=logging_config.dump_dir / "predictions.json", log_level="CRITICAL", ) inference_pipeline(config) From 7ccbca2e15ae78ac7445a29d7cc8313f96232247 Mon Sep 17 00:00:00 2001 From: voorhs Date: Mon, 11 Nov 2024 11:35:47 +0300 Subject: [PATCH 21/21] response to review --- autointent/context/context.py | 6 +++--- autointent/modules/base.py | 1 + 
autointent/pipeline/optimization/cli_endpoint.py | 6 +++--- autointent/pipeline/optimization/pipeline_optimizer.py | 4 ++-- tests/nodes/conftest.py | 6 +++--- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/autointent/context/context.py b/autointent/context/context.py index 4e67a614..2e65305b 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -32,11 +32,11 @@ def __init__( self.seed = seed self._logger = logging.getLogger(__name__) - def config_logs(self, config: LoggingConfig) -> None: + def configure_logging(self, config: LoggingConfig) -> None: self.logging_config = config self.optimization_info = OptimizationInfo() - def config_vector_index(self, config: VectorIndexConfig, embedder_config: EmbedderConfig | None = None) -> None: + def configure_vector_index(self, config: VectorIndexConfig, embedder_config: EmbedderConfig | None = None) -> None: self.vector_index_config = config if embedder_config is None: embedder_config = EmbedderConfig() @@ -49,7 +49,7 @@ def config_vector_index(self, config: VectorIndexConfig, embedder_config: Embedd self.embedder_config.max_length, ) - def config_data(self, config: DataConfig, augmentation_config: AugmentationConfig | None = None) -> None: + def configure_data(self, config: DataConfig, augmentation_config: AugmentationConfig | None = None) -> None: if augmentation_config is not None: self.augmentation_config = AugmentationConfig() augmenter = DataAugmenter( diff --git a/autointent/modules/base.py b/autointent/modules/base.py index 1aaa7eac..e89ef49c 100644 --- a/autointent/modules/base.py +++ b/autointent/modules/base.py @@ -54,4 +54,5 @@ def from_context(cls, context: Context, **kwargs: dict[str, Any]) -> Self: pass def get_embedder_name(self) -> str | None: + """experimental method""" return None diff --git a/autointent/pipeline/optimization/cli_endpoint.py b/autointent/pipeline/optimization/cli_endpoint.py index a8bfa975..ef2cd70e 100644 --- a/autointent/pipeline/optimization/cli_endpoint.py +++ b/autointent/pipeline/optimization/cli_endpoint.py @@ -19,9 +19,9 @@ def main(cfg: OptimizationConfig) -> None: # create shared objects for a whole pipeline context = Context(cfg.seed) - context.config_logs(cfg.logs) - context.config_vector_index(cfg.vector_index, cfg.embedder) - context.config_data(cfg.data, cfg.augmentation) + context.configure_logging(cfg.logs) + context.configure_vector_index(cfg.vector_index, cfg.embedder) + context.configure_data(cfg.data, cfg.augmentation) # run optimization search_space_config = load_config(cfg.task.search_space_path, context.is_multilabel(), logger) diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index e4629aee..d85c0747 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -54,8 +54,8 @@ def optimize_from_dataset( ) -> Context: context = Context() context.set_datasets(train_data, val_data, force_multilabel) - context.config_logs(self.logging_config) - context.config_vector_index(self.vector_index_config, self.embedder_config) + context.configure_logging(self.logging_config) + context.configure_vector_index(self.vector_index_config, self.embedder_config) self.optimize(context) self.inference_config = context.optimization_info.get_inference_nodes_config() diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index 376d6182..9cde1f71 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -75,9 
+75,9 @@ def context(dataset_path): def _context(multilabel: bool): res = Context() - res.config_data(DataConfig(dataset_path, force_multilabel=multilabel)) - res.config_logs(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir, dump_modules=True)) - res.config_vector_index(VectorIndexConfig(db_dir=db_dir)) + res.configure_data(DataConfig(dataset_path, force_multilabel=multilabel)) + res.configure_logging(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir, dump_modules=True)) + res.configure_vector_index(VectorIndexConfig(db_dir=db_dir)) return res return _context
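
---

A minimal end-to-end sketch of the API this series leaves behind, assembled from the new tests in tests/pipeline/test_inference.py and tests/pipeline/test_optimization.py: modules take embedder_name instead of model_name / search_model_name (patch 20), Context is configured through the configure_* helpers (patch 21), and an inference pipeline can be built straight from the optimization context (patch 17). Everything below is illustrative only, not part of any patch; the search-space values, the paths, the prediction metric name, and the dataset object are placeholders.

from pathlib import Path

from autointent.configs.optimization_cli import EmbedderConfig, LoggingConfig, VectorIndexConfig
from autointent.pipeline.inference import InferencePipeline
from autointent.pipeline.optimization import PipelineOptimizer

# Hypothetical search space in the post-patch-20 format: note the
# `embedder_name` key that replaced `model_name` in the YAML configs.
search_space = {
    "nodes": [
        {
            "node_type": "retrieval",
            "metric": "retrieval_hit_rate",
            "search_space": [
                {
                    "module_type": "vector_db",
                    "k": [10],
                    "embedder_name": ["sentence-transformers/all-MiniLM-L6-v2"],
                },
            ],
        },
        {
            "node_type": "scoring",
            "metric": "scoring_roc_auc",
            "search_space": [
                {
                    "module_type": "knn",
                    "k": [3],
                    "weights": ["distance"],
                    "embedder_name": ["sergeyzh/rubert-tiny-turbo"],
                },
            ],
        },
        {
            "node_type": "prediction",
            "metric": "prediction_accuracy",  # assumed metric name, for illustration
            "search_space": [{"module_type": "argmax"}],
        },
    ],
}

optimizer = PipelineOptimizer.from_dict_config(search_space)
optimizer.set_config(LoggingConfig(dirpath=Path("runs").resolve(), dump_modules=True))
# With save_db=False, optimize() instead removes the vector database from
# the file system once all nodes are fitted (patches 14 and 18).
optimizer.set_config(VectorIndexConfig(db_dir=Path("runs/db").resolve(), device="cpu", save_db=True))
optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))

dataset = ...  # a Dataset instance, prepared the same way the test fixtures do

context = optimizer.optimize_from_dataset(dataset, force_multilabel=False)
pipeline = InferencePipeline.from_config(context.optimization_info.get_inference_nodes_config())
print(pipeline.predict(["hello world"]))

The get_embedder_name hook added in patch 20 appears to exist so that the resolved embedder ends up in the logged trial parameters even when a scoring module inherited it from the retrieval stage rather than from its own search space.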