From f3d77775fca7a4f6ae537326716cde2ead4340ff Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 12:14:42 +0300 Subject: [PATCH 01/21] stage result --- .../optimization/utils => configs}/name.py | 6 + autointent/configs/optimization_cli.py | 2 +- autointent/context/context.py | 157 +++++++++++++++--- .../{ => context/optimization_info}/logger.py | 0 .../optimization_info/optimization_info.py | 2 +- autointent/modules/retrieval/vectordb.py | 8 +- autointent/modules/scoring/knn/knn.py | 8 +- autointent/modules/scoring/linear.py | 8 +- .../nodes/optimization/node_optimizer.py | 15 +- .../pipeline/optimization/cli_endpoint.py | 21 +-- .../optimization/pipeline_optimizer.py | 44 ----- .../pipeline/optimization/utils/__init__.py | 8 +- autointent/pipeline/optimization/utils/cli.py | 27 --- .../pipeline/optimization/utils/dump.py | 20 --- 14 files changed, 177 insertions(+), 149 deletions(-) rename autointent/{pipeline/optimization/utils => configs}/name.py (95%) rename autointent/{ => context/optimization_info}/logger.py (100%) delete mode 100644 autointent/pipeline/optimization/utils/dump.py diff --git a/autointent/pipeline/optimization/utils/name.py b/autointent/configs/name.py similarity index 95% rename from autointent/pipeline/optimization/utils/name.py rename to autointent/configs/name.py index 73e1be68..8caaed8e 100644 --- a/autointent/pipeline/optimization/utils/name.py +++ b/autointent/configs/name.py @@ -1,4 +1,5 @@ import random +from datetime import datetime adjectives = [ "adorable", @@ -342,3 +343,8 @@ def generate_name() -> str: adjective = random.choice(adjectives) noun = random.choice(nouns) return f"{adjective}_{noun}" + +def get_run_name(run_name: str | None = None) -> str: + if run_name is None: + run_name = generate_name() + return f"{run_name}_{datetime.now().strftime('%m-%d-%Y_%H-%M-%S')}" # noqa: DTZ005 diff --git a/autointent/configs/optimization_cli.py b/autointent/configs/optimization_cli.py index bcc2d600..576ba64f 100644 --- a/autointent/configs/optimization_cli.py +++ b/autointent/configs/optimization_cli.py @@ -6,7 +6,7 @@ from hydra.core.config_store import ConfigStore from omegaconf import MISSING -from autointent.pipeline.optimization.utils import generate_name +from .name import generate_name @dataclass diff --git a/autointent/context/context.py b/autointent/context/context.py index d2d46014..1ea32b2b 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -1,42 +1,77 @@ +import importlib.resources as ires +import json +import logging from dataclasses import asdict from pathlib import Path from typing import Any +import numpy as np +import yaml +from omegaconf import ListConfig + +from autointent.configs.optimization_cli import ( + AugmentationConfig, + DataConfig, + EmbedderConfig, + LoggingConfig, + VectorIndexConfig, +) + from .data_handler import DataAugmenter, DataHandler, Dataset from .optimization_info import OptimizationInfo from .vector_index_client import VectorIndex, VectorIndexClient class Context: - def __init__( # noqa: PLR0913 + data_handler: DataHandler + vector_index_client: VectorIndexClient + optimization_info: OptimizationInfo + + def __init__( self, - dataset: Dataset, - test_dataset: Dataset | None = None, - device: str = "cpu", - multilabel_generation_config: str | None = None, - regex_sampling: int = 0, seed: int = 42, - db_dir: str | Path | None = None, - dump_dir: str | Path | None = None, - force_multilabel: bool = False, - embedder_batch_size: int = 32, - embedder_max_length: int | None = None, ) -> 
None:
-        augmenter = DataAugmenter(multilabel_generation_config, regex_sampling, seed)
+        self.seed = seed
+        self._logger = logging.getLogger(__name__)
+
+    def config_logs(self, config: LoggingConfig) -> None:
+        self.logging_config = config
+        self.optimization_info = OptimizationInfo()
+
+    def config_vector_index(self, config: VectorIndexConfig, embedder_config: EmbedderConfig | None = None) -> None:
+        self.vector_index_config = config
+        if embedder_config is None:
+            embedder_config = EmbedderConfig()
+        self.embedder_config = embedder_config
+
+        self.vector_index_client = VectorIndexClient(
+            self.vector_index_config.device,
+            self.vector_index_config.db_dir,
+            self.embedder_config.batch_size,
+            self.embedder_config.max_length,
+        )
+
+    def config_data(self, config: DataConfig, augmentation_config: AugmentationConfig | None = None) -> None:
+        if augmentation_config is not None:
+            self.augmentation_config = augmentation_config
+            augmenter = DataAugmenter(
+                self.augmentation_config.multilabel_generation_config,
+                self.augmentation_config.regex_sampling,
+                self.seed,
+            )
+        else:
+            augmenter = None
+
         self.data_handler = DataHandler(
-            dataset, test_dataset, random_seed=seed, force_multilabel=force_multilabel, augmenter=augmenter
+            dataset=load_data(config.train_path),
+            test_dataset=None if config.test_path is None else load_data(config.test_path),
+            random_seed=self.seed,
+            force_multilabel=config.force_multilabel,
+            augmenter=augmenter,
         )
 
-        self.optimization_info = OptimizationInfo()
-        self.vector_index_client = VectorIndexClient(device, db_dir, embedder_batch_size, embedder_max_length)
-        self.db_dir = self.vector_index_client.db_dir
-        self.embedder_max_length = embedder_max_length
-        self.embedder_batch_size = embedder_batch_size
-        self.device = device
         self.multilabel = self.data_handler.multilabel
         self.n_classes = self.data_handler.n_classes
-        self.seed = seed
-        self.dump_dir = Path.cwd() / "modules_dumps" if dump_dir is None else Path(dump_dir)
 
     def get_best_index(self) -> VectorIndex:
         model_name = self.optimization_info.get_best_embedder()
@@ -55,3 +90,83 @@ def get_inference_config(self) -> dict[str, Any]:
             },
             "nodes_configs": nodes_configs,
         }
+
+    def dump(self) -> None:
+        self._logger.debug("dumping logs...")
+        optimization_results = self.optimization_info.dump_evaluation_results()
+
+        logs_dir = self.logging_config.dirpath
+
+        # create appropriate directory
+        logs_dir.mkdir(parents=True, exist_ok=True)
+
+        # dump search space and evaluation results
+        logs_path = logs_dir / "logs.json"
+        with logs_path.open("w") as file:
+            json.dump(optimization_results, file, indent=4, ensure_ascii=False, cls=NumpyEncoder)
+        # config_path = logs_dir / "config.yaml"
+        # with config_path.open("w") as file:
+        #     yaml.dump(self.config, file)
+
+        # self._logger.info(make_report(optimization_results, nodes=nodes))
+
+        # dump train and test data splits
+        train_data, test_data = self.data_handler.dump()
+        train_path = logs_dir / "train_data.json"
+        test_path = logs_dir / "test_data.json"
+        with train_path.open("w") as file:
+            json.dump(train_data, file, indent=4, ensure_ascii=False)
+        with test_path.open("w") as file:
+            json.dump(test_data, file, indent=4, ensure_ascii=False)
+
+        self._logger.info("logs and other assets are saved to %s", logs_dir)
+
+        # dump optimization results (config for inference)
+        inference_config = self.get_inference_config()
+        inference_config_path = logs_dir / "inference_config.yaml"
+        with inference_config_path.open("w") as file:
+            yaml.dump(inference_config, file)
+
+    def 
get_db_dir(self) -> Path: + return self.vector_index_client.db_dir + + def get_device(self) -> str: + return self.vector_index_client.device + + def get_batch_size(self) -> int: + return self.vector_index_client.embedder_batch_size + + def get_max_length(self) -> int | None: + return self.vector_index_client.embedder_max_length + + def get_dump_dir(self) -> Path: + return self.logging_config.dump_dir + +class NumpyEncoder(json.JSONEncoder): + """Helper for dumping logs. Problem explained: https://stackoverflow.com/q/50916422""" + + def default(self, obj: Any) -> str | int | float | list[Any] | Any: # noqa: ANN401 + if isinstance(obj, np.integer): + return int(obj) + if isinstance(obj, np.floating): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + if isinstance(obj, ListConfig): + return list(obj) + return super().default(obj) + + +def load_data(data_path: str | Path) -> Dataset: + """load data from the given path or load sample data which is distributed along with the autointent package""" + if data_path == "default-multiclass": + with ires.files("autointent.datafiles").joinpath("banking77.json").open() as file: + res = json.load(file) + elif data_path == "default-multilabel": + with ires.files("autointent.datafiles").joinpath("dstc3-20shot.json").open() as file: + res = json.load(file) + else: + with Path(data_path).open() as file: + res = json.load(file) + + return Dataset.model_validate(res) diff --git a/autointent/logger.py b/autointent/context/optimization_info/logger.py similarity index 100% rename from autointent/logger.py rename to autointent/context/optimization_info/logger.py diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index 36795151..3cfa2a72 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -4,9 +4,9 @@ from numpy.typing import NDArray from autointent.configs.node import InferenceNodeConfig -from autointent.logger import get_logger from .data_models import Artifact, Artifacts, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds +from .logger import get_logger class OptimizationInfo: diff --git a/autointent/modules/retrieval/vectordb.py b/autointent/modules/retrieval/vectordb.py index cae9beb6..b1e55380 100644 --- a/autointent/modules/retrieval/vectordb.py +++ b/autointent/modules/retrieval/vectordb.py @@ -51,10 +51,10 @@ def from_context( return cls( k=k, model_name=model_name, - db_dir=str(context.db_dir), - device=context.device, - batch_size=context.embedder_batch_size, - max_length=context.embedder_max_length, + db_dir=str(context.get_db_dir()), + device=context.get_device(), + batch_size=context.get_batch_size(), + max_length=context.get_max_length(), ) def fit(self, utterances: list[str], labels: list[LabelType]) -> None: diff --git a/autointent/modules/scoring/knn/knn.py b/autointent/modules/scoring/knn/knn.py index 0c53cadf..420bf604 100644 --- a/autointent/modules/scoring/knn/knn.py +++ b/autointent/modules/scoring/knn/knn.py @@ -76,10 +76,10 @@ def from_context( model_name=model_name, k=k, weights=weights, - db_dir=str(context.db_dir), - device=context.device, - batch_size=context.embedder_batch_size, - max_length=context.embedder_max_length, + db_dir=str(context.get_db_dir()), + device=context.get_device(), + batch_size=context.get_batch_size(), + max_length=context.get_max_length(), ) instance.prebuilt_index = prebuilt_index return instance diff --git 
a/autointent/modules/scoring/linear.py b/autointent/modules/scoring/linear.py index adab5b43..685eb74b 100644 --- a/autointent/modules/scoring/linear.py +++ b/autointent/modules/scoring/linear.py @@ -77,13 +77,13 @@ def from_context( instance = cls( model_name=model_name, - device=context.device, + device=context.get_device(), seed=context.seed, - batch_size=context.embedder_batch_size, - max_length=context.embedder_max_length, + batch_size=context.get_batch_size(), + max_length=context.get_max_length(), ) instance.precomputed_embeddings = precomputed_embeddings - instance.db_dir = str(context.db_dir) + instance.db_dir = str(context.get_db_dir()) return instance def fit( diff --git a/autointent/nodes/optimization/node_optimizer.py b/autointent/nodes/optimization/node_optimizer.py index 72139644..77c94652 100644 --- a/autointent/nodes/optimization/node_optimizer.py +++ b/autointent/nodes/optimization/node_optimizer.py @@ -46,7 +46,7 @@ def fit(self, context: Context) -> None: metric_value = module.score(context, self.node_info.metrics_available[self.metric_name]) assets = module.get_assets() - module_dump_dir = self.get_module_dump_dir(context.dump_dir, module_type, j_combination) + module_dump_dir = self.get_module_dump_dir(context.get_dump_dir(), module_type, j_combination) module.dump(module_dump_dir) context.optimization_info.log_module_optimization( @@ -88,3 +88,16 @@ def module_fit(self, module: Module, context: Context) -> None: self._logger.error(msg) raise ValueError(msg) module.fit(*args) # type: ignore[arg-type] + + # @overload + # def fit( + # self, + # utterances: list[str], + # labels: list[LabelType], + # tags: list[Tag] | None = None, + # label_descriptions: list[str] | None = None, + # ) -> None: + # # create context object from given data + # ... 
+ + # # call fit(context) diff --git a/autointent/pipeline/optimization/cli_endpoint.py b/autointent/pipeline/optimization/cli_endpoint.py index eb390804..806ebb45 100644 --- a/autointent/pipeline/optimization/cli_endpoint.py +++ b/autointent/pipeline/optimization/cli_endpoint.py @@ -6,7 +6,7 @@ from autointent.configs.optimization_cli import OptimizationConfig from .pipeline_optimizer import PipelineOptimizer -from .utils import load_config, load_data +from .utils import load_config @hydra.main(config_name="optimization_config", config_path=".", version_base=None) @@ -18,19 +18,10 @@ def main(cfg: OptimizationConfig) -> None: logger.debug("Vector index path: %s", cfg.vector_index.db_dir) # create shared objects for a whole pipeline - context = Context( - load_data(cfg.data.train_path), - None if cfg.data.test_path is None else load_data(cfg.data.test_path), - cfg.vector_index.device, - cfg.augmentation.multilabel_generation_config, - cfg.augmentation.regex_sampling, - cfg.seed, - cfg.vector_index.db_dir, - cfg.logs.dump_dir, - cfg.data.force_multilabel, - cfg.embedder.batch_size, - cfg.embedder.max_length, - ) + context = Context(cfg.seed) + context.config_logs(cfg.logs) + context.config_vector_index(cfg.vector_index, cfg.embedder) + context.config_data(cfg.data, cfg.augmentation) # run optimization search_space_config = load_config(cfg.task.search_space_path, context.multilabel, logger) @@ -38,4 +29,4 @@ def main(cfg: OptimizationConfig) -> None: pipeline.optimize(context) # save results - pipeline.dump(cfg.logs.dirpath) + context.dump() diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index d0532164..5125b8c5 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -1,18 +1,14 @@ import json import logging -from pathlib import Path from typing import Any import numpy as np -import yaml from hydra.utils import instantiate from autointent import Context from autointent.configs.pipeline_optimizer import PipelineOptimizerConfig from autointent.nodes import NodeOptimizer -from .utils import NumpyEncoder - class PipelineOptimizer: def __init__(self, nodes: list[NodeOptimizer]) -> None: @@ -29,46 +25,6 @@ def optimize(self, context: Context) -> None: for node_optimizer in self.nodes: node_optimizer.fit(context) - def dump(self, logs_dir: str | Path | None) -> None: - self._logger.debug("dumping logs...") - optimization_results = self.context.optimization_info.dump_evaluation_results() - - if logs_dir is None: - logs_dir = Path.cwd() / "pipeline_optimize" - if isinstance(logs_dir, str): - logs_dir = Path(logs_dir) - - # create appropriate directory - logs_dir.mkdir(parents=True, exist_ok=True) - - # dump search space and evaluation results - logs_path = logs_dir / "logs.json" - with logs_path.open("w") as file: - json.dump(optimization_results, file, indent=4, ensure_ascii=False, cls=NumpyEncoder) - # config_path = logs_dir / "config.yaml" - # with config_path.open("w") as file: - # yaml.dump(self.config, file) - - nodes = [node_config.node_info.node_type for node_config in self.nodes] - self._logger.info(make_report(optimization_results, nodes=nodes)) - - # dump train and test data splits - train_data, test_data = self.context.data_handler.dump() - train_path = logs_dir / "train_data.json" - test_path = logs_dir / "test_data.json" - with train_path.open("w") as file: - json.dump(train_data, file, indent=4, ensure_ascii=False) - with 
test_path.open("w") as file: - json.dump(test_data, file, indent=4, ensure_ascii=False) - - self._logger.info("logs and other assets are saved to %s", logs_dir) - - # dump optimization results (config for inference) - inference_config = self.context.get_inference_config() - inference_config_path = logs_dir / "inference_config.yaml" - with inference_config_path.open("w") as file: - yaml.dump(inference_config, file) - def make_report(logs: dict[str, Any], nodes: list[str]) -> str: ids = [np.argmax(logs["metrics"][node]) for node in nodes] diff --git a/autointent/pipeline/optimization/utils/__init__.py b/autointent/pipeline/optimization/utils/__init__.py index 4bd0f741..e0aa17b6 100644 --- a/autointent/pipeline/optimization/utils/__init__.py +++ b/autointent/pipeline/optimization/utils/__init__.py @@ -1,12 +1,6 @@ -from .cli import get_logs_dir, get_run_name, load_config, load_data -from .dump import NumpyEncoder -from .name import generate_name +from .cli import get_logs_dir, load_config __all__ = [ - "NumpyEncoder", - "generate_name", - "get_run_name", "load_config", - "load_data", "get_logs_dir", ] diff --git a/autointent/pipeline/optimization/utils/cli.py b/autointent/pipeline/optimization/utils/cli.py index e49d0624..2cd4e911 100644 --- a/autointent/pipeline/optimization/utils/cli.py +++ b/autointent/pipeline/optimization/utils/cli.py @@ -1,37 +1,10 @@ import importlib.resources as ires -import json -from datetime import datetime from logging import Logger from pathlib import Path from typing import Any import yaml -from autointent.context.data_handler import Dataset - -from .name import generate_name - - -def load_data(data_path: str | Path) -> Dataset: - """load data from the given path or load sample data which is distributed along with the autointent package""" - if data_path == "default-multiclass": - with ires.files("autointent.datafiles").joinpath("banking77.json").open() as file: - res = json.load(file) - elif data_path == "default-multilabel": - with ires.files("autointent.datafiles").joinpath("dstc3-20shot.json").open() as file: - res = json.load(file) - else: - with Path(data_path).open() as file: - res = json.load(file) - - return Dataset.model_validate(res) - - -def get_run_name(run_name: str | None = None) -> str: - if run_name is None: - run_name = generate_name() - return f"{run_name}_{datetime.now().strftime('%m-%d-%Y_%H-%M-%S')}" # noqa: DTZ005 - def get_logs_dir(run_name: str, logs_dir: Path | None = None) -> Path: if logs_dir is None: diff --git a/autointent/pipeline/optimization/utils/dump.py b/autointent/pipeline/optimization/utils/dump.py deleted file mode 100644 index 95167a5e..00000000 --- a/autointent/pipeline/optimization/utils/dump.py +++ /dev/null @@ -1,20 +0,0 @@ -import json -from typing import Any - -import numpy as np -from omegaconf import ListConfig - - -class NumpyEncoder(json.JSONEncoder): - """Helper for dumping logs. 
Problem explained: https://stackoverflow.com/q/50916422""" - - def default(self, obj: Any) -> str | int | float | list[Any] | Any: # noqa: ANN401 - if isinstance(obj, np.integer): - return int(obj) - if isinstance(obj, np.floating): - return float(obj) - if isinstance(obj, np.ndarray): - return obj.tolist() - if isinstance(obj, ListConfig): - return list(obj) - return super().default(obj) From b192dc8505216a010038ebef5ea31c97429c6ed6 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 12:27:04 +0300 Subject: [PATCH 02/21] decompose `Context.__init__()` and implement `get_` methods --- autointent/configs/name.py | 1 + autointent/context/context.py | 16 ++++++++++------ autointent/modules/prediction/base.py | 4 +++- autointent/modules/prediction/threshold.py | 4 ++-- autointent/modules/scoring/dnnc/dnnc.py | 8 ++++---- autointent/pipeline/optimization/cli_endpoint.py | 2 +- 6 files changed, 21 insertions(+), 14 deletions(-) diff --git a/autointent/configs/name.py b/autointent/configs/name.py index 8caaed8e..5d639217 100644 --- a/autointent/configs/name.py +++ b/autointent/configs/name.py @@ -344,6 +344,7 @@ def generate_name() -> str: noun = random.choice(nouns) return f"{adjective}_{noun}" + def get_run_name(run_name: str | None = None) -> str: if run_name is None: run_name = generate_name() diff --git a/autointent/context/context.py b/autointent/context/context.py index 1ea32b2b..ec2ebac1 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -70,9 +70,6 @@ def config_data(self, config: DataConfig, augmentation_config: AugmentationConfi augmenter=augmenter, ) - self.multilabel = self.data_handler.multilabel - self.n_classes = self.data_handler.n_classes - def get_best_index(self) -> VectorIndex: model_name = self.optimization_info.get_best_embedder() return self.vector_index_client.get_index(model_name) @@ -83,9 +80,9 @@ def get_inference_config(self) -> dict[str, Any]: cfg.pop("_target_") return { "metadata": { - "device": self.device, - "multilabel": self.multilabel, - "n_classes": self.n_classes, + "device": self.get_device(), + "multilabel": self.is_multilabel(), + "n_classes": self.get_n_classes(), "seed": self.seed, }, "nodes_configs": nodes_configs, @@ -142,6 +139,13 @@ def get_max_length(self) -> int | None: def get_dump_dir(self) -> Path: return self.logging_config.dump_dir + def is_multilabel(self) -> bool: + return self.data_handler.multilabel + + def get_n_classes(self) -> int: + return self.data_handler.n_classes + + class NumpyEncoder(json.JSONEncoder): """Helper for dumping logs. 
Problem explained: https://stackoverflow.com/q/50916422""" diff --git a/autointent/modules/prediction/base.py b/autointent/modules/prediction/base.py index 06cb2c14..17b8de50 100644 --- a/autointent/modules/prediction/base.py +++ b/autointent/modules/prediction/base.py @@ -51,7 +51,9 @@ def get_prediction_evaluation_data( oos_scores = context.optimization_info.get_best_oos_scores() return_scores = scores if oos_scores is not None: - oos_labels = [[0] * context.n_classes] * len(oos_scores) if context.multilabel else [-1] * len(oos_scores) # type: ignore[list-item] + oos_labels = ( + [[0] * context.get_n_classes()] * len(oos_scores) if context.is_multilabel() else [-1] * len(oos_scores) + ) # type: ignore[list-item] labels = np.concatenate([labels, np.array(oos_labels)]) return_scores = np.concatenate([scores, oos_scores]) diff --git a/autointent/modules/prediction/threshold.py b/autointent/modules/prediction/threshold.py index 5e43533f..c25c3270 100644 --- a/autointent/modules/prediction/threshold.py +++ b/autointent/modules/prediction/threshold.py @@ -44,8 +44,8 @@ def __init__( def from_context(cls, context: Context, thresh: float | npt.NDArray[Any] = 0.5) -> Self: return cls( thresh=thresh, - multilabel=context.multilabel, - n_classes=context.n_classes, + multilabel=context.is_multilabel(), + n_classes=context.get_n_classes(), ) def fit( diff --git a/autointent/modules/scoring/dnnc/dnnc.py b/autointent/modules/scoring/dnnc/dnnc.py index 901056d3..2b20d8ce 100644 --- a/autointent/modules/scoring/dnnc/dnnc.py +++ b/autointent/modules/scoring/dnnc/dnnc.py @@ -82,10 +82,10 @@ def from_context( search_model_name=search_model_name, k=k, train_head=train_head, - device=context.device, - db_dir=str(context.db_dir), - batch_size=context.embedder_batch_size, - max_length=context.embedder_max_length, + device=context.get_device(), + db_dir=str(context.get_db_dir()), + batch_size=context.get_batch_size(), + max_length=context.get_max_length(), ) instance.prebuilt_index = prebuilt_index return instance diff --git a/autointent/pipeline/optimization/cli_endpoint.py b/autointent/pipeline/optimization/cli_endpoint.py index 806ebb45..a8bfa975 100644 --- a/autointent/pipeline/optimization/cli_endpoint.py +++ b/autointent/pipeline/optimization/cli_endpoint.py @@ -24,7 +24,7 @@ def main(cfg: OptimizationConfig) -> None: context.config_data(cfg.data, cfg.augmentation) # run optimization - search_space_config = load_config(cfg.task.search_space_path, context.multilabel, logger) + search_space_config = load_config(cfg.task.search_space_path, context.is_multilabel(), logger) pipeline = PipelineOptimizer.from_dict_config(search_space_config) pipeline.optimize(context) From 0f6f568d1f6632b20f379cb8aa947d38c286b7e9 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 12:59:59 +0300 Subject: [PATCH 03/21] fix tests --- autointent/context/context.py | 36 +---------------- autointent/context/utils.py | 39 +++++++++++++++++++ .../scoring/description/description.py | 4 +- autointent/modules/scoring/mlknn/mlknn.py | 8 ++-- tests/conftest.py | 16 ++++---- .../datahandler/test_multilabel_generation.py | 13 ++++--- tests/context/test_vector_index.py | 13 ++++--- tests/modules/prediction/test_treshold.py | 19 ++++----- tests/modules/scoring/test_description.py | 7 ++-- tests/modules/scoring/test_dnnc.py | 5 ++- tests/modules/scoring/test_knn.py | 7 ++-- tests/modules/scoring/test_linear.py | 5 ++- tests/modules/scoring/test_mlknn.py | 7 ++-- tests/modules/test_regex.py | 7 ++-- tests/nodes/conftest.py | 12 ++++-- 
tests/pipeline/test_optimization.py | 25 ++---------- 16 files changed, 113 insertions(+), 110 deletions(-) create mode 100644 autointent/context/utils.py diff --git a/autointent/context/context.py b/autointent/context/context.py index ec2ebac1..23186518 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -1,13 +1,10 @@ -import importlib.resources as ires import json import logging from dataclasses import asdict from pathlib import Path from typing import Any -import numpy as np import yaml -from omegaconf import ListConfig from autointent.configs.optimization_cli import ( AugmentationConfig, @@ -17,8 +14,9 @@ VectorIndexConfig, ) -from .data_handler import DataAugmenter, DataHandler, Dataset +from .data_handler import DataAugmenter, DataHandler from .optimization_info import OptimizationInfo +from .utils import NumpyEncoder, load_data from .vector_index_client import VectorIndex, VectorIndexClient @@ -144,33 +142,3 @@ def is_multilabel(self) -> bool: def get_n_classes(self) -> int: return self.data_handler.n_classes - - -class NumpyEncoder(json.JSONEncoder): - """Helper for dumping logs. Problem explained: https://stackoverflow.com/q/50916422""" - - def default(self, obj: Any) -> str | int | float | list[Any] | Any: # noqa: ANN401 - if isinstance(obj, np.integer): - return int(obj) - if isinstance(obj, np.floating): - return float(obj) - if isinstance(obj, np.ndarray): - return obj.tolist() - if isinstance(obj, ListConfig): - return list(obj) - return super().default(obj) - - -def load_data(data_path: str | Path) -> Dataset: - """load data from the given path or load sample data which is distributed along with the autointent package""" - if data_path == "default-multiclass": - with ires.files("autointent.datafiles").joinpath("banking77.json").open() as file: - res = json.load(file) - elif data_path == "default-multilabel": - with ires.files("autointent.datafiles").joinpath("dstc3-20shot.json").open() as file: - res = json.load(file) - else: - with Path(data_path).open() as file: - res = json.load(file) - - return Dataset.model_validate(res) diff --git a/autointent/context/utils.py b/autointent/context/utils.py new file mode 100644 index 00000000..efad3884 --- /dev/null +++ b/autointent/context/utils.py @@ -0,0 +1,39 @@ +import importlib.resources as ires +import json +from pathlib import Path +from typing import Any + +import numpy as np +from omegaconf import ListConfig + +from .data_handler import Dataset + + +class NumpyEncoder(json.JSONEncoder): + """Helper for dumping logs. 
Problem explained: https://stackoverflow.com/q/50916422""" + + def default(self, obj: Any) -> str | int | float | list[Any] | Any: # noqa: ANN401 + if isinstance(obj, np.integer): + return int(obj) + if isinstance(obj, np.floating): + return float(obj) + if isinstance(obj, np.ndarray): + return obj.tolist() + if isinstance(obj, ListConfig): + return list(obj) + return super().default(obj) + + +def load_data(data_path: str | Path) -> Dataset: + """load data from the given path or load sample data which is distributed along with the autointent package""" + if data_path == "default-multiclass": + with ires.files("autointent.datafiles").joinpath("banking77.json").open() as file: + res = json.load(file) + elif data_path == "default-multilabel": + with ires.files("autointent.datafiles").joinpath("dstc3-20shot.json").open() as file: + res = json.load(file) + else: + with Path(data_path).open() as file: + res = json.load(file) + + return Dataset.model_validate(res) diff --git a/autointent/modules/scoring/description/description.py b/autointent/modules/scoring/description/description.py index 97faf7ac..51d1a990 100644 --- a/autointent/modules/scoring/description/description.py +++ b/autointent/modules/scoring/description/description.py @@ -63,8 +63,8 @@ def from_context( instance = cls( temperature=temperature, - device=context.device, - db_dir=context.db_dir, + device=context.get_device(), + db_dir=context.get_db_dir(), model_name=model_name, ) instance.precomputed_embeddings = precomputed_embeddings diff --git a/autointent/modules/scoring/mlknn/mlknn.py b/autointent/modules/scoring/mlknn/mlknn.py index e51a223b..fc06eaee 100644 --- a/autointent/modules/scoring/mlknn/mlknn.py +++ b/autointent/modules/scoring/mlknn/mlknn.py @@ -80,10 +80,10 @@ def from_context( model_name=model_name, s=s, ignore_first_neighbours=ignore_first_neighbours, - db_dir=str(context.db_dir), - device=context.device, - batch_size=context.embedder_batch_size, - max_length=context.embedder_max_length, + db_dir=str(context.get_db_dir()), + device=context.get_device(), + batch_size=context.get_batch_size(), + max_length=context.get_max_length(), ) instance.prebuilt_index = prebuilt_index return instance diff --git a/tests/conftest.py b/tests/conftest.py index 83832e6e..08f95300 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,21 +3,21 @@ import pytest -from autointent.pipeline.optimization.utils import load_data +from autointent.context.utils import load_data -@pytest.fixture def setup_environment() -> tuple[str, str]: logs_dir = ires.files("tests").joinpath("logs") + db_dir = logs_dir / "db" / str(uuid4()) + dump_dir = logs_dir / "modules_dump" + return db_dir, dump_dir, logs_dir - def get_db_dir(): - return logs_dir / "db" / str(uuid4()) - dump_dir = logs_dir / "modules_dump" - return get_db_dir, dump_dir, logs_dir +@pytest.fixture +def dataset_path(): + return ires.files("tests.assets.data").joinpath("clinc_subset.json") @pytest.fixture -def dataset(): - dataset_path = ires.files("tests.assets.data").joinpath("clinc_subset.json") +def dataset(dataset_path): return load_data(dataset_path) diff --git a/tests/context/datahandler/test_multilabel_generation.py b/tests/context/datahandler/test_multilabel_generation.py index 6840f29d..9a0bb76b 100644 --- a/tests/context/datahandler/test_multilabel_generation.py +++ b/tests/context/datahandler/test_multilabel_generation.py @@ -4,6 +4,7 @@ from autointent.context.data_handler import DataHandler from autointent.context.vector_index_client import VectorIndexClient +from 
tests.conftest import setup_environment @pytest.fixture @@ -15,12 +16,12 @@ def mock_data_handler(): @pytest.fixture -def vector_index(setup_environment): - db_dir, dump_dir, logs_dir = setup_environment - return VectorIndexClient(device="cpu", multilabel=False, n_classes=2, db_dir=db_dir()) +def vector_index(): + db_dir, dump_dir, logs_dir = setup_environment() + return VectorIndexClient(device="cpu", multilabel=False, n_classes=2, db_dir=db_dir) -def test_vector_index_initialization(setup_environment): - db_dir, dump_dir, logs_dir = setup_environment - index = VectorIndexClient(device="cpu", db_dir=db_dir()) +def test_vector_index_initialization(): + db_dir, dump_dir, logs_dir = setup_environment() + index = VectorIndexClient(device="cpu", db_dir=db_dir) assert index.device == "cpu" diff --git a/tests/context/test_vector_index.py b/tests/context/test_vector_index.py index 70ca27a3..0505d9a1 100644 --- a/tests/context/test_vector_index.py +++ b/tests/context/test_vector_index.py @@ -1,6 +1,7 @@ import pytest from autointent.context.vector_index_client import VectorIndexClient +from tests.conftest import setup_environment @pytest.fixture @@ -12,15 +13,15 @@ class MockDataHandler: return MockDataHandler() -def test_vector_index_initialization(setup_environment): - db_dir, dump_dir, logs_dir = setup_environment - vector_index_client = VectorIndexClient("cpu", db_dir()) +def test_vector_index_initialization(): + db_dir, dump_dir, logs_dir = setup_environment() + vector_index_client = VectorIndexClient("cpu", db_dir) assert vector_index_client.device == "cpu" -def test_create_collection(data_handler, setup_environment): - db_dir, dump_dir, logs_dir = setup_environment - vector_index_client = VectorIndexClient("cpu", db_dir()) +def test_create_collection(data_handler): + db_dir, dump_dir, logs_dir = setup_environment() + vector_index_client = VectorIndexClient("cpu", db_dir) vector_index = vector_index_client.create_index( "bert-base-uncased", data_handler.utterances_train, data_handler.labels_train ) diff --git a/tests/modules/prediction/test_treshold.py b/tests/modules/prediction/test_treshold.py index f4052553..3c602aae 100644 --- a/tests/modules/prediction/test_treshold.py +++ b/tests/modules/prediction/test_treshold.py @@ -2,6 +2,7 @@ from autointent.context.data_handler import DataHandler from autointent.modules import KNNScorer, ThresholdPredictor +from tests.conftest import setup_environment def get_fit_data(db_dir, dataset): @@ -21,31 +22,31 @@ def get_fit_data(db_dir, dataset): return scores, labels -def test_predict_returns_correct_indices(setup_environment, dataset): - get_db_dir, dump_dir, logs_dir = setup_environment +def test_predict_returns_correct_indices(dataset): + db_dir, dump_dir, logs_dir = setup_environment() predictor = ThresholdPredictor(0.5) - predictor.fit(*get_fit_data(get_db_dir(), dataset)) + predictor.fit(*get_fit_data(db_dir, dataset)) scores = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]]) predictions = predictor.predict(scores) np.testing.assert_array_equal(predictions, np.array([1, 0, 1])) -def test_predict_returns_list(setup_environment, dataset): - get_db_dir, dump_dir, logs_dir = setup_environment +def test_predict_returns_list(dataset): + db_dir, dump_dir, logs_dir = setup_environment() predictor = ThresholdPredictor(np.array([0.5, 0.5, 0.5]), n_classes=3) - predictor.fit(*get_fit_data(get_db_dir(), dataset)) + predictor.fit(*get_fit_data(db_dir, dataset)) scores = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]]) predictions = predictor.predict(scores) 
np.testing.assert_array_equal(predictions, np.array([1, 0, 1])) -def test_predict_handles_single_class(setup_environment, dataset): - get_db_dir, dump_dir, logs_dir = setup_environment +def test_predict_handles_single_class(dataset): + db_dir, dump_dir, logs_dir = setup_environment() predictor = ThresholdPredictor(0.5) - predictor.fit(*get_fit_data(get_db_dir(), dataset)) + predictor.fit(*get_fit_data(db_dir, dataset)) scores = np.array([[0.5], [0.5], [0.5]]) predictions = predictor.predict(scores) np.testing.assert_array_equal(predictions, np.array([0, 0, 0])) diff --git a/tests/modules/scoring/test_description.py b/tests/modules/scoring/test_description.py index dbc4b099..645019b6 100644 --- a/tests/modules/scoring/test_description.py +++ b/tests/modules/scoring/test_description.py @@ -3,6 +3,7 @@ from autointent.context.data_handler import DataHandler from autointent.modules import DescriptionScorer +from tests.conftest import setup_environment @pytest.mark.parametrize( @@ -12,11 +13,11 @@ ([[0.2, 0.3, 0.2], [0.2, 0.3, 0.2]], False), ], ) -def test_description_scorer(setup_environment, dataset, expected_prediction, multilabel): - db_dir, dump_dir, logs_dir = setup_environment +def test_description_scorer(dataset, expected_prediction, multilabel): + db_dir, dump_dir, logs_dir = setup_environment() data_handler = DataHandler(dataset, force_multilabel=multilabel) - scorer = DescriptionScorer(model_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir(), temperature=0.3) + scorer = DescriptionScorer(model_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir, temperature=0.3) scorer.fit(data_handler.utterances_train, data_handler.labels_train, data_handler.label_description) assert scorer.description_vectors.shape[0] == len(data_handler.label_description) diff --git a/tests/modules/scoring/test_dnnc.py b/tests/modules/scoring/test_dnnc.py index d01d3bba..efadda20 100644 --- a/tests/modules/scoring/test_dnnc.py +++ b/tests/modules/scoring/test_dnnc.py @@ -3,12 +3,13 @@ from autointent.context.data_handler import DataHandler from autointent.modules import DNNCScorer +from tests.conftest import setup_environment @pytest.mark.xfail(reason="This test is failing on windows, because have different score") @pytest.mark.parametrize(("train_head", "pred_score"), [(True, 1), (False, 0.5)]) -def test_base_dnnc(setup_environment, dataset, train_head, pred_score): - db_dir, dump_dir, logs_dir = setup_environment +def test_base_dnnc(dataset, train_head, pred_score): + db_dir, dump_dir, logs_dir = setup_environment() data_handler = DataHandler(dataset) diff --git a/tests/modules/scoring/test_knn.py b/tests/modules/scoring/test_knn.py index 02797a3d..f2be73ab 100644 --- a/tests/modules/scoring/test_knn.py +++ b/tests/modules/scoring/test_knn.py @@ -2,14 +2,15 @@ from autointent.context.data_handler import DataHandler from autointent.modules import KNNScorer +from tests.conftest import setup_environment -def test_base_knn(setup_environment, dataset): - db_dir, dump_dir, logs_dir = setup_environment +def test_base_knn(dataset): + db_dir, dump_dir, logs_dir = setup_environment() data_handler = DataHandler(dataset) - scorer = KNNScorer(k=3, weights="distance", model_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir()) + scorer = KNNScorer(k=3, weights="distance", model_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir) scorer.fit(data_handler.utterances_train, data_handler.labels_train) predictions = scorer.predict( diff --git a/tests/modules/scoring/test_linear.py b/tests/modules/scoring/test_linear.py index 
8a7fb9be..9ac62182 100644 --- a/tests/modules/scoring/test_linear.py +++ b/tests/modules/scoring/test_linear.py @@ -2,10 +2,11 @@ from autointent.context.data_handler import DataHandler from autointent.modules import LinearScorer +from tests.conftest import setup_environment -def test_base_linear(setup_environment, dataset): - get_db_dir, dump_dir, logs_dir = setup_environment +def test_base_linear(dataset): + get_db_dir, dump_dir, logs_dir = setup_environment() data_handler = DataHandler(dataset) diff --git a/tests/modules/scoring/test_mlknn.py b/tests/modules/scoring/test_mlknn.py index e74de00a..c76318f1 100644 --- a/tests/modules/scoring/test_mlknn.py +++ b/tests/modules/scoring/test_mlknn.py @@ -2,10 +2,11 @@ from autointent.context.data_handler import DataHandler, Dataset from autointent.modules.scoring.mlknn.mlknn import MLKnnScorer +from tests.conftest import setup_environment -def test_base_mlknn(setup_environment, dataset): - db_dir, dump_dir, logs_dir = setup_environment +def test_base_mlknn(dataset): + db_dir, dump_dir, logs_dir = setup_environment() test_dataset = Dataset.model_validate( { @@ -23,7 +24,7 @@ def test_base_mlknn(setup_environment, dataset): ) data_handler = DataHandler(dataset, test_dataset, force_multilabel=True) - scorer = MLKnnScorer(db_dir=db_dir(), k=3, model_name="sergeyzh/rubert-tiny-turbo") + scorer = MLKnnScorer(db_dir=db_dir, k=3, model_name="sergeyzh/rubert-tiny-turbo") scorer.fit(data_handler.utterances_train, data_handler.labels_train) predictions = scorer.predict_labels( diff --git a/tests/modules/test_regex.py b/tests/modules/test_regex.py index 8138651c..e73501cb 100644 --- a/tests/modules/test_regex.py +++ b/tests/modules/test_regex.py @@ -4,11 +4,12 @@ from autointent.context.data_handler import Dataset from autointent.metrics import retrieval_hit_rate, scoring_roc_auc from autointent.modules import RegExp, VectorDBModule +from tests.conftest import setup_environment @pytest.mark.xfail(reason="Issues with intent_id") -def test_base_regex(setup_environment): - db_dir, dump_dir, logs_dir = setup_environment +def test_base_regex(): + db_dir, dump_dir, logs_dir = setup_environment() data = { "utterances": [ @@ -72,7 +73,7 @@ def test_base_regex(setup_environment): context = Context( dataset=Dataset.model_validate(data), dump_dir=dump_dir, - db_dir=db_dir(), + db_dir=db_dir, ) retrieval_params = {"k": 3, "model_name": "sergeyzh/rubert-tiny-turbo"} diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index cdcbdd4b..62b75c2e 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -1,7 +1,9 @@ import pytest from autointent import Context +from autointent.configs.optimization_cli import DataConfig, LoggingConfig, VectorIndexConfig from autointent.nodes.optimization import NodeOptimizer +from tests.conftest import setup_environment @pytest.fixture @@ -68,10 +70,14 @@ def scoring_optimizer_multilabel(context, retrieval_optimizer_multilabel): @pytest.fixture -def context(setup_environment, dataset): - db_dir, dump_dir, logs_dir = setup_environment +def context(dataset_path): + db_dir, dump_dir, logs_dir = setup_environment() def _context(multilabel: bool): - return Context(dataset=dataset, db_dir=db_dir(), dump_dir=dump_dir, force_multilabel=multilabel) + res = Context() + res.config_data(DataConfig(dataset_path, force_multilabel=multilabel)) + res.config_logs(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir)) + res.config_vector_index(VectorIndexConfig(db_dir=db_dir)) + return res return _context diff --git 
a/tests/pipeline/test_optimization.py b/tests/pipeline/test_optimization.py index 6824a5a2..5e0908a5 100644 --- a/tests/pipeline/test_optimization.py +++ b/tests/pipeline/test_optimization.py @@ -3,7 +3,6 @@ import pytest -from autointent import Context from autointent.configs.optimization_cli import ( DataConfig, LoggingConfig, @@ -11,9 +10,9 @@ TaskConfig, VectorIndexConfig, ) -from autointent.pipeline import PipelineOptimizer from autointent.pipeline.optimization.cli_endpoint import main as optimize_pipeline from autointent.pipeline.optimization.utils import load_config +from tests.conftest import setup_environment ConfigType = Literal["multiclass", "multilabel"] @@ -27,30 +26,12 @@ def _get_config(config_type: ConfigType): return _get_config -@pytest.mark.parametrize( - "config_type", - ["multiclass", "multilabel"], -) -def test_full_pipeline(setup_environment, get_config, dataset, config_type: ConfigType): - db_dir, dump_dir, logs_dir = setup_environment - - context = Context(dataset=dataset, db_dir=db_dir(), dump_dir=dump_dir, force_multilabel=config_type == "multilabel") - - # run optimization - search_space_config = get_config(config_type) - pipeline = PipelineOptimizer.from_dict_config(search_space_config) - pipeline.optimize(context) - - # save results - pipeline.dump(logs_dir=logs_dir) - - @pytest.mark.parametrize( "dataset_type", ["multiclass", "multilabel", "description"], ) -def test_optimization_pipeline_cli(dataset_type, setup_environment): - db_dir, dump_dir, logs_dir = setup_environment +def test_optimization_pipeline_cli(dataset_type): + db_dir, dump_dir, logs_dir = setup_environment() config = OptimizationConfig( data=DataConfig( train_path=ires.files("tests.assets.data").joinpath("clinc_subset.json"), From a118e27a868c1adcfe0d1dd1edb04e2b16b6fc2d Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 13:04:39 +0300 Subject: [PATCH 04/21] fix typing --- autointent/context/context.py | 9 ++++++++- autointent/modules/prediction/base.py | 4 ++-- autointent/pipeline/inference/cli_endpoint.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/autointent/context/context.py b/autointent/context/context.py index 23186518..1164bf32 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -91,6 +91,9 @@ def dump(self) -> None: optimization_results = self.optimization_info.dump_evaluation_results() logs_dir = self.logging_config.dirpath + if logs_dir is None: + msg = "something's wrong with LoggingConfig" + raise ValueError(msg) # create appropriate directory logs_dir.mkdir(parents=True, exist_ok=True) @@ -135,7 +138,11 @@ def get_max_length(self) -> int | None: return self.vector_index_client.embedder_max_length def get_dump_dir(self) -> Path: - return self.logging_config.dump_dir + res = self.logging_config.dump_dir + if res is None: + msg = "something's wrong with LoggingConfig" + raise ValueError(msg) + return res def is_multilabel(self) -> bool: return self.data_handler.multilabel diff --git a/autointent/modules/prediction/base.py b/autointent/modules/prediction/base.py index 17b8de50..6681c0fe 100644 --- a/autointent/modules/prediction/base.py +++ b/autointent/modules/prediction/base.py @@ -52,8 +52,8 @@ def get_prediction_evaluation_data( return_scores = scores if oos_scores is not None: oos_labels = ( - [[0] * context.get_n_classes()] * len(oos_scores) if context.is_multilabel() else [-1] * len(oos_scores) - ) # type: ignore[list-item] + [[0] * context.get_n_classes()] * len(oos_scores) if context.is_multilabel() else [-1] * 
len(oos_scores) # type: ignore[list-item] + ) labels = np.concatenate([labels, np.array(oos_labels)]) return_scores = np.concatenate([scores, oos_scores]) diff --git a/autointent/pipeline/inference/cli_endpoint.py b/autointent/pipeline/inference/cli_endpoint.py index 257d5f98..690fe8a5 100644 --- a/autointent/pipeline/inference/cli_endpoint.py +++ b/autointent/pipeline/inference/cli_endpoint.py @@ -7,7 +7,7 @@ import yaml from autointent.configs.inference_cli import InferenceConfig -from autointent.pipeline.optimization.utils import NumpyEncoder +from autointent.context.utils import NumpyEncoder from .inference_pipeline import InferencePipeline From 0e3fe2a7e4c96228a6f64968ed2da1f3cb0e0e22 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 14:17:19 +0300 Subject: [PATCH 05/21] add `Context.set_datasets` and allow not dumping modules --- autointent/configs/optimization_cli.py | 1 + autointent/context/context.py | 15 +++++----- .../context/optimization_info/data_models.py | 2 +- .../nodes/optimization/node_optimizer.py | 30 +++++++++---------- 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/autointent/configs/optimization_cli.py b/autointent/configs/optimization_cli.py index 576ba64f..ca1d9471 100644 --- a/autointent/configs/optimization_cli.py +++ b/autointent/configs/optimization_cli.py @@ -28,6 +28,7 @@ class LoggingConfig: run_name: str | None = None dirpath: Path | None = None dump_dir: Path | None = None + dump_modules: bool = True def __post_init__(self) -> None: self.define_run_name() diff --git a/autointent/context/context.py b/autointent/context/context.py index 1164bf32..652751d1 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -14,7 +14,7 @@ VectorIndexConfig, ) -from .data_handler import DataAugmenter, DataHandler +from .data_handler import DataAugmenter, DataHandler, Dataset from .optimization_info import OptimizationInfo from .utils import NumpyEncoder, load_data from .vector_index_client import VectorIndex, VectorIndexClient @@ -68,6 +68,9 @@ def config_data(self, config: DataConfig, augmentation_config: AugmentationConfi augmenter=augmenter, ) + def set_datasets(self, train_data: Dataset, val_data: Dataset | None = None) -> None: + self.data_handler = DataHandler(dataset=train_data, test_dataset=val_data, random_seed=self.seed) + def get_best_index(self) -> VectorIndex: model_name = self.optimization_info.get_best_embedder() return self.vector_index_client.get_index(model_name) @@ -137,12 +140,10 @@ def get_batch_size(self) -> int: def get_max_length(self) -> int | None: return self.vector_index_client.embedder_max_length - def get_dump_dir(self) -> Path: - res = self.logging_config.dump_dir - if res is None: - msg = "something's wrong with LoggingConfig" - raise ValueError(msg) - return res + def get_dump_dir(self) -> Path | None: + if self.logging_config.dump_modules: + return self.logging_config.dump_dir + return None def is_multilabel(self) -> bool: return self.data_handler.multilabel diff --git a/autointent/context/optimization_info/data_models.py b/autointent/context/optimization_info/data_models.py index c74dc089..8a0e0248 100644 --- a/autointent/context/optimization_info/data_models.py +++ b/autointent/context/optimization_info/data_models.py @@ -77,7 +77,7 @@ class Trial(BaseModel): module_params: dict[str, Any] metric_name: str metric_value: float - module_dump_dir: str + module_dump_dir: str | None class Trials(BaseModel): diff --git a/autointent/nodes/optimization/node_optimizer.py 
b/autointent/nodes/optimization/node_optimizer.py
index 77c94652..26bd2db8 100644
--- a/autointent/nodes/optimization/node_optimizer.py
+++ b/autointent/nodes/optimization/node_optimizer.py
@@ -10,7 +10,9 @@
 from typing_extensions import Self
 
 from autointent.configs.node import NodeOptimizerConfig
+from autointent.configs.optimization_cli import LoggingConfig
 from autointent.context import Context
+from autointent.context.data_handler import Dataset
 from autointent.modules import Module
 from autointent.modules.prediction.base import get_prediction_evaluation_data
 from autointent.nodes.nodes_info import NODES_INFO
@@ -46,8 +48,12 @@ def fit(self, context: Context) -> None:
             metric_value = module.score(context, self.node_info.metrics_available[self.metric_name])
 
             assets = module.get_assets()
-            module_dump_dir = self.get_module_dump_dir(context.get_dump_dir(), module_type, j_combination)
-            module.dump(module_dump_dir)
+
+            dump_dir = context.get_dump_dir()
+
+            if dump_dir is not None:
+                dump_dir = self.get_module_dump_dir(dump_dir, module_type, j_combination)
+                module.dump(dump_dir)
 
             context.optimization_info.log_module_optimization(
                 self.node_info.node_type,
@@ -56,7 +62,7 @@ def fit(self, context: Context) -> None:
                 metric_value,
                 self.metric_name,
                 assets,  # retriever name / scores / predictions
-                module_dump_dir,
+                dump_dir,
             )
             module.clear_cache()
@@ -89,15 +95,9 @@ def module_fit(self, module: Module, context: Context) -> None:
             raise ValueError(msg)
         module.fit(*args)  # type: ignore[arg-type]
 
-    # @overload
-    # def fit(
-    #     self,
-    #     utterances: list[str],
-    #     labels: list[LabelType],
-    #     tags: list[Tag] | None = None,
-    #     label_descriptions: list[str] | None = None,
-    # ) -> None:
-    #     # create context object from given data
-    #     ...
-
-    #     # call fit(context)
+    def fit_from_dataset(self, train_data: Dataset, val_data: Dataset | None = None) -> None:
+        context = Context()
+        context.set_datasets(train_data, val_data)
+        context.config_logs(LoggingConfig(dump_dir=None))
+
+        self.fit(context)

From c4e15d905216a010038ebef5ea31c97429c6ed6 Mon Sep 17 00:00:00 2001
From: voorhs
Date: Tue, 5 Nov 2024 17:35:46 +0300
Subject: [PATCH 06/21] implement `PipelineOptimizer.optimize_from_dataset`

---
 .../nodes/optimization/node_optimizer.py      |    9 -
 .../optimization/pipeline_optimizer.py        |   12 +
 .../data/test_data.json                       |  366 ++++++
 .../data/train_data.json                      | 1055 +++++++++++++++++
 .../python-node-optimization/testbed.ipynb    |  109 ++
 5 files changed, 1542 insertions(+), 9 deletions(-)
 create mode 100644 experiments/python-node-optimization/data/test_data.json
 create mode 100644 experiments/python-node-optimization/data/train_data.json
 create mode 100644 experiments/python-node-optimization/testbed.ipynb

diff --git a/autointent/nodes/optimization/node_optimizer.py b/autointent/nodes/optimization/node_optimizer.py
index 26bd2db8..f542fac1 100644
--- a/autointent/nodes/optimization/node_optimizer.py
+++ b/autointent/nodes/optimization/node_optimizer.py
@@ -10,9 +10,7 @@
 from typing_extensions import Self
 
 from autointent.configs.node import NodeOptimizerConfig
-from autointent.configs.optimization_cli import LoggingConfig
 from autointent.context import Context
-from autointent.context.data_handler import Dataset
 from autointent.modules import Module
 from autointent.modules.prediction.base import get_prediction_evaluation_data
 from autointent.nodes.nodes_info import NODES_INFO
@@ -94,10 +92,3 @@ def module_fit(self, module: Module, context: Context) -> None:
             self._logger.error(msg)
             raise ValueError(msg)
         module.fit(*args)  # type: 
ignore[arg-type] - - def fit_from_dataset(self, train_data: Dataset, val_data: Dataset | None = None) -> None: - context = Context() - context.set_datasets(train_data, val_data) - context.config_logs(LoggingConfig(dump_dir=None)) - - self.fit(context) diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index 5125b8c5..54d9aad1 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -6,7 +6,9 @@ from hydra.utils import instantiate from autointent import Context +from autointent.configs.optimization_cli import EmbedderConfig, LoggingConfig, VectorIndexConfig from autointent.configs.pipeline_optimizer import PipelineOptimizerConfig +from autointent.context.data_handler import Dataset from autointent.nodes import NodeOptimizer @@ -25,6 +27,16 @@ def optimize(self, context: Context) -> None: for node_optimizer in self.nodes: node_optimizer.fit(context) + def optimize_from_dataset(self, train_data: Dataset, val_data: Dataset | None = None) -> Context: + context = Context() + context.set_datasets(train_data, val_data) + context.config_logs(LoggingConfig(dump_dir=None)) + context.config_vector_index(VectorIndexConfig(), EmbedderConfig()) + + self.optimize(context) + self.inference_config = context.optimization_info.get_inference_nodes_config() + return context + def make_report(logs: dict[str, Any], nodes: list[str]) -> str: ids = [np.argmax(logs["metrics"][node]) for node in nodes] diff --git a/experiments/python-node-optimization/data/test_data.json b/experiments/python-node-optimization/data/test_data.json new file mode 100644 index 00000000..615da45c --- /dev/null +++ b/experiments/python-node-optimization/data/test_data.json @@ -0,0 +1,366 @@ +{ + "utterances": [ + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "can you give me a moderately priced restaurant", + "label": [ + 6 + ] + }, + { + "text": "what area is it in", + "label": [ + 10 + ] + }, + { + "text": "thank you and good bye", + "label": [ + 2 + ] + }, + { + "text": "yes im looking for a traditional restaurant in the expensive price range", + "label": [ + 1, + 6 + ] + }, + { + "text": "im trying to find a vegetarian restaurant and i dont care regarding the price range", + "label": [ + 6 + ] + }, + { + "text": "chesterton", + "label": [ + 6 + ] + }, + { + "text": "does it have a television", + "label": [ + 10 + ] + }, + { + "text": "and what is the address and phone number", + "label": [ + 10 + ] + }, + { + "text": "thank you goodbye", + "label": [ + 2 + ] + }, + { + "text": "number", + "label": [ + 10 + ] + }, + { + "text": "im looking for a pub with and internet connection", + "label": [ + 6 + ] + }, + { + "text": "price", + "label": [ + 10 + ] + }, + { + "text": "no no", + "label": [ + 7 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "end of system audio no im looking for a seafood restaurant", + "label": [ + 6, + 7 + ] + }, + { + "text": "ok what is the address phone number and price", + "label": [ + 0, + 10 + ] + }, + { + "text": "yeah lets have you got anything in the mediterranean food in the area", + "label": [ + 1, + 6 + ] + }, + { + "text": "what about any other area", + "label": [ + 9 + ] + }, + { + "text": "whats the uh thank you and goodbye", + "label": [ + 2, + 12 + ] + }, + { + "text": "next type of food cherry hinton area", + "label": [ + 6, + 9 + ] + }, + { + 
"text": "yes", + "label": [ + 1 + ] + }, + { + "text": "can you select me another venue", + "label": [ + 9 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "ok thank you goodbye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "noise ah hi i am looking for an", + "label": [ + 5 + ] + }, + { + "text": "okay and uh", + "label": [ + 0 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "hi im looking for a pub having internet connection and have a tv", + "label": [ + 5, + 6 + ] + }, + { + "text": "ok thank you", + "label": [ + 0, + 12 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "what about mediterranean", + "label": [ + 6, + 9 + ] + }, + { + "text": "hi im looking for a mediterranean restaurant in the rosemary area", + "label": [ + 5, + 6 + ] + }, + { + "text": "alright thank you good bye", + "label": [ + 2, + 12 + ] + }, + { + "text": "ok i need the phone number and the area", + "label": [ + 0, + 10 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "is it in the kings hedge area", + "label": [ + 3 + ] + }, + { + "text": "what about contemporary restaurant", + "label": [ + 6, + 9 + ] + }, + { + "text": "hi im looking for a contemporary restaurant and is it should be free", + "label": [ + 3 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "is it in the city center jesus christ this is ridiculous", + "label": [ + 3 + ] + }, + { + "text": "is it burger gourmet", + "label": [ + 3 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "um is it free", + "label": [ + 3 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "hello i am looking for cheap restaurant in addenbrookes area", + "label": [ + 5, + 6 + ] + }, + { + "text": "can we start again", + "label": [ + 11 + ] + }, + { + "text": "hi im looking for cafe", + "label": [ + 5, + 6 + ] + }, + { + "text": "not spanish food not spanish food fast", + "label": [ + 4 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "say again", + "label": [ + 8 + ] + }, + { + "text": "start over", + "label": [ + 11 + ] + }, + { + "text": "uh cheap or moderate", + "label": [ + 4 + ] + }, + { + "text": "id like to start over please", + "label": [ + 11 + ] + }, + { + "text": "i need a pub not bakers", + "label": [ + 4, + 6 + ] + } + ] +} \ No newline at end of file diff --git a/experiments/python-node-optimization/data/train_data.json b/experiments/python-node-optimization/data/train_data.json new file mode 100644 index 00000000..2c9ac4cc --- /dev/null +++ b/experiments/python-node-optimization/data/train_data.json @@ -0,0 +1,1055 @@ +{ + "utterances": [ + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "vegetarian", + "label": [ + 6 + ] + }, + { + "text": "what is the eagles address", + "label": [ + 6, + 10 + ] + }, + { + "text": "telephone", + "label": [ + 10 + ] + }, + { + "text": "post code", + "label": [ + 10 + ] + }, + { + "text": "i dont care", + "label": [ + 6 + ] + }, + { + "text": "hi im looking for a pub", + "label": [ + 5, + 6 + ] + }, + { + "text": "what is the address phone number and postcode", + "label": [ + 10 + ] + }, + { + "text": "may i have the address and prices", + "label": [ + 10 + ] + }, + { + "text": "price", + "label": [ + 10 + ] + }, + { + "text": "restaurant", + "label": [ + 6 + ] + }, + { + "text": "and whats the post code", + "label": [ + 10 + ] + }, + { + "text": "i 
want to find a restaurant in kings hedges", + "label": [ + 6 + ] + }, + { + "text": "i would like japanese food", + "label": [ + 6 + ] + }, + { + "text": "yes i would like to know about a restaurant", + "label": [ + 1, + 6 + ] + }, + { + "text": "any price range", + "label": [ + 6 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "a restaurant in kings hedges", + "label": [ + 6 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "uh what are some other eareas", + "label": [ + 9 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "i dont care", + "label": [ + 6 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "castle hill area", + "label": [ + 6 + ] + }, + { + "text": "moderate", + "label": [ + 6 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "ya any part of the avenue is fine", + "label": [ + 6 + ] + }, + { + "text": "im looking for a japanese restaurant", + "label": [ + 6 + ] + }, + { + "text": "and what is the price of the venue", + "label": [ + 10 + ] + }, + { + "text": "what is the type of food", + "label": [ + 10 + ] + }, + { + "text": "ok what does it have a television", + "label": [ + 10 + ] + }, + { + "text": "whats the phone number", + "label": [ + 10 + ] + }, + { + "text": "uhm address", + "label": [ + 10 + ] + }, + { + "text": "does it have internet connection", + "label": [ + 10 + ] + }, + { + "text": "does it have a television", + "label": [ + 10 + ] + }, + { + "text": "and the phone number", + "label": [ + 10 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "yes please", + "label": [ + 1 + ] + }, + { + "text": "no particular venue", + "label": [ + 6, + 7 + ] + }, + { + "text": "goodbye", + "label": [ + 2 + ] + }, + { + "text": "yeah hi uh i want to find an internet connection and must have a tv", + "label": [ + 1, + 6 + ] + }, + { + "text": "breathing ok thank you goodbye", + "label": [ + 2, + 12 + ] + }, + { + "text": "thank you goodbye", + "label": [ + 2 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "next choice", + "label": [ + 9 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "thank you good bye", + "label": [ + 2 + ] + }, + { + "text": "thank you goodbye", + "label": [ + 2 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "how about indian", + "label": [ + 6, + 9 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "yes im looking for a moderately priced restaurant and it should be in the addenbrookes area", + "label": [ + 1, + 6 + ] + }, + { + "text": "no any part cheap food", + "label": [ + 6, + 7 + ] + }, + { + "text": "okay thank you goodbye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "thank you", + "label": [ + 12 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "no thank you goodbye", + "label": [ + 2, + 7, + 12 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "ah hi ah i am looking for a thia restaurant", + "label": [ + 5, + 6 + ] + }, + { + "text": "thank you", + "label": [ + 12 + ] + }, + { + "text": "yes i am", + "label": [ + 1 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "okay what is the", 
+ "label": [ + 0 + ] + }, + { + "text": "ok thank you what is the phone number and post code", + "label": [ + 0, + 10, + 12 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "ok unintelligible", + "label": [ + 0 + ] + }, + { + "text": "ok can i get an address phone number and post code please", + "label": [ + 0, + 10 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "ok and a", + "label": [ + 0 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "yes i am looking for a restaurant", + "label": [ + 1, + 6 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "no i want the postcode", + "label": [ + 7, + 10 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "ok whats the address phone number and post code", + "label": [ + 0, + 10 + ] + }, + { + "text": "breathing thank you good bye", + "label": [ + 2, + 12 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "yes i would like to know about a restaurant", + "label": [ + 1, + 6 + ] + }, + { + "text": "yes", + "label": [ + 1 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "what else do you have", + "label": [ + 9 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "ok thank you goodbye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "is it in the cheap price range", + "label": [ + 3 + ] + }, + { + "text": "no", + "label": [ + 7 + ] + }, + { + "text": "ok can you get me the phone number and the what type of food unintelligible", + "label": [ + 0, + 10 + ] + }, + { + "text": "any other pubs", + "label": [ + 6, + 9 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "clicking thank you good bye", + "label": [ + 2, + 12 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "ok and whats the phone number", + "label": [ + 0, + 10 + ] + }, + { + "text": "hi i am looking for an ex", + "label": [ + 5 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "ok thank you good bye", + "label": [ + 0, + 2, + 12 + ] + }, + { + "text": "is it located in addenbrookes area", + "label": [ + 3 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "is it a chinese takeaway restaurant", + "label": [ + 3 + ] + }, + { + "text": "say again", + "label": [ + 8 + ] + }, + { + "text": "is it cheap price range", + "label": [ + 3 + ] + }, + { + "text": "ya hi ah i want to find a fusion restaurant", + "label": [ + 5, + 6 + ] + }, + { + "text": "does the bakers have a television", + "label": [ + 3, + 10 + ] + }, + { + "text": "are there any other pubs", + "label": [ + 6, + 9 + ] + }, + { + "text": "ah does it have a moderate price range", + "label": [ + 3 + ] + }, + { + "text": "pick a different area", + "label": [ + 9 + ] + }, + { + "text": "breathing do you have anything in newnham area", + "label": [ + 6, + 9 + ] + }, + { + "text": "is it located in the castlehill area", + "label": [ + 3 + ] + }, + { + "text": "start over", + "label": [ + 11 + ] + }, + { + "text": "is it a thia restaurant", + "label": [ + 3 + ] + }, + { + "text": "are there any other options", + "label": [ + 9 + ] + }, + { + "text": "is it in the addenbrookes area", + "label": [ + 3 + ] + }, + { + "text": "how about castle hill", + "label": [ + 6, + 9 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + 
"text": "hi im looking for a fusion restaurant", + "label": [ + 5, + 6 + ] + }, + { + "text": "is it in the city center", + "label": [ + 3 + ] + }, + { + "text": "no is it in fenditton", + "label": [ + 3, + 7 + ] + }, + { + "text": "does it serve vegetarian food", + "label": [ + 3 + ] + }, + { + "text": "ah what about asian food", + "label": [ + 6, + 9 + ] + }, + { + "text": "is it in the cheap price range", + "label": [ + 3 + ] + }, + { + "text": "what else do you have", + "label": [ + 9 + ] + }, + { + "text": "do you have any others", + "label": [ + 9 + ] + }, + { + "text": "is it in the trumington area", + "label": [ + 3 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "noise is it in the moderate price range", + "label": [ + 3 + ] + }, + { + "text": "hi im looking for a fusion restaurant", + "label": [ + 5, + 6 + ] + }, + { + "text": "how about the fenditton area", + "label": [ + 6, + 9 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "hi im looking for a cheap restaurant in the fenditton area", + "label": [ + 5, + 6 + ] + }, + { + "text": "hi im looking for a restaurant in fenditton", + "label": [ + 5, + 6 + ] + }, + { + "text": "hi im looking for a cheap restaurant in the girton area", + "label": [ + 5, + 6 + ] + }, + { + "text": "hello", + "label": [ + 5 + ] + }, + { + "text": "any kind of food what about any kind of food", + "label": [ + 6, + 9 + ] + }, + { + "text": "hi im looking for an english restaurant", + "label": [ + 5, + 6 + ] + }, + { + "text": "hello", + "label": [ + 5 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "could you repeat", + "label": [ + 8 + ] + }, + { + "text": "hi im looking for a restaurant in fen ditton", + "label": [ + 5, + 6 + ] + }, + { + "text": "hi im looking for a fusion restaurant", + "label": [ + 5, + 6 + ] + }, + { + "text": "hi im looking for a restaurnt in the barnwell area", + "label": [ + 5, + 6 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "im sorry could you repeat that", + "label": [ + 8 + ] + }, + { + "text": "could you repeat that please", + "label": [ + 8 + ] + }, + { + "text": "start over", + "label": [ + 11 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "search again", + "label": [ + 8 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "repeat", + "label": [ + 8 + ] + }, + { + "text": "i didnt ask for a moderate price range i need a pub with internet and tv", + "label": [ + 4, + 6 + ] + }, + { + "text": "may i restart", + "label": [ + 11 + ] + }, + { + "text": "start over", + "label": [ + 11 + ] + }, + { + "text": "lets start over please", + "label": [ + 11 + ] + }, + { + "text": "not so expensive price range", + "label": [ + 4 + ] + }, + { + "text": "less expensive", + "label": [ + 4 + ] + }, + { + "text": "um cheap cheap venue not indian", + "label": [ + 4, + 6 + ] + }, + { + "text": "can i start over", + "label": [ + 11 + ] + }, + { + "text": "lets start over", + "label": [ + 11 + ] + }, + { + "text": "lets not go to japan", + "label": [ + 4 + ] + }, + { + "text": "start over", + "label": [ + 11 + ] + }, + { + "text": "less expensive", + "label": [ + 4 + ] + }, + { + "text": "not free moderate", + "label": [ + 4, + 6 + ] + } + ] +} \ No newline at end of file diff --git a/experiments/python-node-optimization/testbed.ipynb b/experiments/python-node-optimization/testbed.ipynb new file mode 100644 index 00000000..b445de59 --- /dev/null +++ 
b/experiments/python-node-optimization/testbed.ipynb @@ -0,0 +1,109 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(165, 57)" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from autointent.context.data_handler import Dataset\n", + "from autointent.context.utils import load_data\n", + "\n", + "scoring_dataset = load_data(\"./data/train_data.json\")\n", + "prediction_dataset = load_data(\"./data/test_data.json\")\n", + "len(scoring_dataset.utterances), len(prediction_dataset.utterances)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from autointent.pipeline.optimization import PipelineOptimizer\n", + "\n", + "config = {\n", + " \"nodes\": [\n", + " {\n", + " \"node_type\": \"scoring\",\n", + " \"metric\": \"scoring_roc_auc\",\n", + " \"search_space\": [\n", + " {\"module_type\": \"knn\", \"k\": [5, 10], \"weights\": [\"uniform\", \"distance\", \"closest\"], \"model_name\": [\"avsolatorio/GIST-small-Embedding-v0\"]},\n", + " {\"module_type\": \"linear\", \"model_name\": [\"avsolatorio/GIST-small-Embedding-v0\"]},\n", + " # {\n", + " # \"module_type\": \"dnnc\",\n", + " # \"cross_encoder_name\": [\"cross-encoder/ms-marco-MiniLM-L-6-v2\", \"avsolatorio/GIST-small-Embedding-v0\"],\n", + " # \"search_model_name\": [\"avsolatorio/GIST-small-Embedding-v0\"],\n", + " # \"k\": [1, 3],\n", + " # \"train_head\": [False, True],\n", + " # },\n", + " ],\n", + " },\n", + " {\n", + " \"node_type\": \"prediction\",\n", + " \"metric\": \"prediction_accuracy\",\n", + " \"search_space\": [\n", + " {\"module_type\": \"threshold\", \"thresh\": [0.5]},\n", + " {\"module_type\": \"tunable\"},\n", + " # {\"module_type\": \"argmax\"},\n", + " # {\"module_type\": \"jinoos\"},\n", + " ],\n", + " },\n", + " ]\n", + "}\n", + "\n", + "pipeline_optimizer = PipelineOptimizer.from_dict_config(config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context = pipeline_optimizer.optimize_from_dataset(scoring_dataset, prediction_dataset) # data with partitions: train_1, train_2, val_1, val_2, test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context.dump()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "autointent-D7M6VOhJ-py3.12", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 109d47e784b89182139f3ff685078e4b8d32d487 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 18:11:42 +0300 Subject: [PATCH 07/21] enable configuration for python api --- .../optimization/pipeline_optimizer.py | 27 ++++++- .../python-node-optimization/testbed.ipynb | 70 ++++++++++++++++++- 2 files changed, 91 insertions(+), 6 deletions(-) diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index 54d9aad1..f3fb7726 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -13,10 +13,31 @@ class PipelineOptimizer: - def 
__init__(self, nodes: list[NodeOptimizer]) -> None: + def __init__( + self, + nodes: list[NodeOptimizer], + ) -> None: self._logger = logging.getLogger(__name__) self.nodes = nodes + self.logging_config = LoggingConfig(dump_dir=None) + self.vector_index_config = VectorIndexConfig() + self.embedder_config = EmbedderConfig() + + def set_config( + self, + config: LoggingConfig | VectorIndexConfig | EmbedderConfig + ) -> None: + if isinstance(config, LoggingConfig): + self.logging_config = config + elif isinstance(config, VectorIndexConfig): + self.vector_index_config = config + elif isinstance(config, EmbedderConfig): + self.embedder_config = config + else: + msg = "unknown config type" + raise TypeError(msg) + @classmethod def from_dict_config(cls, config: dict[str, Any]) -> "PipelineOptimizer": return instantiate(PipelineOptimizerConfig, **config) # type: ignore[no-any-return] @@ -30,8 +51,8 @@ def optimize(self, context: Context) -> None: def optimize_from_dataset(self, train_data: Dataset, val_data: Dataset | None = None) -> Context: context = Context() context.set_datasets(train_data, val_data) - context.config_logs(LoggingConfig(dump_dir=None)) - context.config_vector_index(VectorIndexConfig(), EmbedderConfig()) + context.config_logs(self.logging_config) + context.config_vector_index(self.vector_index_config, self.embedder_config) self.optimize(context) self.inference_config = context.optimization_info.get_inference_nodes_config() diff --git a/experiments/python-node-optimization/testbed.ipynb b/experiments/python-node-optimization/testbed.ipynb index b445de59..12962095 100644 --- a/experiments/python-node-optimization/testbed.ipynb +++ b/experiments/python-node-optimization/testbed.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Simpler Pipeline Optimization Demo" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load datasets" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -25,9 +39,16 @@ "len(scoring_dataset.utterances), len(prediction_dataset.utterances)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define Search Space" + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -66,18 +87,61 @@ "pipeline_optimizer = PipelineOptimizer.from_dict_config(config)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Optional] Configure Your Run" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], + "source": [ + "from autointent.configs.optimization_cli import LoggingConfig, VectorIndexConfig, EmbedderConfig\n", + "from pathlib import Path\n", + "\n", + "pipeline_optimizer.set_config(LoggingConfig(run_name=\"sweet_cucumber\", dirpath=Path(\".\").resolve(), dump_modules=False))\n", + "pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(\"./my_vector_db\").resolve(), device=\"cuda\"))\n", + "pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[I 2024-11-05 18:08:17,123] A new study created in memory with name: no-name-5066322d-4fcd-4a17-8699-c3670e71e698\n" + ] + } + ], "source": [ "context = 
pipeline_optimizer.optimize_from_dataset(scoring_dataset, prediction_dataset) # data with partitions: train_1, train_2, val_1, val_2, test" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Logs" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ From 2b0d371e615a85e350ab24c0353fc8457de6e2e3 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 18:14:41 +0300 Subject: [PATCH 08/21] fix typing --- autointent/configs/node.py | 2 +- autointent/context/optimization_info/optimization_info.py | 2 +- autointent/nodes/optimization/node_optimizer.py | 8 +++++--- autointent/pipeline/optimization/pipeline_optimizer.py | 5 +---- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/autointent/configs/node.py b/autointent/configs/node.py index f3c41b99..e811fea6 100644 --- a/autointent/configs/node.py +++ b/autointent/configs/node.py @@ -9,7 +9,7 @@ class InferenceNodeConfig: node_type: str = MISSING module_type: str = MISSING module_config: dict[str, Any] = MISSING - load_path: str = MISSING + load_path: str | None = None _target_: str = "autointent.nodes.InferenceNode" diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index 3cfa2a72..7bee5ea7 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -29,7 +29,7 @@ def log_module_optimization( metric_value: float, metric_name: str, artifact: Artifact, - module_dump_dir: str, + module_dump_dir: str | None, ) -> None: """ Purposes: diff --git a/autointent/nodes/optimization/node_optimizer.py b/autointent/nodes/optimization/node_optimizer.py index f542fac1..897cbf6d 100644 --- a/autointent/nodes/optimization/node_optimizer.py +++ b/autointent/nodes/optimization/node_optimizer.py @@ -50,8 +50,10 @@ def fit(self, context: Context) -> None: dump_dir = context.get_dump_dir() if dump_dir is not None: - dump_dir = self.get_module_dump_dir(dump_dir, module_type, j_combination) - module.dump(dump_dir) + module_dump_dir = self.get_module_dump_dir(dump_dir, module_type, j_combination) + module.dump(module_dump_dir) + else: + module_dump_dir = None context.optimization_info.log_module_optimization( self.node_info.node_type, @@ -60,7 +62,7 @@ def fit(self, context: Context) -> None: metric_value, self.metric_name, assets, # retriever name / scores / predictions - dump_dir, + module_dump_dir, ) module.clear_cache() diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index f3fb7726..d0a521c1 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -24,10 +24,7 @@ def __init__( self.vector_index_config = VectorIndexConfig() self.embedder_config = EmbedderConfig() - def set_config( - self, - config: LoggingConfig | VectorIndexConfig | EmbedderConfig - ) -> None: + def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig) -> None: if isinstance(config, LoggingConfig): self.logging_config = config elif isinstance(config, VectorIndexConfig): From d7c4066760440d5420aacd6e1d935eeed426b05c Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 5 Nov 2024 18:45:12 +0300 Subject: [PATCH 09/21] fix tests --- autointent/context/context.py | 8 +++- .../optimization/pipeline_optimizer.py | 6 ++- .../python-node-optimization/testbed.ipynb | 2 +- 
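To make the Python-level API introduced in patches 07 to 09 concrete, here is a minimal usage sketch assembled from the diffs and the demo notebook in this series. The search space is trimmed to two nodes, and the dataset paths are illustrative placeholders rather than part of the codebase:

```python
from pathlib import Path

from autointent.configs.optimization_cli import EmbedderConfig, LoggingConfig, VectorIndexConfig
from autointent.context.utils import load_data
from autointent.pipeline.optimization import PipelineOptimizer

# Search space in the same shape as the notebook's `config` dict (trimmed here).
search_space = {
    "nodes": [
        {
            "node_type": "scoring",
            "metric": "scoring_roc_auc",
            "search_space": [
                {"module_type": "linear", "model_name": ["avsolatorio/GIST-small-Embedding-v0"]},
            ],
        },
        {
            "node_type": "prediction",
            "metric": "prediction_accuracy",
            "search_space": [{"module_type": "threshold", "thresh": [0.5]}],
        },
    ]
}

optimizer = PipelineOptimizer.from_dict_config(search_space)

# Each set_config call dispatches on the config object's type (see patch 07).
optimizer.set_config(LoggingConfig(dirpath=Path.cwd(), dump_modules=False))
optimizer.set_config(VectorIndexConfig(db_dir=Path("./my_vector_db").resolve(), device="cpu"))
optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))

# Datasets are loaded the same way the notebook does; the paths are placeholders.
train_dataset = load_data("./data/train_data.json")
val_dataset = load_data("./data/test_data.json")
context = optimizer.optimize_from_dataset(train_dataset, val_dataset)
```

Dispatching `set_config` on the config object's type keeps a single method name for all run-level settings while the config dataclasses remain the single source of defaults.
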
tests/pipeline/test_optimization.py | 45 ++++++++++++++----- 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/autointent/context/context.py b/autointent/context/context.py index 652751d1..9dabf70e 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -68,8 +68,12 @@ def config_data(self, config: DataConfig, augmentation_config: AugmentationConfi augmenter=augmenter, ) - def set_datasets(self, train_data: Dataset, val_data: Dataset | None = None) -> None: - self.data_handler = DataHandler(dataset=train_data, test_dataset=val_data, random_seed=self.seed) + def set_datasets( + self, train_data: Dataset, val_data: Dataset | None = None, force_multilabel: bool = False + ) -> None: + self.data_handler = DataHandler( + dataset=train_data, test_dataset=val_data, random_seed=self.seed, force_multilabel=force_multilabel + ) def get_best_index(self) -> VectorIndex: model_name = self.optimization_info.get_best_embedder() diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index d0a521c1..3552dad0 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -45,9 +45,11 @@ def optimize(self, context: Context) -> None: for node_optimizer in self.nodes: node_optimizer.fit(context) - def optimize_from_dataset(self, train_data: Dataset, val_data: Dataset | None = None) -> Context: + def optimize_from_dataset( + self, train_data: Dataset, val_data: Dataset | None = None, force_multilabel: bool = False + ) -> Context: context = Context() - context.set_datasets(train_data, val_data) + context.set_datasets(train_data, val_data, force_multilabel) context.config_logs(self.logging_config) context.config_vector_index(self.vector_index_config, self.embedder_config) diff --git a/experiments/python-node-optimization/testbed.ipynb b/experiments/python-node-optimization/testbed.ipynb index 12962095..5c0d6842 100644 --- a/experiments/python-node-optimization/testbed.ipynb +++ b/experiments/python-node-optimization/testbed.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [ { diff --git a/tests/pipeline/test_optimization.py b/tests/pipeline/test_optimization.py index 5e0908a5..4e22ccf9 100644 --- a/tests/pipeline/test_optimization.py +++ b/tests/pipeline/test_optimization.py @@ -1,50 +1,71 @@ import importlib.resources as ires +from pathlib import Path from typing import Literal import pytest from autointent.configs.optimization_cli import ( DataConfig, + EmbedderConfig, LoggingConfig, OptimizationConfig, TaskConfig, VectorIndexConfig, ) +from autointent.pipeline.optimization import PipelineOptimizer from autointent.pipeline.optimization.cli_endpoint import main as optimize_pipeline from autointent.pipeline.optimization.utils import load_config from tests.conftest import setup_environment -ConfigType = Literal["multiclass", "multilabel"] +TaskType = Literal["multiclass", "multilabel", "description"] -@pytest.fixture -def get_config(): - def _get_config(config_type: ConfigType): - config_path = ires.files("tests.assets.configs").joinpath(f"{config_type}.yaml") - return load_config(str(config_path), multilabel=config_type == "multilabel") +def get_search_space_path(task_type: TaskType): + return ires.files("tests.assets.configs").joinpath(f"{task_type}.yaml") - return _get_config + +def get_search_space(task_type: TaskType): + path = 
get_search_space_path(task_type) + return load_config(str(path), multilabel=task_type == "multilabel") + + +@pytest.mark.parametrize( + "task_type", + ["multiclass", "multilabel", "description"], +) +def test_no_context_optimization(dataset, task_type): + db_dir, dump_dir, logs_dir = setup_environment() + search_space = get_search_space(task_type) + + pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space) + + pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=False)) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cpu")) + pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) + + context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) + context.dump() @pytest.mark.parametrize( - "dataset_type", + "task_type", ["multiclass", "multilabel", "description"], ) -def test_optimization_pipeline_cli(dataset_type): +def test_optimization_pipeline_cli(task_type): db_dir, dump_dir, logs_dir = setup_environment() config = OptimizationConfig( data=DataConfig( train_path=ires.files("tests.assets.data").joinpath("clinc_subset.json"), - force_multilabel=(dataset_type == "multilabel"), + force_multilabel=(task_type == "multilabel"), ), task=TaskConfig( - search_space_path=ires.files("tests.assets.configs").joinpath(f"{dataset_type}.yaml"), + search_space_path=get_search_space_path(task_type), ), vector_index=VectorIndexConfig( device="cpu", ), logs=LoggingConfig( - dirpath=logs_dir, + dirpath=Path(logs_dir), ), ) optimize_pipeline(config) From d305bb5dd574d238868bab25d1a2326484d37224 Mon Sep 17 00:00:00 2001 From: voorhs Date: Wed, 6 Nov 2024 11:56:11 +0300 Subject: [PATCH 10/21] add `clear_ram` option --- autointent/configs/optimization_cli.py | 3 ++- autointent/context/context.py | 3 +++ autointent/context/optimization_info/data_models.py | 12 ++++++++++++ .../context/optimization_info/optimization_info.py | 9 ++++++++- autointent/nodes/optimization/node_optimizer.py | 8 +++++--- 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/autointent/configs/optimization_cli.py b/autointent/configs/optimization_cli.py index ca1d9471..2d3ea290 100644 --- a/autointent/configs/optimization_cli.py +++ b/autointent/configs/optimization_cli.py @@ -28,7 +28,8 @@ class LoggingConfig: run_name: str | None = None dirpath: Path | None = None dump_dir: Path | None = None - dump_modules: bool = True + dump_modules: bool = False + clear_ram: bool = True def __post_init__(self) -> None: self.define_run_name() diff --git a/autointent/context/context.py b/autointent/context/context.py index 9dabf70e..dcd46e46 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -154,3 +154,6 @@ def is_multilabel(self) -> bool: def get_n_classes(self) -> int: return self.data_handler.n_classes + + def is_ram_to_clear(self) -> bool: + return self.logging_config.clear_ram diff --git a/autointent/context/optimization_info/data_models.py b/autointent/context/optimization_info/data_models.py index 8a0e0248..83e10931 100644 --- a/autointent/context/optimization_info/data_models.py +++ b/autointent/context/optimization_info/data_models.py @@ -4,6 +4,8 @@ from numpy.typing import NDArray from pydantic import BaseModel, ConfigDict, Field +# from autointent.modules.base import str + class Artifact(BaseModel): ... 
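The `clear_ram` flag added in this patch inverts the optimizer's previous always-clear behaviour: when it is true, each trial is logged without a module reference and the fitted module's caches are freed; when it is false, the module object itself is retained on the optimization info so later stages can reuse it without reloading from disk. A simplified sketch of the branch this patch adds to `NodeOptimizer.fit`; the `finish_trial` wrapper is an illustration, only the calls inside it come from the diff:

```python
import gc

import torch


def finish_trial(context, module, log_trial) -> None:
    # Keep a live reference to the fitted module only if the user
    # opted out of clearing RAM.
    keep_in_ram = not context.is_ram_to_clear()
    log_trial(module=module if keep_in_ram else None)

    if keep_in_ram:
        return

    # Otherwise free everything the trial allocated, as the optimizer
    # previously did unconditionally after every trial.
    module.clear_cache()
    gc.collect()
    torch.cuda.empty_cache()
```
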
@@ -115,3 +117,13 @@ def get_best_trial_idx(self, node_type: str) -> int | None: def set_best_trial_idx(self, node_type: str, idx: int) -> None: setattr(self, validate_node_name(node_type), idx) + + +class ModulesList(BaseModel): + regexp: list[str] = [] + retrieval: list[str] = [] + scoring: list[str] = [] + prediction: list[str] = [] + + def get(self, node_type: str) -> list[str]: + return getattr(self, node_type) diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index 7bee5ea7..027cc2de 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -4,8 +4,9 @@ from numpy.typing import NDArray from autointent.configs.node import InferenceNodeConfig +# from autointent.modules.base import Module -from .data_models import Artifact, Artifacts, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds +from .data_models import Artifact, Artifacts, ModulesList, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds from .logger import get_logger @@ -20,6 +21,7 @@ def __init__(self) -> None: self.artifacts = Artifacts() self.trials = Trials() self._trials_best_ids = TrialsIds() + self.modules = ModulesList() def log_module_optimization( self, @@ -30,6 +32,7 @@ def log_module_optimization( metric_name: str, artifact: Artifact, module_dump_dir: str | None, + module = None, ) -> None: """ Purposes: @@ -48,6 +51,10 @@ def log_module_optimization( self.trials.add_trial(node_type, trial) self._logger.info(trial.model_dump()) + # save module + if module is not None: + self.modules.get(node_type).append(module) + # save artifact self.artifacts.add_artifact(node_type, artifact) diff --git a/autointent/nodes/optimization/node_optimizer.py b/autointent/nodes/optimization/node_optimizer.py index 897cbf6d..b281e01b 100644 --- a/autointent/nodes/optimization/node_optimizer.py +++ b/autointent/nodes/optimization/node_optimizer.py @@ -63,11 +63,13 @@ def fit(self, context: Context) -> None: self.metric_name, assets, # retriever name / scores / predictions module_dump_dir, + module=module if not context.is_ram_to_clear() else None ) - module.clear_cache() - gc.collect() - torch.cuda.empty_cache() + if context.is_ram_to_clear(): + module.clear_cache() + gc.collect() + torch.cuda.empty_cache() self._logger.info("%s node optimization is finished!", self.node_info.node_type) From d648849efb406870af80c0d13b1f76875ec2500b Mon Sep 17 00:00:00 2001 From: voorhs Date: Wed, 6 Nov 2024 13:14:39 +0300 Subject: [PATCH 11/21] infering modules from ram after optimization --- autointent/context/context.py | 4 + .../optimization_info/optimization_info.py | 13 +- autointent/nodes/inference/inference_node.py | 19 ++- .../pipeline/inference/inference_pipeline.py | 25 ++- .../python-node-optimization/testbed.ipynb | 161 ++++++++++++++++-- 5 files changed, 191 insertions(+), 31 deletions(-) diff --git a/autointent/context/context.py b/autointent/context/context.py index dcd46e46..4e67a614 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -157,3 +157,7 @@ def get_n_classes(self) -> int: def is_ram_to_clear(self) -> bool: return self.logging_config.clear_ram + + def has_saved_modules(self) -> bool: + node_types = ["regexp", "retrieval", "scoring", "prediction"] + return any(len(self.optimization_info.modules.get(nt)) > 0 for nt in node_types) diff --git a/autointent/context/optimization_info/optimization_info.py 
b/autointent/context/optimization_info/optimization_info.py index 027cc2de..b96d831b 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -4,8 +4,8 @@ from numpy.typing import NDArray from autointent.configs.node import InferenceNodeConfig -# from autointent.modules.base import Module +# from autointent.modules.base import Module from .data_models import Artifact, Artifacts, ModulesList, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds from .logger import get_logger @@ -116,3 +116,14 @@ def get_inference_nodes_config(self) -> list[InferenceNodeConfig]: ) ) return res + + def _get_best_module(self, node_type: str): + idx = self._get_best_trial_idx(node_type) + if idx is not None: + return self.modules.get(node_type)[idx] + return None + + def get_best_modules(self): + node_types = ["regexp", "retrieval", "scoring", "prediction"] + res = {nt: self._get_best_module(nt) for nt in node_types} + return {nt: m for nt, m in res.items() if m is not None} diff --git a/autointent/nodes/inference/inference_node.py b/autointent/nodes/inference/inference_node.py index 9549f576..361fd92a 100644 --- a/autointent/nodes/inference/inference_node.py +++ b/autointent/nodes/inference/inference_node.py @@ -1,22 +1,25 @@ import gc -from typing import Any import torch -from hydra.utils import instantiate from autointent.configs.node import InferenceNodeConfig +from autointent.modules.base import Module from autointent.nodes.nodes_info import NODES_INFO class InferenceNode: - def __init__(self, node_type: str, module_type: str, module_config: dict[str, Any], load_path: str) -> None: - self.node_info = NODES_INFO[node_type] - self.module = self.node_info.modules_available[module_type](**module_config) - self.module.load(load_path) + def __init__(self, module: Module, node_type: str) -> None: + self.module = module + self.node_type = node_type @classmethod - def from_dict_config(cls, config: dict[str, Any]) -> "InferenceNode": - return instantiate(InferenceNodeConfig, **config) # type: ignore[no-any-return] + def from_config( + cls, config: InferenceNodeConfig + ) -> "InferenceNode": + node_info = NODES_INFO[config.node_type] + module = node_info.modules_available[config.module_type](**config.module_config) + module.load(config.load_path) + return cls(module, config.node_type) def clear_cache(self) -> None: self.module.clear_cache() diff --git a/autointent/pipeline/inference/inference_pipeline.py b/autointent/pipeline/inference/inference_pipeline.py index abcd2fda..372ae729 100644 --- a/autointent/pipeline/inference/inference_pipeline.py +++ b/autointent/pipeline/inference/inference_pipeline.py @@ -1,19 +1,17 @@ -from typing import Any - -from hydra.utils import instantiate - -from autointent.configs.inference_pipeline import InferencePipelineConfig +from autointent.configs.node import InferenceNodeConfig +from autointent.context import Context from autointent.custom_types import LabelType from autointent.nodes.inference import InferenceNode class InferencePipeline: def __init__(self, nodes: list[InferenceNode]) -> None: - self.nodes = {node.node_info.node_type: node for node in nodes} + self.nodes = {n.node_type: n for n in nodes} @classmethod - def from_dict_config(cls, config: dict[str, Any]) -> "InferencePipeline": - return instantiate(InferencePipelineConfig, **config) # type: ignore[no-any-return] + def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> None: + nodes = [InferenceNode.from_config(cfg) for 
cfg in nodes_configs] + return cls(nodes) def predict(self, utterances: list[str]) -> list[LabelType]: scores = self.nodes["scoring"].module.predict(utterances) @@ -21,3 +19,14 @@ def predict(self, utterances: list[str]) -> list[LabelType]: def fit(self, utterances: list[str], labels: list[LabelType]) -> None: pass + + @classmethod + def from_context(cls, context: Context) -> "InferencePipeline": + if not context.has_saved_modules(): + config = context.optimization_info.get_inference_nodes_config() + return cls.from_config(config) + nodes = [ + InferenceNode(module, node_type) + for node_type, module in context.optimization_info.get_best_modules().items() + ] + return InferencePipeline(nodes) diff --git a/experiments/python-node-optimization/testbed.ipynb b/experiments/python-node-optimization/testbed.ipynb index 5c0d6842..95cc6649 100644 --- a/experiments/python-node-optimization/testbed.ipynb +++ b/experiments/python-node-optimization/testbed.ipynb @@ -1,5 +1,15 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -11,12 +21,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Load datasets" + "## Replicate full-fledged optimization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load datasets" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -25,7 +42,7 @@ "(165, 57)" ] }, - "execution_count": 1, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -43,12 +60,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Define Search Space" + "### Define Search Space" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -91,19 +108,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## [Optional] Configure Your Run" + "### [Optional] Configure Your Run" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from autointent.configs.optimization_cli import LoggingConfig, VectorIndexConfig, EmbedderConfig\n", "from pathlib import Path\n", "\n", - "pipeline_optimizer.set_config(LoggingConfig(run_name=\"sweet_cucumber\", dirpath=Path(\".\").resolve(), dump_modules=False))\n", + "pipeline_optimizer.set_config(LoggingConfig(run_name=\"sweet_cucumber\", dirpath=Path.cwd(), dump_modules=True, clear_ram=True))\n", "pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(\"./my_vector_db\").resolve(), device=\"cuda\"))\n", "pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))" ] @@ -112,41 +129,157 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Run Optimization" + "### Run Optimization" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[I 2024-11-05 18:08:17,123] A new study created in memory with name: no-name-5066322d-4fcd-4a17-8699-c3670e71e698\n" + "[I 2024-11-06 13:10:03,339] A new study created in memory with name: no-name-85c71fe7-cc94-448b-a9a0-46470688fb6b\n" ] } ], "source": [ - "context = pipeline_optimizer.optimize_from_dataset(scoring_dataset, prediction_dataset) # data with partitions: train_1, train_2, val_1, val_2, test" + "context = pipeline_optimizer.optimize_from_dataset(scoring_dataset, 
prediction_dataset)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Save Logs" + "### Save Logs" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "context.dump()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from autointent.pipeline.inference import InferencePipeline\n", + "\n", + "inference_pipeline = InferencePipeline.from_context(context)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inference_pipeline.predict([\"hello world\", \"what is the eagles address\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## No modules dumping" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + } + ], + "source": [ + "! rm -rf sweet_cucumber*" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_optimizer.set_config(LoggingConfig(dump_modules=False, clear_ram=False))\n", + "pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(\"./my_vector_db\").resolve(), device=\"cuda\"))\n", + "pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "context = pipeline_optimizer.optimize_from_dataset(scoring_dataset, prediction_dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "inference_pipeline = InferencePipeline.from_context(context)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inference_pipeline.predict([\"hello world\", \"what is the eagles address\"])" + ] } ], "metadata": { From e7d0fbd86942116dcfe467ba0eaa1733c4756700 Mon Sep 17 00:00:00 2001 From: voorhs Date: Wed, 6 Nov 2024 13:34:09 +0300 Subject: [PATCH 12/21] minor change --- .../pipeline/inference/inference_pipeline.py | 2 +- .../python-node-optimization/testbed.ipynb | 54 +++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/autointent/pipeline/inference/inference_pipeline.py b/autointent/pipeline/inference/inference_pipeline.py index 372ae729..c6926426 100644 --- a/autointent/pipeline/inference/inference_pipeline.py +++ b/autointent/pipeline/inference/inference_pipeline.py @@ -9,7 +9,7 @@ def __init__(self, nodes: list[InferenceNode]) -> 
None: self.nodes = {n.node_type: n for n in nodes} @classmethod - def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> None: + def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> "InferencePipeline": nodes = [InferenceNode.from_config(cfg) for cfg in nodes_configs] return cls(nodes) diff --git a/experiments/python-node-optimization/testbed.ipynb b/experiments/python-node-optimization/testbed.ipynb index 95cc6649..746eb49f 100644 --- a/experiments/python-node-optimization/testbed.ipynb +++ b/experiments/python-node-optimization/testbed.ipynb @@ -141,7 +141,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "[I 2024-11-06 13:10:03,339] A new study created in memory with name: no-name-85c71fe7-cc94-448b-a9a0-46470688fb6b\n" + "[I 2024-11-06 13:31:52,764] A new study created in memory with name: no-name-4b9b4940-4128-414d-8733-3236f7935735\n" ] } ], @@ -165,16 +165,64 @@ "context.dump()" ] }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "inference_config = context.optimization_info.get_inference_nodes_config()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run Inference from file system" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from autointent.pipeline.inference import InferencePipeline\n", + "\n", + "inference_pipeline = InferencePipeline.from_config(inference_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inference_pipeline.predict([\"hello world\", \"what is the eagles address\"])" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Run Inference" + "## Run Inference from context [but from file system]" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ From 975c8dfa50ede43c81367e094b2ee063cfd16877 Mon Sep 17 00:00:00 2001 From: voorhs Date: Wed, 6 Nov 2024 13:46:27 +0300 Subject: [PATCH 13/21] fix unintended `runs` directory creation --- autointent/configs/optimization_cli.py | 1 - autointent/context/optimization_info/optimization_info.py | 3 +-- autointent/nodes/inference/inference_node.py | 4 +--- autointent/nodes/optimization/node_optimizer.py | 2 +- autointent/pipeline/optimization/utils/__init__.py | 3 +-- autointent/pipeline/optimization/utils/cli.py | 8 -------- 6 files changed, 4 insertions(+), 17 deletions(-) diff --git a/autointent/configs/optimization_cli.py b/autointent/configs/optimization_cli.py index 2d3ea290..080b602c 100644 --- a/autointent/configs/optimization_cli.py +++ b/autointent/configs/optimization_cli.py @@ -46,7 +46,6 @@ def define_dirpath(self) -> None: if self.run_name is None: raise ValueError self.dirpath = dirpath / self.run_name - self.dirpath.mkdir(parents=True) def define_dump_dir(self) -> None: if self.dump_dir is None: diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index b96d831b..bce20945 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -5,7 +5,6 @@ from autointent.configs.node import 
InferenceNodeConfig -# from autointent.modules.base import Module from .data_models import Artifact, Artifacts, ModulesList, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds from .logger import get_logger @@ -32,7 +31,7 @@ def log_module_optimization( metric_name: str, artifact: Artifact, module_dump_dir: str | None, - module = None, + module=None, ) -> None: """ Purposes: diff --git a/autointent/nodes/inference/inference_node.py b/autointent/nodes/inference/inference_node.py index 361fd92a..705fb526 100644 --- a/autointent/nodes/inference/inference_node.py +++ b/autointent/nodes/inference/inference_node.py @@ -13,9 +13,7 @@ def __init__(self, module: Module, node_type: str) -> None: self.node_type = node_type @classmethod - def from_config( - cls, config: InferenceNodeConfig - ) -> "InferenceNode": + def from_config(cls, config: InferenceNodeConfig) -> "InferenceNode": node_info = NODES_INFO[config.node_type] module = node_info.modules_available[config.module_type](**config.module_config) module.load(config.load_path) diff --git a/autointent/nodes/optimization/node_optimizer.py b/autointent/nodes/optimization/node_optimizer.py index b281e01b..2a483556 100644 --- a/autointent/nodes/optimization/node_optimizer.py +++ b/autointent/nodes/optimization/node_optimizer.py @@ -63,7 +63,7 @@ def fit(self, context: Context) -> None: self.metric_name, assets, # retriever name / scores / predictions module_dump_dir, - module=module if not context.is_ram_to_clear() else None + module=module if not context.is_ram_to_clear() else None, ) if context.is_ram_to_clear(): diff --git a/autointent/pipeline/optimization/utils/__init__.py b/autointent/pipeline/optimization/utils/__init__.py index e0aa17b6..2a948bd8 100644 --- a/autointent/pipeline/optimization/utils/__init__.py +++ b/autointent/pipeline/optimization/utils/__init__.py @@ -1,6 +1,5 @@ -from .cli import get_logs_dir, load_config +from .cli import load_config __all__ = [ "load_config", - "get_logs_dir", ] diff --git a/autointent/pipeline/optimization/utils/cli.py b/autointent/pipeline/optimization/utils/cli.py index 2cd4e911..7ceb8967 100644 --- a/autointent/pipeline/optimization/utils/cli.py +++ b/autointent/pipeline/optimization/utils/cli.py @@ -6,14 +6,6 @@ import yaml -def get_logs_dir(run_name: str, logs_dir: Path | None = None) -> Path: - if logs_dir is None: - logs_dir = Path.cwd() - res = logs_dir / run_name - res.mkdir(parents=True) - return res - - def load_config(config_path: str | Path | None, multilabel: bool, logger: Logger | None = None) -> dict[str, Any]: """load config from the given path or load default config which is distributed along with the autointent package""" if config_path is not None: From 378e582c977b778ca869d192c3f974929b9fcabb Mon Sep 17 00:00:00 2001 From: voorhs Date: Wed, 6 Nov 2024 14:03:23 +0300 Subject: [PATCH 14/21] add `save_db` option --- autointent/configs/optimization_cli.py | 1 + .../context/vector_index_client/cache.py | 97 +------------------ .../vector_index_client.py | 3 + .../optimization/pipeline_optimizer.py | 2 + 4 files changed, 8 insertions(+), 95 deletions(-) diff --git a/autointent/configs/optimization_cli.py b/autointent/configs/optimization_cli.py index 080b602c..e9f74cfd 100644 --- a/autointent/configs/optimization_cli.py +++ b/autointent/configs/optimization_cli.py @@ -58,6 +58,7 @@ def define_dump_dir(self) -> None: class VectorIndexConfig: db_dir: Path | None = None device: str = "cpu" + save_db: bool = False @dataclass diff --git a/autointent/context/vector_index_client/cache.py 
b/autointent/context/vector_index_client/cache.py index 843d2de2..663dc79e 100644 --- a/autointent/context/vector_index_client/cache.py +++ b/autointent/context/vector_index_client/cache.py @@ -1,71 +1,6 @@ -import json -import logging -import shutil -from dataclasses import asdict, dataclass, field from pathlib import Path from uuid import uuid4 -from appdirs import user_cache_dir, user_config_dir - - -def get_logger() -> logging.Logger: - logger = logging.getLogger("my_logger") - - logger.setLevel(logging.INFO) - - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - - formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") - ch.setFormatter(formatter) - - logger.addHandler(ch) - - return logger - - -@dataclass -class ChromaConfig: - cache_directories: list[str] = field(default_factory=list) - - -def get_chroma_cache_dir() -> Path: - """Get system's default cache dir.""" - cache_dir = user_cache_dir("autointent") - return Path(cache_dir) / "chroma" - - -def get_chroma_config_path() -> Path: - """Get system's default config dir.""" - config_dir = user_config_dir("autointent") - return Path(config_dir) / "chromadb.json" - - -def read_chroma_config() -> ChromaConfig: - path = get_chroma_config_path() - if not path.exists(): - return ChromaConfig() - with path.open() as file: - return ChromaConfig(**json.load(file)) - - -def write_chroma_config(config: ChromaConfig) -> None: - path = get_chroma_config_path() - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("w") as file: - json.dump(asdict(config), file, ensure_ascii=False, indent=4) - - -def add_cache_directory(directory: str) -> None: - """Save path into chroma config in order to remove it from cache later.""" - chroma_config = read_chroma_config() - - directories = set(chroma_config.cache_directories) - directories.add(directory) - chroma_config.cache_directories = sorted(directories) - - write_chroma_config(chroma_config) - def get_db_dir(db_dir: str | Path | None = None) -> Path: """ @@ -74,36 +9,8 @@ def get_db_dir(db_dir: str | Path | None = None) -> Path: Save path into user config in order to remove it from cache later. 
""" - db_dir = get_chroma_cache_dir() / str(uuid4()) if db_dir is None else Path(db_dir) + root = Path(db_dir) if db_dir is not None else Path.cwd() + db_dir = root / "vector_db" / str(uuid4()) if db_dir is None else Path(db_dir) db_dir.mkdir(parents=True, exist_ok=True) - add_cache_directory(str(db_dir.resolve())) return db_dir - - -def clear_chroma_cache() -> None: - # TODO: test on all platforms - logger = get_logger() - chroma_config = read_chroma_config() - for cache_dirs in chroma_config.cache_directories: - if Path(cache_dirs).exists(): - shutil.rmtree(cache_dirs) - logger.info("cleared vector index at %s", cache_dirs) - else: - logger.error("vector index at %s not found", cache_dirs) - chroma_config.cache_directories.remove(cache_dirs) - write_chroma_config(chroma_config) - - -def clear_specific_cache(directory: str) -> None: - """TODO test this code""" - chroma_config = read_chroma_config() - if directory in chroma_config.cache_directories: - try: - shutil.rmtree(directory) - chroma_config.cache_directories.remove(directory) - write_chroma_config(chroma_config) - except OSError: - pass - else: - pass diff --git a/autointent/context/vector_index_client/vector_index_client.py b/autointent/context/vector_index_client/vector_index_client.py index 19a79816..1cb83801 100644 --- a/autointent/context/vector_index_client/vector_index_client.py +++ b/autointent/context/vector_index_client/vector_index_client.py @@ -108,6 +108,9 @@ def get_index(self, model_name: str) -> VectorIndex: def exists(self, model_name: str) -> bool: return self._get_index_dirpath(model_name) is not None + def delete_db(self) -> None: + shutil.rmtree(self.db_dir) + class NonExistingIndexError(Exception): def __init__(self, message: str = "non-existent index was requested") -> None: diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index 3552dad0..f391e77d 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -44,6 +44,8 @@ def optimize(self, context: Context) -> None: self._logger.info("starting pipeline optimization...") for node_optimizer in self.nodes: node_optimizer.fit(context) + if not context.vector_index_config.save_db: + context.vector_index_client.delete_db() def optimize_from_dataset( self, train_data: Dataset, val_data: Dataset | None = None, force_multilabel: bool = False From 8c2eaff902601341aacf3097936bb604dd8bc508 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 6 Nov 2024 22:41:50 +0300 Subject: [PATCH 15/21] fix circular imports --- .../context/optimization_info/data_models.py | 10 ------ .../optimization_info/optimization_info.py | 35 +++++++++++++------ autointent/modules/prediction/argmax.py | 1 - autointent/nodes/inference/inference_node.py | 3 +- .../pipeline/inference/inference_pipeline.py | 10 ++---- 5 files changed, 29 insertions(+), 30 deletions(-) diff --git a/autointent/context/optimization_info/data_models.py b/autointent/context/optimization_info/data_models.py index 0f513770..611387a4 100644 --- a/autointent/context/optimization_info/data_models.py +++ b/autointent/context/optimization_info/data_models.py @@ -117,13 +117,3 @@ def get_best_trial_idx(self, node_type: str) -> int | None: def set_best_trial_idx(self, node_type: str, idx: int) -> None: setattr(self, validate_node_name(node_type), idx) - - -class ModulesList(BaseModel): - regexp: list[str] = [] - retrieval: list[str] = [] 
- scoring: list[str] = [] - prediction: list[str] = [] - - def get(self, node_type: str) -> list[str]: - return getattr(self, node_type) diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index 744b7e3d..467ada4f 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -1,15 +1,31 @@ -from typing import Any +from typing import TYPE_CHECKING, Any import numpy as np from numpy.typing import NDArray +from pydantic import BaseModel from autointent.configs.node import InferenceNodeConfig from autointent.custom_types import NODE_TYPES, NodeType -from autointent.logger import get_logger -from .data_models import Artifact, Artifacts, ModulesList, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds +from .data_models import Artifact, Artifacts, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds from .logger import get_logger +if TYPE_CHECKING: + from autointent.modules import Module + + +class ModulesList(BaseModel): + regexp: list["Module"] = [] + retrieval: list["Module"] = [] + scoring: list["Module"] = [] + prediction: list["Module"] = [] + + def get(self, node_type: str) -> list["Module"]: + return getattr(self, node_type) # type: ignore[no-any-return] + + def add_module(self, node_type: str, module: "Module") -> None: + self.get(node_type).append(module) + class OptimizationInfo: """TODO continous IO with file system (to be able to restore the state of optimization)""" @@ -33,7 +49,7 @@ def log_module_optimization( metric_name: str, artifact: Artifact, module_dump_dir: str | None, - module=None, + module: "Module | None" = None, ) -> None: """ Purposes: @@ -53,8 +69,8 @@ def log_module_optimization( self._logger.info(trial.model_dump()) # save module - if module is not None: - self.modules.get(node_type).append(module) + if module: + self.modules.add_module(node_type, module) # save artifact self.artifacts.add_artifact(node_type, artifact) @@ -114,13 +130,12 @@ def get_inference_nodes_config(self) -> list[InferenceNodeConfig]: ) return res - def _get_best_module(self, node_type: str): + def _get_best_module(self, node_type: str) -> "Module | None": idx = self._get_best_trial_idx(node_type) if idx is not None: return self.modules.get(node_type)[idx] return None - def get_best_modules(self): - node_types = ["regexp", "retrieval", "scoring", "prediction"] - res = {nt: self._get_best_module(nt) for nt in node_types} + def get_best_modules(self) -> dict[str, "Module"]: + res = {nt: self._get_best_module(nt) for nt in NODE_TYPES} return {nt: m for nt, m in res.items() if m is not None} diff --git a/autointent/modules/prediction/argmax.py b/autointent/modules/prediction/argmax.py index 5b76a665..57c6baf4 100644 --- a/autointent/modules/prediction/argmax.py +++ b/autointent/modules/prediction/argmax.py @@ -18,7 +18,6 @@ class ArgmaxPredictor(PredictionModule): def __init__(self) -> None: pass - @classmethod def from_context(cls, context: Context) -> Self: return cls() diff --git a/autointent/nodes/inference/inference_node.py b/autointent/nodes/inference/inference_node.py index 705fb526..f73da944 100644 --- a/autointent/nodes/inference/inference_node.py +++ b/autointent/nodes/inference/inference_node.py @@ -16,7 +16,8 @@ def __init__(self, module: Module, node_type: str) -> None: def from_config(cls, config: InferenceNodeConfig) -> "InferenceNode": node_info = NODES_INFO[config.node_type] module = 
node_info.modules_available[config.module_type](**config.module_config) - module.load(config.load_path) + if config.load_path is not None: + module.load(config.load_path) return cls(module, config.node_type) def clear_cache(self) -> None: diff --git a/autointent/pipeline/inference/inference_pipeline.py b/autointent/pipeline/inference/inference_pipeline.py index 5af1d995..9b593345 100644 --- a/autointent/pipeline/inference/inference_pipeline.py +++ b/autointent/pipeline/inference/inference_pipeline.py @@ -1,18 +1,12 @@ -from typing import Any - -from hydra.utils import instantiate - -from autointent.configs.inference_pipeline import InferencePipelineConfig -from autointent.custom_types import LabelType, NodeType from autointent.configs.node import InferenceNodeConfig from autointent.context import Context -from autointent.custom_types import LabelType +from autointent.custom_types import LabelType, NodeType from autointent.nodes.inference import InferenceNode class InferencePipeline: def __init__(self, nodes: list[InferenceNode]) -> None: - self.nodes = {node.node_info.node_type: node for node in nodes} + self.nodes = {node.node_type: node for node in nodes} @classmethod def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> "InferencePipeline": From 322340bec35624b0350ea693e1eff49e21575992 Mon Sep 17 00:00:00 2001 From: voorhs Date: Fri, 8 Nov 2024 23:11:19 +0300 Subject: [PATCH 16/21] fix tests --- .../optimization_info/optimization_info.py | 13 +++++---- autointent/pipeline/inference/cli_endpoint.py | 3 +- tests/nodes/conftest.py | 2 +- tests/nodes/test_predicton.py | 29 ++++++++++--------- tests/nodes/test_retrieval.py | 29 ++++++++++--------- tests/nodes/test_scoring.py | 29 ++++++++++--------- 6 files changed, 54 insertions(+), 51 deletions(-) diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index 467ada4f..9040d376 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -1,8 +1,8 @@ +from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any import numpy as np from numpy.typing import NDArray -from pydantic import BaseModel from autointent.configs.node import InferenceNodeConfig from autointent.custom_types import NODE_TYPES, NodeType @@ -14,11 +14,12 @@ from autointent.modules import Module -class ModulesList(BaseModel): - regexp: list["Module"] = [] - retrieval: list["Module"] = [] - scoring: list["Module"] = [] - prediction: list["Module"] = [] +@dataclass +class ModulesList: + regexp: list["Module"] = field(default_factory=list) + retrieval: list["Module"] = field(default_factory=list) + scoring: list["Module"] = field(default_factory=list) + prediction: list["Module"] = field(default_factory=list) def get(self, node_type: str) -> list["Module"]: return getattr(self, node_type) # type: ignore[no-any-return] diff --git a/autointent/pipeline/inference/cli_endpoint.py b/autointent/pipeline/inference/cli_endpoint.py index 690fe8a5..ad833517 100644 --- a/autointent/pipeline/inference/cli_endpoint.py +++ b/autointent/pipeline/inference/cli_endpoint.py @@ -29,8 +29,7 @@ def main(cfg: InferenceConfig) -> None: logger.debug("Inference config loaded") # instantiate pipeline - pipeline_config = {"nodes": inference_config["nodes_configs"]} - pipeline = InferencePipeline.from_dict_config(pipeline_config) + pipeline = InferencePipeline.from_config(inference_config["nodes_configs"]) # send data to 
pipeline labels: list[LabelType] = pipeline.predict(data) diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index 62b75c2e..da221e2e 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -76,7 +76,7 @@ def context(dataset_path): def _context(multilabel: bool): res = Context() res.config_data(DataConfig(dataset_path, force_multilabel=multilabel)) - res.config_logs(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir)) + res.config_logs(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir, dump_modules=True)) res.config_vector_index(VectorIndexConfig(db_dir=db_dir)) return res diff --git a/tests/nodes/test_predicton.py b/tests/nodes/test_predicton.py index 97434448..9c675434 100644 --- a/tests/nodes/test_predicton.py +++ b/tests/nodes/test_predicton.py @@ -4,6 +4,7 @@ import numpy as np import torch +from autointent.configs.node import InferenceNodeConfig from autointent.nodes import InferenceNode from autointent.nodes.optimization import NodeOptimizer @@ -34,13 +35,13 @@ def test_prediction_multiclass(scoring_optimizer_multiclass): prediction_optimizer.fit(context) for trial in context.optimization_info.trials.prediction: - config = { - "node_type": "prediction", - "module_type": trial.module_type, - "module_config": trial.module_params, - "load_path": trial.module_dump_dir, - } - node = InferenceNode(**config) + config = InferenceNodeConfig( + node_type="prediction", + module_type=trial.module_type, + module_config=trial.module_params, + load_path=trial.module_dump_dir, + ) + node = InferenceNode.from_config(config) node.module.predict(np.array([[0.27486506, 0.31681463, 0.37459106], [0.2769358, 0.31536099, 0.37366978]])) node.module.clear_cache() gc.collect() @@ -65,13 +66,13 @@ def test_prediction_multilabel(scoring_optimizer_multilabel): prediction_optimizer.fit(context) for trial in context.optimization_info.trials.prediction: - config = { - "node_type": "prediction", - "module_type": trial.module_type, - "module_config": trial.module_params, - "load_path": trial.module_dump_dir, - } - node = InferenceNode(**config) + config = InferenceNodeConfig( + node_type="prediction", + module_type=trial.module_type, + module_config=trial.module_params, + load_path=trial.module_dump_dir, + ) + node = InferenceNode.from_config(config) node.module.predict(np.array([[0.27486506, 0.31681463, 0.37459106], [0.2769358, 0.31536099, 0.37366978]])) node.module.clear_cache() gc.collect() diff --git a/tests/nodes/test_retrieval.py b/tests/nodes/test_retrieval.py index 92417eb8..5d625692 100644 --- a/tests/nodes/test_retrieval.py +++ b/tests/nodes/test_retrieval.py @@ -3,6 +3,7 @@ import torch +from autointent.configs.node import InferenceNodeConfig from autointent.nodes import InferenceNode, NodeOptimizer logger = logging.getLogger(__name__) @@ -14,13 +15,13 @@ def test_retrieval_multiclass(context): retrieval_optimizer.fit(context) for trial in context.optimization_info.trials.retrieval: - config = { - "node_type": "retrieval", - "module_type": trial.module_type, - "module_config": trial.module_params, - "load_path": trial.module_dump_dir, - } - node = InferenceNode(**config) + config = InferenceNodeConfig( + node_type="retrieval", + module_type=trial.module_type, + module_config=trial.module_params, + load_path=trial.module_dump_dir, + ) + node = InferenceNode.from_config(config) labels, distances, texts = node.module.predict(["hello", "card"]) node.module.clear_cache() gc.collect() @@ -33,13 +34,13 @@ def test_retrieval_multilabel(context): retrieval_optimizer.fit(context) for trial 
in context.optimization_info.trials.retrieval: - config = { - "node_type": "retrieval", - "module_type": trial.module_type, - "module_config": trial.module_params, - "load_path": trial.module_dump_dir, - } - node = InferenceNode(**config) + config = InferenceNodeConfig( + node_type="retrieval", + module_type=trial.module_type, + module_config=trial.module_params, + load_path=trial.module_dump_dir, + ) + node = InferenceNode.from_config(config) labels, distances, texts = node.module.predict(["hello", "card"]) node.module.clear_cache() gc.collect() diff --git a/tests/nodes/test_scoring.py b/tests/nodes/test_scoring.py index 79a58ba4..8d037df2 100644 --- a/tests/nodes/test_scoring.py +++ b/tests/nodes/test_scoring.py @@ -3,6 +3,7 @@ import torch +from autointent.configs.node import InferenceNodeConfig from autointent.nodes import InferenceNode from autointent.nodes.optimization import NodeOptimizer @@ -47,13 +48,13 @@ def test_scoring_multiclass(context, retrieval_optimizer_multiclass): scoring_optimizer.fit(context) for trial in context.optimization_info.trials.scoring: - config = { - "node_type": "scoring", - "module_type": trial.module_type, - "module_config": trial.module_params, - "load_path": trial.module_dump_dir, - } - node = InferenceNode(**config) + config = InferenceNodeConfig( + node_type="scoring", + module_type=trial.module_type, + module_config=trial.module_params, + load_path=trial.module_dump_dir, + ) + node = InferenceNode.from_config(config) scores = node.module.predict(["hello", "world"]) # noqa: F841 node.module.clear_cache() gc.collect() @@ -87,13 +88,13 @@ def test_scoring_multilabel(context, retrieval_optimizer_multilabel): scoring_optimizer.fit(context) for trial in context.optimization_info.trials.scoring: - config = { - "node_type": "scoring", - "module_type": trial.module_type, - "module_config": trial.module_params, - "load_path": trial.module_dump_dir, - } - node = InferenceNode(**config) + config = InferenceNodeConfig( + node_type="scoring", + module_type=trial.module_type, + module_config=trial.module_params, + load_path=trial.module_dump_dir, + ) + node = InferenceNode.from_config(config) scores = node.module.predict(["hello", "world"]) # noqa: F841 node.module.clear_cache() gc.collect() From c4873633b3b705932c625e20ed232e6f94ce4ab8 Mon Sep 17 00:00:00 2001 From: Darinka <39233990+Darinochka@users.noreply.github.com> Date: Sat, 9 Nov 2024 10:47:42 +0300 Subject: [PATCH 17/21] Test/pipeline simpler fitting (#39) * tess: added inference_test * test: added inference pipeline cli * test: fixed device * test: added optimization tests * fix `inference_config.yaml` not found error --------- Co-authored-by: voorhs --- .../optimization_info/optimization_info.py | 2 +- autointent/pipeline/inference/cli_endpoint.py | 2 +- .../pipeline/inference/inference_pipeline.py | 8 ++ tests/conftest.py | 4 +- tests/pipeline/test_inference.py | 109 ++++++++++++++++++ tests/pipeline/test_optimization.py | 41 +++++++ 6 files changed, 162 insertions(+), 4 deletions(-) create mode 100644 tests/pipeline/test_inference.py diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py index 9040d376..48e22e8c 100644 --- a/autointent/context/optimization_info/optimization_info.py +++ b/autointent/context/optimization_info/optimization_info.py @@ -123,7 +123,7 @@ def get_inference_nodes_config(self) -> list[InferenceNodeConfig]: trial = self.trials.get_trial(node_type, idx) res.append( InferenceNodeConfig( - 
node_type=node_type, + node_type=node_type.value, module_type=trial.module_type, module_config=trial.module_params, load_path=trial.module_dump_dir, diff --git a/autointent/pipeline/inference/cli_endpoint.py b/autointent/pipeline/inference/cli_endpoint.py index ad833517..40f5d6b6 100644 --- a/autointent/pipeline/inference/cli_endpoint.py +++ b/autointent/pipeline/inference/cli_endpoint.py @@ -29,7 +29,7 @@ def main(cfg: InferenceConfig) -> None: logger.debug("Inference config loaded") # instantiate pipeline - pipeline = InferencePipeline.from_config(inference_config["nodes_configs"]) + pipeline = InferencePipeline.from_dict_config(inference_config["nodes_configs"]) # send data to pipeline labels: list[LabelType] = pipeline.predict(data) diff --git a/autointent/pipeline/inference/inference_pipeline.py b/autointent/pipeline/inference/inference_pipeline.py index 9b593345..39c9d73a 100644 --- a/autointent/pipeline/inference/inference_pipeline.py +++ b/autointent/pipeline/inference/inference_pipeline.py @@ -1,3 +1,5 @@ +from typing import Any + from autointent.configs.node import InferenceNodeConfig from autointent.context import Context from autointent.custom_types import LabelType, NodeType @@ -8,6 +10,12 @@ class InferencePipeline: def __init__(self, nodes: list[InferenceNode]) -> None: self.nodes = {node.node_type: node for node in nodes} + @classmethod + def from_dict_config(cls, nodes_configs: list[dict[str, Any]]) -> "InferencePipeline": + nodes_configs_ = [InferenceNodeConfig(**cfg) for cfg in nodes_configs] + nodes = [InferenceNode.from_config(cfg) for cfg in nodes_configs_] + return cls(nodes) + @classmethod def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> "InferencePipeline": nodes = [InferenceNode.from_config(cfg) for cfg in nodes_configs] diff --git a/tests/conftest.py b/tests/conftest.py index 08f95300..af217b98 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,8 +7,8 @@ def setup_environment() -> tuple[str, str]: - logs_dir = ires.files("tests").joinpath("logs") - db_dir = logs_dir / "db" / str(uuid4()) + logs_dir = ires.files("tests").joinpath("logs") / str(uuid4()) + db_dir = logs_dir / "db" dump_dir = logs_dir / "modules_dump" return db_dir, dump_dir, logs_dir diff --git a/tests/pipeline/test_inference.py b/tests/pipeline/test_inference.py new file mode 100644 index 00000000..a04a69c1 --- /dev/null +++ b/tests/pipeline/test_inference.py @@ -0,0 +1,109 @@ +import importlib.resources as ires +from pathlib import Path +from typing import Literal + +import pytest + +from autointent.configs.inference_cli import InferenceConfig +from autointent.configs.optimization_cli import ( + EmbedderConfig, + LoggingConfig, + VectorIndexConfig, +) +from autointent.pipeline.inference import InferencePipeline +from autointent.pipeline.inference.cli_endpoint import main as inference_pipeline +from autointent.pipeline.optimization import PipelineOptimizer +from autointent.pipeline.optimization.utils import load_config +from tests.conftest import setup_environment + +TaskType = Literal["multiclass", "multilabel", "description"] + + +def get_search_space_path(task_type: TaskType): + return ires.files("tests.assets.configs").joinpath(f"{task_type}.yaml") + + +def get_search_space(task_type: TaskType): + path = get_search_space_path(task_type) + return load_config(str(path), multilabel=task_type == "multilabel") + + +@pytest.mark.parametrize( + "task_type", + ["multiclass", "multilabel", "description"], +) +def test_inference_config(dataset, task_type): + db_dir, dump_dir, 
logs_dir = setup_environment() + search_space = get_search_space(task_type) + + pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space) + + pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=True)) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cpu", save_db=True)) + pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) + + context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) + inference_config = context.optimization_info.get_inference_nodes_config() + + inference_pipeline = InferencePipeline.from_config(inference_config) + prediction = inference_pipeline.predict(["123", "hello world"]) + if task_type == "multilabel": + assert prediction.shape == (2, len(dataset.intents)) + else: + assert prediction.shape == (2,) + + context.dump() + + +@pytest.mark.parametrize( + "task_type", + ["multiclass", "multilabel", "description"], +) +def test_inference_context(dataset, task_type): + db_dir, dump_dir, logs_dir = setup_environment() + search_space = get_search_space(task_type) + + pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space) + + pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=True)) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cpu", save_db=True)) + pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) + + context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) + inference_pipeline = InferencePipeline.from_context(context) + prediction = inference_pipeline.predict(["123", "hello world"]) + + if task_type == "multilabel": + assert prediction.shape == (2, len(dataset.intents)) + else: + assert prediction.shape == (2,) + + context.dump() + + +@pytest.mark.parametrize( + "task_type", + ["multiclass", "multilabel", "description"], +) +def test_inference_pipeline_cli(dataset, task_type): + db_dir, dump_dir, logs_dir = setup_environment() + search_space = get_search_space(task_type) + + pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space) + + pipeline_optimizer.set_config( + logging_config := LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_dir=dump_dir, dump_modules=True) + ) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cuda", save_db=True)) + pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) + context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) + + context.dump() + + config = InferenceConfig( + data_path=ires.files("tests.assets.data").joinpath("clinc_subset.json"), + source_dir=logging_config.dirpath, + output_path=logging_config.dump_dir, + log_level="CRITICAL", + ) + inference_pipeline(config) diff --git a/tests/pipeline/test_optimization.py b/tests/pipeline/test_optimization.py index 4e22ccf9..d34c2e29 100644 --- a/tests/pipeline/test_optimization.py +++ b/tests/pipeline/test_optimization.py @@ -1,4 +1,5 @@ import importlib.resources as ires +import os from pathlib import Path from typing import Literal @@ -47,6 +48,46 @@ def test_no_context_optimization(dataset, task_type): context.dump() +@pytest.mark.parametrize( + "task_type", + ["multiclass", "multilabel", "description"], +) +def test_save_db(dataset, task_type): + db_dir, dump_dir, logs_dir = setup_environment() + search_space = 
get_search_space(task_type) + + pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space) + + pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=False)) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), save_db=True, device="cpu")) + pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) + + context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) + context.dump() + + assert os.listdir(db_dir) + + +@pytest.mark.parametrize( + "task_type", + ["multiclass", "multilabel", "description"], +) +def test_dump_modules(dataset, task_type): + db_dir, dump_dir, logs_dir = setup_environment() + search_space = get_search_space(task_type) + + pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space) + + pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_dir=dump_dir, dump_modules=True)) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cpu")) + pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) + + context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) + context.dump() + + assert os.listdir(dump_dir) + + @pytest.mark.parametrize( "task_type", ["multiclass", "multilabel", "description"], From a2e4deac11442f69cc33d31c135b80aaadf4585d Mon Sep 17 00:00:00 2001 From: voorhs Date: Sat, 9 Nov 2024 11:25:45 +0300 Subject: [PATCH 18/21] refactor github actions --- .github/workflows/test-inference.yaml | 40 +++++++++++++++++++ .../workflows/{tests.yaml => test-nodes.yaml} | 4 +- .github/workflows/test-optimization.yaml | 40 +++++++++++++++++++ .github/workflows/unit-tests.yaml | 40 +++++++++++++++++++ .../optimization/pipeline_optimizer.py | 1 + tests/pipeline/test_inference.py | 5 ++- 6 files changed, 127 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/test-inference.yaml rename .github/workflows/{tests.yaml => test-nodes.yaml} (93%) create mode 100644 .github/workflows/test-optimization.yaml create mode 100644 .github/workflows/unit-tests.yaml diff --git a/.github/workflows/test-inference.yaml b/.github/workflows/test-inference.yaml new file mode 100644 index 00000000..a0172287 --- /dev/null +++ b/.github/workflows/test-inference.yaml @@ -0,0 +1,40 @@ +name: integration tests + +on: + push: + branches: + - dev + pull_request: + branches: + - dev + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.10", "3.11", "3.12" ] + include: + - os: windows-latest + python-version: "3.10" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install dependencies + run: | + pip install . 
+ pip install pytest pytest-asyncio + + - name: Run tests + run: | + pytest tests/pipeline/test_inference.py diff --git a/.github/workflows/tests.yaml b/.github/workflows/test-nodes.yaml similarity index 93% rename from .github/workflows/tests.yaml rename to .github/workflows/test-nodes.yaml index ce0a804d..5304057e 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/test-nodes.yaml @@ -1,4 +1,4 @@ -name: Run Tests +name: integration tests on: push: @@ -37,4 +37,4 @@ jobs: - name: Run tests run: | - pytest + pytest tests/nodes diff --git a/.github/workflows/test-optimization.yaml b/.github/workflows/test-optimization.yaml new file mode 100644 index 00000000..43a4e6ed --- /dev/null +++ b/.github/workflows/test-optimization.yaml @@ -0,0 +1,40 @@ +name: integration tests + +on: + push: + branches: + - dev + pull_request: + branches: + - dev + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.10", "3.11", "3.12" ] + include: + - os: windows-latest + python-version: "3.10" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install dependencies + run: | + pip install . + pip install pytest pytest-asyncio + + - name: Run tests + run: | + pytest tests/pipeline/test_optimization.py diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml new file mode 100644 index 00000000..3612d561 --- /dev/null +++ b/.github/workflows/unit-tests.yaml @@ -0,0 +1,40 @@ +name: unit tests + +on: + push: + branches: + - dev + pull_request: + branches: + - dev + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.10", "3.11", "3.12" ] + include: + - os: windows-latest + python-version: "3.10" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install dependencies + run: | + pip install . 
+ pip install pytest pytest-asyncio + + - name: Run tests + run: | + pytest --ignore=tests/nodes --ignore=tests/pipeline diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index 2b53a99b..e4629aee 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -46,6 +46,7 @@ def optimize(self, context: Context) -> None: for node_optimizer in self.nodes: node_optimizer.fit(context) if not context.vector_index_config.save_db: + self._logger.info("removing vector database from file system...") context.vector_index_client.delete_db() def optimize_from_dataset( diff --git a/tests/pipeline/test_inference.py b/tests/pipeline/test_inference.py index a04a69c1..273c205b 100644 --- a/tests/pipeline/test_inference.py +++ b/tests/pipeline/test_inference.py @@ -53,6 +53,7 @@ def test_inference_config(dataset, task_type): assert prediction.shape == (2,) context.dump() + context.vector_index_client.delete_db() @pytest.mark.parametrize( @@ -79,6 +80,7 @@ def test_inference_context(dataset, task_type): assert prediction.shape == (2,) context.dump() + context.vector_index_client.delete_db() @pytest.mark.parametrize( @@ -94,7 +96,7 @@ def test_inference_pipeline_cli(dataset, task_type): pipeline_optimizer.set_config( logging_config := LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_dir=dump_dir, dump_modules=True) ) - pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cuda", save_db=True)) + pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cpu", save_db=True)) pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32)) context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel")) @@ -107,3 +109,4 @@ def test_inference_pipeline_cli(dataset, task_type): log_level="CRITICAL", ) inference_pipeline(config) + context.vector_index_client.delete_db() From c349f18a307e1484c2b64d8b6fc05fb61009d9de Mon Sep 17 00:00:00 2001 From: voorhs Date: Sat, 9 Nov 2024 11:27:42 +0300 Subject: [PATCH 19/21] rename actions --- .github/workflows/test-inference.yaml | 2 +- .github/workflows/test-nodes.yaml | 2 +- .github/workflows/test-optimization.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-inference.yaml b/.github/workflows/test-inference.yaml index a0172287..ca4c45ff 100644 --- a/.github/workflows/test-inference.yaml +++ b/.github/workflows/test-inference.yaml @@ -1,4 +1,4 @@ -name: integration tests +name: test inference on: push: diff --git a/.github/workflows/test-nodes.yaml b/.github/workflows/test-nodes.yaml index 5304057e..99507571 100644 --- a/.github/workflows/test-nodes.yaml +++ b/.github/workflows/test-nodes.yaml @@ -1,4 +1,4 @@ -name: integration tests +name: test nodes on: push: diff --git a/.github/workflows/test-optimization.yaml b/.github/workflows/test-optimization.yaml index 43a4e6ed..ea1cf861 100644 --- a/.github/workflows/test-optimization.yaml +++ b/.github/workflows/test-optimization.yaml @@ -1,4 +1,4 @@ -name: integration tests +name: test optimization on: push: From b26a878238fdc50aaecd9d948a28220ce64315e1 Mon Sep 17 00:00:00 2001 From: voorhs Date: Sat, 9 Nov 2024 12:55:02 +0300 Subject: [PATCH 20/21] fix `model_name` issue --- .../vector_index_client.py | 2 - .../datafiles/default-multiclass-config.yaml | 2 +- autointent/modules/base.py | 3 + 
autointent/modules/retrieval/vectordb.py | 14 ++--- .../scoring/description/description.py | 24 ++++--- autointent/modules/scoring/dnnc/dnnc.py | 20 +++--- autointent/modules/scoring/knn/knn.py | 23 ++++--- autointent/modules/scoring/linear.py | 24 ++++--- autointent/modules/scoring/mlknn/mlknn.py | 23 ++++--- .../nodes/optimization/node_optimizer.py | 4 ++ tests/assets/configs/description.yaml | 2 +- tests/assets/configs/multiclass.yaml | 4 +- tests/assets/configs/multilabel.yaml | 2 +- tests/assets/data/utterances.json | 62 +++++++++++++++++++ tests/modules/prediction/test_treshold.py | 2 +- tests/modules/retrieval/test_vectordb.py | 2 +- tests/modules/scoring/test_description.py | 2 +- tests/modules/scoring/test_knn.py | 2 +- tests/modules/scoring/test_mlknn.py | 2 +- tests/modules/test_regex.py | 2 +- tests/nodes/conftest.py | 2 +- tests/nodes/test_retrieval.py | 2 +- tests/nodes/test_scoring.py | 14 ++--- tests/pipeline/test_inference.py | 4 +- 24 files changed, 163 insertions(+), 80 deletions(-) create mode 100644 tests/assets/data/utterances.json diff --git a/autointent/context/vector_index_client/vector_index_client.py b/autointent/context/vector_index_client/vector_index_client.py index 1cb83801..98551eb2 100644 --- a/autointent/context/vector_index_client/vector_index_client.py +++ b/autointent/context/vector_index_client/vector_index_client.py @@ -12,8 +12,6 @@ class VectorIndexClient: - model_name: str - def __init__( self, device: str, diff --git a/autointent/datafiles/default-multiclass-config.yaml b/autointent/datafiles/default-multiclass-config.yaml index ac26d523..34ca64b3 100644 --- a/autointent/datafiles/default-multiclass-config.yaml +++ b/autointent/datafiles/default-multiclass-config.yaml @@ -5,7 +5,7 @@ nodes: search_space: - module_type: vector_db k: [10] - model_name: + embedder_name: - avsolatorio/GIST-small-Embedding-v0 - infgrad/stella-base-en-v2 - node_type: scoring diff --git a/autointent/modules/base.py b/autointent/modules/base.py index 1b41504b..1aaa7eac 100644 --- a/autointent/modules/base.py +++ b/autointent/modules/base.py @@ -52,3 +52,6 @@ def predict(self, *args: list[str] | npt.NDArray[Any], **kwargs: dict[str, Any]) @abstractmethod def from_context(cls, context: Context, **kwargs: dict[str, Any]) -> Self: pass + + def get_embedder_name(self) -> str | None: + return None diff --git a/autointent/modules/retrieval/vectordb.py b/autointent/modules/retrieval/vectordb.py index 92d641b9..b3b30313 100644 --- a/autointent/modules/retrieval/vectordb.py +++ b/autointent/modules/retrieval/vectordb.py @@ -26,7 +26,7 @@ class VectorDBModule(RetrievalModule): def __init__( self, k: int, - model_name: str, + embedder_name: str, db_dir: str | None = None, device: str = "cpu", batch_size: int = 32, @@ -34,7 +34,7 @@ def __init__( ) -> None: if db_dir is None: db_dir = str(get_db_dir()) - self.model_name = model_name + self.embedder_name = embedder_name self.device = device self.db_dir = db_dir self.batch_size = batch_size @@ -47,11 +47,11 @@ def from_context( cls, context: Context, k: int, - model_name: str, + embedder_name: str, ) -> Self: return cls( k=k, - model_name=model_name, + embedder_name=embedder_name, db_dir=str(context.get_db_dir()), device=context.get_device(), batch_size=context.get_batch_size(), @@ -63,7 +63,7 @@ def fit(self, utterances: list[str], labels: list[LabelType]) -> None: self.device, self.db_dir, embedder_batch_size=self.batch_size, embedder_max_length=self.max_length ) - self.vector_index = vector_index_client.create_index(self.model_name, 
utterances, labels)
+        self.vector_index = vector_index_client.create_index(self.embedder_name, utterances, labels)
 
     def score(self, context: Context, metric_fn: RetrievalMetricFn) -> float:
         labels_pred, _, _ = self.vector_index.query(
@@ -73,7 +73,7 @@ def score(self, context: Context, metric_fn: RetrievalMetricFn) -> float:
         return metric_fn(context.data_handler.labels_test, labels_pred)
 
     def get_assets(self) -> RetrieverArtifact:
-        return RetrieverArtifact(embedder_name=self.model_name)
+        return RetrieverArtifact(embedder_name=self.embedder_name)
 
     def clear_cache(self) -> None:
         self.vector_index.delete()
@@ -101,7 +101,7 @@ def load(self, path: str) -> None:
             embedder_batch_size=self.metadata["batch_size"],
             embedder_max_length=self.metadata["max_length"],
         )
-        self.vector_index = vector_index_client.get_index(self.model_name)
+        self.vector_index = vector_index_client.get_index(self.embedder_name)
 
     def predict(self, utterances: list[str]) -> tuple[list[list[int | list[int]]], list[list[float]], list[list[str]]]:
         """
diff --git a/autointent/modules/scoring/description/description.py b/autointent/modules/scoring/description/description.py
index eba80a8d..903a97c8 100644
--- a/autointent/modules/scoring/description/description.py
+++ b/autointent/modules/scoring/description/description.py
@@ -34,7 +34,7 @@ class DescriptionScorer(ScoringModule):
 
     def __init__(
         self,
-        model_name: str,
+        embedder_name: str,
         db_dir: Path | None = None,
         temperature: float = 1.0,
         device: str = "cpu",
@@ -46,7 +46,7 @@ def __init__(
         self.temperature = temperature
         self.device = device
         self.db_dir = db_dir
-        self.model_name = model_name
+        self.embedder_name = embedder_name
         self.batch_size = batch_size
         self.max_length = max_length
 
@@ -55,23 +55,26 @@ def from_context(
         cls,
         context: Context,
         temperature: float,
-        model_name: str | None = None,
+        embedder_name: str | None = None,
     ) -> Self:
-        if model_name is None:
-            model_name = context.optimization_info.get_best_embedder()
+        if embedder_name is None:
+            embedder_name = context.optimization_info.get_best_embedder()
             precomputed_embeddings = True
         else:
-            precomputed_embeddings = context.vector_index_client.exists(model_name)
+            precomputed_embeddings = context.vector_index_client.exists(embedder_name)
 
         instance = cls(
             temperature=temperature,
             device=context.get_device(),
             db_dir=context.get_db_dir(),
-            model_name=model_name,
+            embedder_name=embedder_name,
         )
         instance.precomputed_embeddings = precomputed_embeddings
         return instance
 
+    def get_embedder_name(self) -> str:
+        return self.embedder_name
+
     def fit(
         self,
         utterances: list[str],
@@ -88,7 +91,7 @@ def fit(
         if self.precomputed_embeddings:
             # this happens only when DescriptionScorer is within Pipeline optimization after RetrievalNode optimization
             vector_index_client = VectorIndexClient(self.device, self.db_dir, self.batch_size, self.max_length)
-            vector_index = vector_index_client.get_index(self.model_name)
+            vector_index = vector_index_client.get_index(self.embedder_name)
             features = vector_index.get_all_embeddings()
             if len(features) != len(utterances):
                 msg = "Vector index mismatches provided utterances"
@@ -96,7 +99,10 @@ def fit(
             embedder = vector_index.embedder
         else:
             embedder = Embedder(
-                device=self.device, model_name=self.model_name, batch_size=self.batch_size, max_length=self.max_length
+                device=self.device,
+                model_name=self.embedder_name,
+                batch_size=self.batch_size,
+                max_length=self.max_length,
             )
             features = embedder.embed(utterances)
 
diff --git a/autointent/modules/scoring/dnnc/dnnc.py b/autointent/modules/scoring/dnnc/dnnc.py
index 950c3247..d6395c0f 100644
--- a/autointent/modules/scoring/dnnc/dnnc.py
+++ b/autointent/modules/scoring/dnnc/dnnc.py
@@ -44,7 +44,7 @@ class DNNCScorer(ScoringModule):
     def __init__(
         self,
         cross_encoder_name: str,
-        search_model_name: str,
+        embedder_name: str,
         k: int,
         db_dir: str | None = None,
         device: str = "cpu",
@@ -56,7 +56,7 @@ def __init__(
             db_dir = str(get_db_dir())
 
         self.cross_encoder_name = cross_encoder_name
-        self.search_model_name = search_model_name
+        self.embedder_name = embedder_name
         self.k = k
         self.train_head = train_head
         self.device = device
@@ -70,18 +70,18 @@ def from_context(
         context: Context,
         cross_encoder_name: str,
         k: int,
-        search_model_name: str | None = None,
+        embedder_name: str | None = None,
         train_head: bool = False,
     ) -> Self:
-        if search_model_name is None:
-            search_model_name = context.optimization_info.get_best_embedder()
+        if embedder_name is None:
+            embedder_name = context.optimization_info.get_best_embedder()
             prebuilt_index = True
         else:
-            prebuilt_index = context.vector_index_client.exists(search_model_name)
+            prebuilt_index = context.vector_index_client.exists(embedder_name)
 
         instance = cls(
             cross_encoder_name=cross_encoder_name,
-            search_model_name=search_model_name,
+            embedder_name=embedder_name,
             k=k,
             train_head=train_head,
             device=context.get_device(),
@@ -101,12 +101,12 @@ def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
 
         if self.prebuilt_index:
             # this happens only when DNNCScorer is within Pipeline optimization after RetrievalNode optimization
-            self.vector_index = vector_index_client.get_index(self.search_model_name)
+            self.vector_index = vector_index_client.get_index(self.embedder_name)
             if len(utterances) != len(self.vector_index.texts):
                 msg = "Vector index mismatches provided utterances"
                 raise ValueError(msg)
         else:
-            self.vector_index = vector_index_client.create_index(self.search_model_name, utterances, labels)
+            self.vector_index = vector_index_client.create_index(self.embedder_name, utterances, labels)
 
         if self.train_head:
             model = CrossEncoderWithLogreg(self.model)
@@ -207,7 +207,7 @@ def load(self, path: str) -> None:
             embedder_batch_size=self.metadata["batch_size"],
             embedder_max_length=self.metadata["max_length"],
         )
-        self.vector_index = vector_index_client.get_index(self.search_model_name)
+        self.vector_index = vector_index_client.get_index(self.embedder_name)
 
         crossencoder_dir = str(dump_dir / self.crossencoder_subdir)
         if self.train_head:
diff --git a/autointent/modules/scoring/knn/knn.py b/autointent/modules/scoring/knn/knn.py
index d6fcd236..6d99ce54 100644
--- a/autointent/modules/scoring/knn/knn.py
+++ b/autointent/modules/scoring/knn/knn.py
@@ -31,7 +31,7 @@ class KNNScorer(ScoringModule):
 
     def __init__(
         self,
-        model_name: str,
+        embedder_name: str,
         k: int,
         weights: WEIGHT_TYPES,
         db_dir: str | None = None,
@@ -51,7 +51,7 @@ def __init__(
         """
         if db_dir is None:
             db_dir = str(get_db_dir())
-        self.model_name = model_name
+        self.embedder_name = embedder_name
         self.k = k
         self.weights = weights
         self.db_dir = db_dir
@@ -65,16 +65,16 @@ def from_context(
         context: Context,
         k: int,
         weights: WEIGHT_TYPES,
-        model_name: str | None = None,
+        embedder_name: str | None = None,
     ) -> Self:
-        if model_name is None:
-            model_name = context.optimization_info.get_best_embedder()
+        if embedder_name is None:
+            embedder_name = context.optimization_info.get_best_embedder()
             prebuilt_index = True
         else:
-            prebuilt_index = context.vector_index_client.exists(model_name)
+            prebuilt_index = context.vector_index_client.exists(embedder_name)
 
         instance = cls(
-            model_name=model_name,
+            embedder_name=embedder_name,
             k=k,
             weights=weights,
             db_dir=str(context.get_db_dir()),
@@ -85,6 +85,9 @@ def from_context(
         instance.prebuilt_index = prebuilt_index
         return instance
 
+    def get_embedder_name(self) -> str:
+        return self.embedder_name
+
     def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
         if isinstance(labels[0], list):
             self.n_classes = len(labels[0])
@@ -96,12 +99,12 @@ def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
 
         if self.prebuilt_index:
             # this happens only after RetrievalNode optimization
-            self._vector_index = vector_index_client.get_index(self.model_name)
+            self._vector_index = vector_index_client.get_index(self.embedder_name)
             if len(utterances) != len(self._vector_index.texts):
                 msg = "Vector index mismatches provided utterances"
                 raise ValueError(msg)
         else:
-            self._vector_index = vector_index_client.create_index(self.model_name, utterances, labels)
+            self._vector_index = vector_index_client.create_index(self.embedder_name, utterances, labels)
 
     def predict(self, utterances: list[str]) -> npt.NDArray[Any]:
         labels, distances, _ = self._vector_index.query(utterances, self.k)
@@ -141,4 +144,4 @@ def load(self, path: str) -> None:
             embedder_batch_size=self.metadata["batch_size"],
             embedder_max_length=self.metadata["max_length"],
         )
-        self._vector_index = vector_index_client.get_index(self.model_name)
+        self._vector_index = vector_index_client.get_index(self.embedder_name)
diff --git a/autointent/modules/scoring/linear.py b/autointent/modules/scoring/linear.py
index 30364d20..d40420b2 100644
--- a/autointent/modules/scoring/linear.py
+++ b/autointent/modules/scoring/linear.py
@@ -48,7 +48,7 @@ class LinearScorer(ScoringModule):
 
     def __init__(
         self,
-        model_name: str,
+        embedder_name: str,
         cv: int = 3,
         n_jobs: int = -1,
         device: str = "cpu",
@@ -60,7 +60,7 @@ def __init__(
         self.n_jobs = n_jobs
         self.device = device
         self.seed = seed
-        self.model_name = model_name
+        self.embedder_name = embedder_name
         self.batch_size = batch_size
         self.max_length = max_length
 
@@ -68,16 +68,16 @@ def __init__(
     def from_context(
         cls,
         context: Context,
-        model_name: str | None = None,
+        embedder_name: str | None = None,
    ) -> Self:
-        if model_name is None:
-            model_name = context.optimization_info.get_best_embedder()
+        if embedder_name is None:
+            embedder_name = context.optimization_info.get_best_embedder()
             precomputed_embeddings = True
         else:
-            precomputed_embeddings = context.vector_index_client.exists(model_name)
+            precomputed_embeddings = context.vector_index_client.exists(embedder_name)
 
         instance = cls(
-            model_name=model_name,
+            embedder_name=embedder_name,
             device=context.get_device(),
             seed=context.seed,
             batch_size=context.get_batch_size(),
@@ -87,6 +87,9 @@ def from_context(
         instance.db_dir = str(context.get_db_dir())
         return instance
 
+    def get_embedder_name(self) -> str:
+        return self.embedder_name
+
     def fit(
         self,
         utterances: list[str],
@@ -97,7 +100,7 @@ def fit(
         if self.precomputed_embeddings:
             # this happens only when LinearScorer is within Pipeline optimization after RetrievalNode optimization
             vector_index_client = VectorIndexClient(self.device, self.db_dir, self.batch_size, self.max_length)
-            vector_index = vector_index_client.get_index(self.model_name)
+            vector_index = vector_index_client.get_index(self.embedder_name)
             features = vector_index.get_all_embeddings()
             if len(features) != len(utterances):
                 msg = "Vector index mismatches provided utterances"
@@ -105,7 +108,7 @@ def fit(
             embedder = vector_index.embedder
         else:
             embedder = Embedder(
-                device=self.device, model_name=self.model_name, batch_size=self.batch_size, max_length=self.max_length
+                device=self.device,
+                model_name=self.embedder_name,
+                batch_size=self.batch_size,
+                max_length=self.max_length,
             )
             features = embedder.embed(utterances)
 
diff --git a/autointent/modules/scoring/mlknn/mlknn.py b/autointent/modules/scoring/mlknn/mlknn.py
index 495a676c..8b4a3025 100644
--- a/autointent/modules/scoring/mlknn/mlknn.py
+++ b/autointent/modules/scoring/mlknn/mlknn.py
@@ -42,7 +42,7 @@ class MLKnnScorer(ScoringModule):
     def __init__(
         self,
         k: int,
-        model_name: str,
+        embedder_name: str,
         db_dir: str | None = None,
         s: float = 1.0,
         ignore_first_neighbours: int = 0,
@@ -53,7 +53,7 @@ def __init__(
         if db_dir is None:
             db_dir = str(get_db_dir())
         self.k = k
-        self.model_name = model_name
+        self.embedder_name = embedder_name
         self.s = s
         self.ignore_first_neighbours = ignore_first_neighbours
         self.db_dir = db_dir
@@ -68,17 +68,17 @@ def from_context(
         k: int,
         s: float = 1.0,
         ignore_first_neighbours: int = 0,
-        model_name: str | None = None,
+        embedder_name: str | None = None,
     ) -> Self:
-        if model_name is None:
-            model_name = context.optimization_info.get_best_embedder()
+        if embedder_name is None:
+            embedder_name = context.optimization_info.get_best_embedder()
             prebuilt_index = True
         else:
-            prebuilt_index = context.vector_index_client.exists(model_name)
+            prebuilt_index = context.vector_index_client.exists(embedder_name)
 
         instance = cls(
             k=k,
-            model_name=model_name,
+            embedder_name=embedder_name,
             s=s,
             ignore_first_neighbours=ignore_first_neighbours,
             db_dir=str(context.get_db_dir()),
@@ -89,6 +89,9 @@ def from_context(
         instance.prebuilt_index = prebuilt_index
         return instance
 
+    def get_embedder_name(self) -> str:
+        return self.embedder_name
+
     def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
         if not isinstance(labels[0], list):
             msg = "mlknn scorer supports only multilabel input"
@@ -100,12 +103,12 @@ def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
 
         if self.prebuilt_index:
             # this happens only when MLKnnScorer is within Pipeline optimization after RetrievalNode optimization
-            self.vector_index = vector_index_client.get_index(self.model_name)
+            self.vector_index = vector_index_client.get_index(self.embedder_name)
             if len(utterances) != len(self.vector_index.texts):
                 msg = "Vector index mismatches provided utterances"
                 raise ValueError(msg)
         else:
-            self.vector_index = vector_index_client.create_index(self.model_name, utterances, labels)
+            self.vector_index = vector_index_client.create_index(self.embedder_name, utterances, labels)
 
         self.features = (
             self.vector_index.embedder.embed(utterances)
@@ -218,4 +221,4 @@ def load(self, path: str) -> None:
             embedder_batch_size=self.metadata["batch_size"],
             embedder_max_length=self.metadata["max_length"],
         )
-        self.vector_index = vector_index_client.get_index(self.model_name)
+        self.vector_index = vector_index_client.get_index(self.embedder_name)
diff --git a/autointent/nodes/optimization/node_optimizer.py b/autointent/nodes/optimization/node_optimizer.py
index 2a483556..40ad41d3 100644
--- a/autointent/nodes/optimization/node_optimizer.py
+++ b/autointent/nodes/optimization/node_optimizer.py
@@ -39,6 +39,10 @@ def fit(self, context: Context) -> None:
             self._logger.debug("initializing %s module...", module_type)
             module = self.node_info.modules_available[module_type].from_context(context, **module_kwargs)
 
+            embedder_name = module.get_embedder_name()
+            if 
embedder_name is not None: + module_kwargs["embedder_name"] = embedder_name + self._logger.debug("optimizing %s module...", module_type) self.module_fit(module, context) diff --git a/tests/assets/configs/description.yaml b/tests/assets/configs/description.yaml index 3cec5470..749b646f 100644 --- a/tests/assets/configs/description.yaml +++ b/tests/assets/configs/description.yaml @@ -4,7 +4,7 @@ nodes: search_space: - module_type: vector_db k: [10] - model_name: + embedder_name: - sentence-transformers/all-MiniLM-L6-v2 - node_type: scoring metric: scoring_roc_auc diff --git a/tests/assets/configs/multiclass.yaml b/tests/assets/configs/multiclass.yaml index b61f1a62..84dc7539 100644 --- a/tests/assets/configs/multiclass.yaml +++ b/tests/assets/configs/multiclass.yaml @@ -4,7 +4,7 @@ nodes: search_space: - module_type: vector_db k: [10] - model_name: + embedder_name: - sentence-transformers/all-MiniLM-L6-v2 - avsolatorio/GIST-small-Embedding-v0 - node_type: scoring @@ -18,8 +18,6 @@ nodes: cross_encoder_name: - cross-encoder/ms-marco-MiniLM-L-6-v2 - avsolatorio/GIST-small-Embedding-v0 - search_model_name: - - sergeyzh/rubert-tiny-turbo k: [1, 3] train_head: [false, true] - node_type: prediction diff --git a/tests/assets/configs/multilabel.yaml b/tests/assets/configs/multilabel.yaml index 7a0358f8..e9d439da 100644 --- a/tests/assets/configs/multilabel.yaml +++ b/tests/assets/configs/multilabel.yaml @@ -4,7 +4,7 @@ nodes: search_space: - module_type: vector_db k: [10] - model_name: + embedder_name: - sentence-transformers/all-MiniLM-L6-v2 - avsolatorio/GIST-small-Embedding-v0 - node_type: scoring diff --git a/tests/assets/data/utterances.json b/tests/assets/data/utterances.json new file mode 100644 index 00000000..4d22a851 --- /dev/null +++ b/tests/assets/data/utterances.json @@ -0,0 +1,62 @@ +[ + "yes", + "can you give me a moderately priced restaurant", + "thank you good bye", + "vegetarian", + "what is the eagles address", + "telephone", + "what area is it in", + "post code", + "thank you and good bye", + "yes im looking for a traditional restaurant in the expensive price range", + "im trying to find a vegetarian restaurant and i dont care regarding the price range", + "does it have a television", + "hi im looking for a pub", + "thank you goodbye", + "yes i would like to know about a restaurant", + "thank you good bye", + "no", + "no no", + "uh what are some other eareas", + "no", + "thank you good bye", + "end of system audio no im looking for a seafood restaurant", + "ok what is the address phone number and price", + "yes please", + "yeah lets have you got anything in the mediterranean food in the area", + "what about any other area", + "no particular venue", + "whats the uh thank you and goodbye", + "next type of food cherry hinton area", + "breathing ok thank you goodbye", + "can you select me another venue", + "next choice", + "ok thank you good bye", + "ok thank you goodbye", + "noise ah hi i am looking for an", + "okay and uh", + "okay thank you goodbye", + "hi im looking for a pub having internet connection and have a tv", + "ah hi ah i am looking for a thia restaurant", + "hi im looking for a mediterranean restaurant in the rosemary area", + "is it in the cheap price range", + "is it in the kings hedge area", + "hi im looking for a contemporary restaurant and is it should be free", + "repeat", + "is it located in addenbrookes area", + "is it in the city center jesus christ this is ridiculous", + "repeat", + "say again", + "repeat", + "repeat", + "can we start again", + "start over", 
+ "not spanish food not spanish food fast", + "start over", + "uh cheap or moderate", + "start over", + "id like to start over please", + "i need a pub not bakers", + "i didnt ask for a moderate price range i need a pub with internet and tv", + "not so expensive price range" +] \ No newline at end of file diff --git a/tests/modules/prediction/test_treshold.py b/tests/modules/prediction/test_treshold.py index 3c602aae..824d3a57 100644 --- a/tests/modules/prediction/test_treshold.py +++ b/tests/modules/prediction/test_treshold.py @@ -11,7 +11,7 @@ def get_fit_data(db_dir, dataset): knn_params = { "k": 3, "weights": "distance", - "model_name": "sergeyzh/rubert-tiny-turbo", + "embedder_name": "sergeyzh/rubert-tiny-turbo", "db_dir": db_dir, } scorer = KNNScorer(**knn_params) diff --git a/tests/modules/retrieval/test_vectordb.py b/tests/modules/retrieval/test_vectordb.py index d3002078..2d72753e 100644 --- a/tests/modules/retrieval/test_vectordb.py +++ b/tests/modules/retrieval/test_vectordb.py @@ -2,6 +2,6 @@ def test_get_assets_returns_correct_artifact(tmp_path): - module = VectorDBModule(k=5, model_name="sergeyzh/rubert-tiny-turbo", db_dir=str(tmp_path)) + module = VectorDBModule(k=5, embedder_name="sergeyzh/rubert-tiny-turbo", db_dir=str(tmp_path)) artifact = module.get_assets() assert artifact.embedder_name == "sergeyzh/rubert-tiny-turbo" diff --git a/tests/modules/scoring/test_description.py b/tests/modules/scoring/test_description.py index 645019b6..c50e0a2e 100644 --- a/tests/modules/scoring/test_description.py +++ b/tests/modules/scoring/test_description.py @@ -17,7 +17,7 @@ def test_description_scorer(dataset, expected_prediction, multilabel): db_dir, dump_dir, logs_dir = setup_environment() data_handler = DataHandler(dataset, force_multilabel=multilabel) - scorer = DescriptionScorer(model_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir, temperature=0.3) + scorer = DescriptionScorer(embedder_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir, temperature=0.3) scorer.fit(data_handler.utterances_train, data_handler.labels_train, data_handler.label_description) assert scorer.description_vectors.shape[0] == len(data_handler.label_description) diff --git a/tests/modules/scoring/test_knn.py b/tests/modules/scoring/test_knn.py index f2be73ab..735adc1e 100644 --- a/tests/modules/scoring/test_knn.py +++ b/tests/modules/scoring/test_knn.py @@ -10,7 +10,7 @@ def test_base_knn(dataset): data_handler = DataHandler(dataset) - scorer = KNNScorer(k=3, weights="distance", model_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir) + scorer = KNNScorer(k=3, weights="distance", embedder_name="sergeyzh/rubert-tiny-turbo", db_dir=db_dir) scorer.fit(data_handler.utterances_train, data_handler.labels_train) predictions = scorer.predict( diff --git a/tests/modules/scoring/test_mlknn.py b/tests/modules/scoring/test_mlknn.py index c76318f1..6cf44386 100644 --- a/tests/modules/scoring/test_mlknn.py +++ b/tests/modules/scoring/test_mlknn.py @@ -24,7 +24,7 @@ def test_base_mlknn(dataset): ) data_handler = DataHandler(dataset, test_dataset, force_multilabel=True) - scorer = MLKnnScorer(db_dir=db_dir, k=3, model_name="sergeyzh/rubert-tiny-turbo") + scorer = MLKnnScorer(db_dir=db_dir, k=3, embedder_name="sergeyzh/rubert-tiny-turbo") scorer.fit(data_handler.utterances_train, data_handler.labels_train) predictions = scorer.predict_labels( diff --git a/tests/modules/test_regex.py b/tests/modules/test_regex.py index e73501cb..bd834d89 100644 --- a/tests/modules/test_regex.py +++ b/tests/modules/test_regex.py @@ -76,7 +76,7 @@ 
def test_base_regex(): db_dir=db_dir, ) - retrieval_params = {"k": 3, "model_name": "sergeyzh/rubert-tiny-turbo"} + retrieval_params = {"k": 3, "embedder_name": "sergeyzh/rubert-tiny-turbo"} vector_db = VectorDBModule(**retrieval_params) vector_db.fit(context) metric_value = vector_db.score(context, retrieval_hit_rate) diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index da221e2e..376d6182 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -26,7 +26,7 @@ def get_retrieval_optimizer(multilabel: bool): "search_space": [ { "k": [10], - "model_name": [ + "embedder_name": [ "sentence-transformers/all-MiniLM-L6-v2", ], "module_type": "vector_db", diff --git a/tests/nodes/test_retrieval.py b/tests/nodes/test_retrieval.py index 5d625692..683ca59d 100644 --- a/tests/nodes/test_retrieval.py +++ b/tests/nodes/test_retrieval.py @@ -57,7 +57,7 @@ def get_retrieval_optimizer(multilabel: bool): "search_space": [ { "k": [10], - "model_name": [ + "embedder_name": [ "sentence-transformers/all-MiniLM-L6-v2", "avsolatorio/GIST-small-Embedding-v0", ], diff --git a/tests/nodes/test_scoring.py b/tests/nodes/test_scoring.py index 8d037df2..70eddf91 100644 --- a/tests/nodes/test_scoring.py +++ b/tests/nodes/test_scoring.py @@ -22,23 +22,23 @@ def test_scoring_multiclass(context, retrieval_optimizer_multiclass): "module_type": "knn", "k": [3], "weights": ["uniform", "distance", "closest"], - "model_name": ["sergeyzh/rubert-tiny-turbo"], + "embedder_name": ["sergeyzh/rubert-tiny-turbo"], }, { "module_type": "linear", - "model_name": ["sergeyzh/rubert-tiny-turbo"], + "embedder_name": ["sergeyzh/rubert-tiny-turbo"], }, { "module_type": "dnnc", "cross_encoder_name": ["cross-encoder/ms-marco-MiniLM-L-6-v2"], - "search_model_name": ["sergeyzh/rubert-tiny-turbo"], + "embedder_name": ["sergeyzh/rubert-tiny-turbo"], "k": [3], "train_head": [False, True], }, { "module_type": "description", "temperature": [1.0, 0.5, 0.1, 0.05], - "model_name": ["sergeyzh/rubert-tiny-turbo"], + "embedder_name": ["sergeyzh/rubert-tiny-turbo"], }, ], } @@ -73,13 +73,13 @@ def test_scoring_multilabel(context, retrieval_optimizer_multilabel): "module_type": "knn", "weights": ["uniform", "distance", "closest"], "k": [3], - "model_name": ["sergeyzh/rubert-tiny-turbo"], + "embedder_name": ["sergeyzh/rubert-tiny-turbo"], }, { "module_type": "linear", - "model_name": ["sergeyzh/rubert-tiny-turbo"], + "embedder_name": ["sergeyzh/rubert-tiny-turbo"], }, - {"module_type": "mlknn", "k": [5], "model_name": ["sergeyzh/rubert-tiny-turbo"]}, + {"module_type": "mlknn", "k": [5], "embedder_name": ["sergeyzh/rubert-tiny-turbo"]}, ], } diff --git a/tests/pipeline/test_inference.py b/tests/pipeline/test_inference.py index 273c205b..d7facbea 100644 --- a/tests/pipeline/test_inference.py +++ b/tests/pipeline/test_inference.py @@ -103,9 +103,9 @@ def test_inference_pipeline_cli(dataset, task_type): context.dump() config = InferenceConfig( - data_path=ires.files("tests.assets.data").joinpath("clinc_subset.json"), + data_path=ires.files("tests.assets.data").joinpath("utterances.json"), source_dir=logging_config.dirpath, - output_path=logging_config.dump_dir, + output_path=logging_config.dump_dir / "predictions.json", log_level="CRITICAL", ) inference_pipeline(config) From 7ccbca2e15ae78ac7445a29d7cc8313f96232247 Mon Sep 17 00:00:00 2001 From: voorhs Date: Mon, 11 Nov 2024 11:35:47 +0300 Subject: [PATCH 21/21] response to review --- autointent/context/context.py | 6 +++--- autointent/modules/base.py | 1 + 
autointent/pipeline/optimization/cli_endpoint.py | 6 +++--- autointent/pipeline/optimization/pipeline_optimizer.py | 4 ++-- tests/nodes/conftest.py | 6 +++--- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/autointent/context/context.py b/autointent/context/context.py index 4e67a614..2e65305b 100644 --- a/autointent/context/context.py +++ b/autointent/context/context.py @@ -32,11 +32,11 @@ def __init__( self.seed = seed self._logger = logging.getLogger(__name__) - def config_logs(self, config: LoggingConfig) -> None: + def configure_logging(self, config: LoggingConfig) -> None: self.logging_config = config self.optimization_info = OptimizationInfo() - def config_vector_index(self, config: VectorIndexConfig, embedder_config: EmbedderConfig | None = None) -> None: + def configure_vector_index(self, config: VectorIndexConfig, embedder_config: EmbedderConfig | None = None) -> None: self.vector_index_config = config if embedder_config is None: embedder_config = EmbedderConfig() @@ -49,7 +49,7 @@ def config_vector_index(self, config: VectorIndexConfig, embedder_config: Embedd self.embedder_config.max_length, ) - def config_data(self, config: DataConfig, augmentation_config: AugmentationConfig | None = None) -> None: + def configure_data(self, config: DataConfig, augmentation_config: AugmentationConfig | None = None) -> None: if augmentation_config is not None: self.augmentation_config = AugmentationConfig() augmenter = DataAugmenter( diff --git a/autointent/modules/base.py b/autointent/modules/base.py index 1aaa7eac..e89ef49c 100644 --- a/autointent/modules/base.py +++ b/autointent/modules/base.py @@ -54,4 +54,5 @@ def from_context(cls, context: Context, **kwargs: dict[str, Any]) -> Self: pass def get_embedder_name(self) -> str | None: + """experimental method""" return None diff --git a/autointent/pipeline/optimization/cli_endpoint.py b/autointent/pipeline/optimization/cli_endpoint.py index a8bfa975..ef2cd70e 100644 --- a/autointent/pipeline/optimization/cli_endpoint.py +++ b/autointent/pipeline/optimization/cli_endpoint.py @@ -19,9 +19,9 @@ def main(cfg: OptimizationConfig) -> None: # create shared objects for a whole pipeline context = Context(cfg.seed) - context.config_logs(cfg.logs) - context.config_vector_index(cfg.vector_index, cfg.embedder) - context.config_data(cfg.data, cfg.augmentation) + context.configure_logging(cfg.logs) + context.configure_vector_index(cfg.vector_index, cfg.embedder) + context.configure_data(cfg.data, cfg.augmentation) # run optimization search_space_config = load_config(cfg.task.search_space_path, context.is_multilabel(), logger) diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py index e4629aee..d85c0747 100644 --- a/autointent/pipeline/optimization/pipeline_optimizer.py +++ b/autointent/pipeline/optimization/pipeline_optimizer.py @@ -54,8 +54,8 @@ def optimize_from_dataset( ) -> Context: context = Context() context.set_datasets(train_data, val_data, force_multilabel) - context.config_logs(self.logging_config) - context.config_vector_index(self.vector_index_config, self.embedder_config) + context.configure_logging(self.logging_config) + context.configure_vector_index(self.vector_index_config, self.embedder_config) self.optimize(context) self.inference_config = context.optimization_info.get_inference_nodes_config() diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index 376d6182..9cde1f71 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -75,9 
+75,9 @@ def context(dataset_path): def _context(multilabel: bool): res = Context() - res.config_data(DataConfig(dataset_path, force_multilabel=multilabel)) - res.config_logs(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir, dump_modules=True)) - res.config_vector_index(VectorIndexConfig(db_dir=db_dir)) + res.configure_data(DataConfig(dataset_path, force_multilabel=multilabel)) + res.configure_logging(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir, dump_modules=True)) + res.configure_vector_index(VectorIndexConfig(db_dir=db_dir)) return res return _context
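
---

A minimal end-to-end sketch of the API this series leaves behind, assembled from the new tests in tests/pipeline/test_inference.py and tests/pipeline/test_optimization.py: modules take embedder_name instead of model_name / search_model_name (patch 20), Context is configured through the configure_* helpers (patch 21), and an inference pipeline can be built straight from the optimization context (patch 17). Everything below is illustrative only, not part of any patch; the search-space values, the paths, the prediction metric name, and the dataset object are placeholders.

from pathlib import Path

from autointent.configs.optimization_cli import EmbedderConfig, LoggingConfig, VectorIndexConfig
from autointent.pipeline.inference import InferencePipeline
from autointent.pipeline.optimization import PipelineOptimizer

# Hypothetical search space in the post-patch-20 format: note the
# `embedder_name` key that replaced `model_name` in the YAML configs.
search_space = {
    "nodes": [
        {
            "node_type": "retrieval",
            "metric": "retrieval_hit_rate",
            "search_space": [
                {
                    "module_type": "vector_db",
                    "k": [10],
                    "embedder_name": ["sentence-transformers/all-MiniLM-L6-v2"],
                },
            ],
        },
        {
            "node_type": "scoring",
            "metric": "scoring_roc_auc",
            "search_space": [
                {
                    "module_type": "knn",
                    "k": [3],
                    "weights": ["distance"],
                    "embedder_name": ["sergeyzh/rubert-tiny-turbo"],
                },
            ],
        },
        {
            "node_type": "prediction",
            "metric": "prediction_accuracy",  # assumed metric name, for illustration
            "search_space": [{"module_type": "argmax"}],
        },
    ],
}

optimizer = PipelineOptimizer.from_dict_config(search_space)
optimizer.set_config(LoggingConfig(dirpath=Path("runs").resolve(), dump_modules=True))
# With save_db=False, optimize() instead removes the vector database from
# the file system once all nodes are fitted (patches 14 and 18).
optimizer.set_config(VectorIndexConfig(db_dir=Path("runs/db").resolve(), device="cpu", save_db=True))
optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))

dataset = ...  # a Dataset instance, prepared the same way the test fixtures do

context = optimizer.optimize_from_dataset(dataset, force_multilabel=False)
pipeline = InferencePipeline.from_config(context.optimization_info.get_inference_nodes_config())
print(pipeline.predict(["hello world"]))

The get_embedder_name hook added in patch 20 appears to exist so that the resolved embedder ends up in the logged trial parameters even when a scoring module inherited it from the retrieval stage rather than from its own search space.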