deeppavlov · voorhs · Nov 6, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 5, 2024
diff --git a/autointent/context/optimization_info/data_models.py b/autointent/context/optimization_info/data_models.py
@@ -4,6 +4,8 @@
 from numpy.typing import NDArray
 from pydantic import BaseModel, ConfigDict, Field
 
+from autointent.custom_types import NodeType
+
 
 class Artifact(BaseModel): ...
 
@@ -40,7 +42,12 @@ class PredictorArtifact(Artifact):
 
 
 def validate_node_name(value: str) -> str:
-    if value in ["regexp", "retrieval", "scoring", "prediction"]:
+    """Return ``value`` unchanged if it names a known node type.
+
+    Raises:
+        ValueError: if ``value`` is not one of the ``NodeType`` values.
+    """
+    if value in [node_type.value for node_type in NodeType]:
         return value
-    msg = f"Unknown node_type: {value}. Expected one of ['regexp', 'retrieval', 'scoring', 'prediction']"
+    msg = f"Unknown node_type: {value}. Expected one of {[node_type.value for node_type in NodeType]}"
     raise ValueError(msg)

diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py
@@ -4,6 +4,7 @@
 from numpy.typing import NDArray
 
 from autointent.configs.node import InferenceNodeConfig
+from autointent.custom_types import NODE_TYPES, NodeType
 from autointent.logger import get_logger
 
 from .data_models import Artifact, Artifacts, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds
@@ -71,32 +72,28 @@ def _get_best_artifact(self, node_type: str) -> RetrieverArtifact | ScorerArtifa
         return self.artifacts.get_best_artifact(node_type, i_best)
 
     def get_best_embedder(self) -> str:
-        best_retriever_artifact: RetrieverArtifact = self._get_best_artifact(node_type="retrieval")  # type: ignore[assignment]
+        best_retriever_artifact: RetrieverArtifact = self._get_best_artifact(node_type=NodeType.retrieval)  # type: ignore[assignment]
         return best_retriever_artifact.embedder_name
 
     def get_best_test_scores(self) -> NDArray[np.float64] | None:
-        best_scorer_artifact: ScorerArtifact = self._get_best_artifact(node_type="scoring")  # type: ignore[assignment]
+        best_scorer_artifact: ScorerArtifact = self._get_best_artifact(node_type=NodeType.scoring)  # type: ignore[assignment]
         return best_scorer_artifact.test_scores
 
     def get_best_oos_scores(self) -> NDArray[np.float64] | None:
-        best_scorer_artifact: ScorerArtifact = self._get_best_artifact(node_type="scoring")  # type: ignore[assignment]
+        best_scorer_artifact: ScorerArtifact = self._get_best_artifact(node_type=NodeType.scoring)  # type: ignore[assignment]
         return best_scorer_artifact.oos_scores
 
     def dump_evaluation_results(self) -> dict[str, dict[str, list[float]]]:
-        node_wise_metrics = {
-            node_type: self._get_metrics_values(node_type)
-            for node_type in ["regexp", "retrieval", "scoring", "prediction"]
-        }
+        node_wise_metrics = {node_type.value: self._get_metrics_values(node_type) for node_type in NODE_TYPES}
         return {
             "metrics": node_wise_metrics,
             "configs": self.trials.model_dump(),
         }
 
     def get_inference_nodes_config(self) -> list[InferenceNodeConfig]:
-        node_types = ["regexp", "retrieval", "scoring", "prediction"]
-        trial_ids = [self._get_best_trial_idx(node_type) for node_type in node_types]
+        trial_ids = [self._get_best_trial_idx(node_type) for node_type in NODE_TYPES]
         res = []
-        for idx, node_type in zip(trial_ids, node_types, strict=True):
+        for idx, node_type in zip(trial_ids, NODE_TYPES, strict=True):
             if idx is None:
                 continue
             trial = self.trials.get_trial(node_type, idx)

diff --git a/autointent/custom_types.py b/autointent/custom_types.py
@@ -19,3 +19,17 @@ class LogLevel(Enum):
 
 class BaseMetadataDict(TypedDict):
     pass
+
+
+class NodeType(str, Enum):
+    """Kinds of pipeline nodes; the ``str`` mixin keeps members equal to their plain-string values."""
+
+    regexp = "regexp"
+    retrieval = "retrieval"
+    scoring = "scoring"
+    prediction = "prediction"
+
+
+# Order matters: consumers iterate this list to build dumped metrics and inference node
+# configs, so keep it regexp -> retrieval -> scoring -> prediction.
+NODE_TYPES = [NodeType.regexp, NodeType.retrieval, NodeType.scoring, NodeType.prediction]
diff --git a/autointent/generation/prompt_scheme.py b/autointent/generation/prompt_scheme.py
@@ -6,8 +6,9 @@
 class PromptDescription(BaseModel):
     text: str = PROMPT_DESCRIPTION
 
+    # field_validator must stay outermost (above classmethod) so pydantic registers the validator.
     @field_validator("text")
     @classmethod
     def check_valid_prompt(cls, value: str) -> str:
         if value.find("{intent_name}") == -1 or value.find("{user_utterances}") == -1:
             text_error = (

diff --git a/autointent/modules/__init__.py b/autointent/modules/__init__.py
@@ -1,3 +1,5 @@
+from typing import TypeVar
+
 from .base import Module
 from .prediction import (
     ArgmaxPredictor,
@@ -10,36 +12,31 @@
 from .retrieval import RetrievalModule, VectorDBModule
 from .scoring import DescriptionScorer, DNNCScorer, KNNScorer, LinearScorer, MLKnnScorer, ScoringModule
 
-RETRIEVAL_MODULES_MULTICLASS: dict[str, type[Module]] = {
-    "vector_db": VectorDBModule,
-}
+T = TypeVar("T", bound=Module)
+
+
+def create_modules_dict(modules: list[type[T]]) -> dict[str, type[T]]:
+    return {module.name: module for module in modules}
+
+
+RETRIEVAL_MODULES_MULTICLASS: dict[str, type[Module]] = create_modules_dict([VectorDBModule])
 
 RETRIEVAL_MODULES_MULTILABEL = RETRIEVAL_MODULES_MULTICLASS
 
-SCORING_MODULES_MULTICLASS: dict[str, type[ScoringModule]] = {
-    "dnnc": DNNCScorer,
-    "knn": KNNScorer,
-    "linear": LinearScorer,
-    "description": DescriptionScorer,
-}
+SCORING_MODULES_MULTICLASS: dict[str, type[ScoringModule]] = create_modules_dict(
+    [DNNCScorer, KNNScorer, LinearScorer, DescriptionScorer]
+)
+
+SCORING_MODULES_MULTILABEL: dict[str, type[ScoringModule]] = create_modules_dict(
+    [MLKnnScorer, LinearScorer, DescriptionScorer]
+)
 
-SCORING_MODULES_MULTILABEL: dict[str, type[ScoringModule]] = {
-    "knn": KNNScorer,
-    "linear": LinearScorer,
-    "mlknn": MLKnnScorer,
-}
+PREDICTION_MODULES_MULTICLASS: dict[str, type[Module]] = create_modules_dict(
+    [ArgmaxPredictor, JinoosPredictor, ThresholdPredictor, TunablePredictor]
+)
 
-PREDICTION_MODULES_MULTICLASS: dict[str, type[Module]] = {
-    "argmax": ArgmaxPredictor,
-    "jinoos": JinoosPredictor,
-    "threshold": ThresholdPredictor,
-    "tunable": TunablePredictor,
-}
+PREDICTION_MODULES_MULTILABEL: dict[str, type[Module]] = create_modules_dict([ThresholdPredictor, TunablePredictor])
 
-PREDICTION_MODULES_MULTILABEL: dict[str, type[Module]] = {
-    "threshold": ThresholdPredictor,
-    "tunable": TunablePredictor,
-}
 __all__ = [
     "Module",
     "ArgmaxPredictor",

diff --git a/autointent/modules/base.py b/autointent/modules/base.py
@@ -11,6 +11,8 @@
 
 
 class Module(ABC):
+    name: str
+
     metadata_dict_name: str = "metadata.json"
     metadata: BaseMetadataDict
 

diff --git a/autointent/modules/prediction/argmax.py b/autointent/modules/prediction/argmax.py
@@ -13,10 +13,11 @@
 
 class ArgmaxPredictor(PredictionModule):
     metadata = {}  # noqa: RUF012
+    name = "argmax"
 
     def __init__(self) -> None:
         pass
 
     @classmethod
     def from_context(cls, context: Context) -> Self:
         return cls()

diff --git a/autointent/modules/prediction/jinoos.py b/autointent/modules/prediction/jinoos.py
@@ -22,6 +22,7 @@ class JinoosPredictorDumpMetadata(BaseMetadataDict):
 
 class JinoosPredictor(PredictionModule):
     thresh: float
+    name = "jinoos"
 
     def __init__(
         self,

diff --git a/autointent/modules/prediction/threshold.py b/autointent/modules/prediction/threshold.py
@@ -27,6 +27,7 @@ class ThresholdPredictor(PredictionModule):
     metadata: ThresholdPredictorDumpMetadata
     multilabel: bool
     tags: list[Tag] | None
+    name = "threshold"
 
     def __init__(
         self,

diff --git a/autointent/modules/prediction/tunable.py b/autointent/modules/prediction/tunable.py
@@ -24,6 +24,8 @@ class TunablePredictorDumpMetadata(BaseMetadataDict):
 
 
 class TunablePredictor(PredictionModule):
+    name = "tunable"
+
     def __init__(
         self,
         n_trials: int = 320,

diff --git a/autointent/modules/regexp.py b/autointent/modules/regexp.py
@@ -19,6 +19,8 @@ class RegexPatternsCompiled(TypedDict):
 
 
 class RegExp(Module):
+    name = "regexp"
+
     def __init__(self, regexp_patterns: list[RegexPatterns]) -> None:
         self.regexp_patterns = regexp_patterns
 

diff --git a/autointent/modules/retrieval/vectordb.py b/autointent/modules/retrieval/vectordb.py
@@ -21,6 +21,7 @@ class VectorDBMetadata(BaseMetadataDict):
 
 class VectorDBModule(RetrievalModule):
     vector_index: VectorIndex
+    name = "vector_db"
 
     def __init__(
         self,

diff --git a/autointent/modules/scoring/description/description.py b/autointent/modules/scoring/description/description.py
@@ -10,7 +10,7 @@
 
 from autointent.context import Context
 from autointent.context.embedder import Embedder
-from autointent.context.vector_index_client import VectorIndexClient
+from autointent.context.vector_index_client import VectorIndex, VectorIndexClient
 from autointent.context.vector_index_client.cache import get_db_dir
 from autointent.custom_types import LabelType
 from autointent.modules.scoring.base import ScoringModule
@@ -29,6 +29,8 @@ class DescriptionScorer(ScoringModule):
     embedder: Embedder
     precomputed_embeddings: bool = False
     embedding_model_subdir: str = "embedding_model"
+    _vector_index: VectorIndex
+    name = "description"
 
     def __init__(
         self,

diff --git a/autointent/modules/scoring/dnnc/dnnc.py b/autointent/modules/scoring/dnnc/dnnc.py
@@ -35,6 +35,8 @@ class DNNCScorer(ScoringModule):
     - inspect batch size of model.predict?
     """
 
+    name = "dnnc"
+
     crossencoder_subdir: str = "crossencoder"
     model: CrossEncoder | CrossEncoderWithLogreg
     prebuilt_index: bool = False

diff --git a/autointent/modules/scoring/knn/knn.py b/autointent/modules/scoring/knn/knn.py
@@ -26,6 +26,7 @@ class KNNScorerDumpMetadata(BaseMetadataDict):
 class KNNScorer(ScoringModule):
     weights: WEIGHT_TYPES
     _vector_index: VectorIndex
+    name = "knn"
     prebuilt_index: bool = False
 
     def __init__(

diff --git a/autointent/modules/scoring/linear.py b/autointent/modules/scoring/linear.py
@@ -44,6 +44,7 @@ class LinearScorer(ScoringModule):
     embedding_model_subdir: str = "embedding_model"
     precomputed_embeddings: bool = False
     db_dir: str
+    name = "linear"
 
     def __init__(
         self,

diff --git a/autointent/modules/scoring/mlknn/mlknn.py b/autointent/modules/scoring/mlknn/mlknn.py
@@ -37,6 +37,7 @@ class MLKnnScorer(ScoringModule):
     arrays_filename: str = "probs.npz"
     metadata: MLKnnScorerDumpMetadata
     prebuilt_index: bool = False
+    name = "mlknn"
 
     def __init__(
         self,

diff --git a/autointent/nodes/nodes_info/__init__.py b/autointent/nodes/nodes_info/__init__.py
@@ -1,12 +1,14 @@
+from autointent.custom_types import NodeType
+
 from .base import NodeInfo
 from .prediction import PredictionNodeInfo
 from .retrieval import RetrievalNodeInfo
 from .scoring import ScoringNodeInfo
 
 NODES_INFO: dict[str, NodeInfo] = {
-    "retrieval": RetrievalNodeInfo(),
-    "scoring": ScoringNodeInfo(),
-    "prediction": PredictionNodeInfo(),
+    NodeType.retrieval: RetrievalNodeInfo(),
+    NodeType.scoring: ScoringNodeInfo(),
+    NodeType.prediction: PredictionNodeInfo(),
 }
 
 __all__ = ["NodeInfo", "PredictionNodeInfo", "RetrievalNodeInfo", "ScoringNodeInfo", "NODES_INFO"]
diff --git a/autointent/nodes/nodes_info/base.py b/autointent/nodes/nodes_info/base.py
@@ -1,11 +1,12 @@
 from collections.abc import Mapping
 from typing import ClassVar
 
+from autointent.custom_types import NodeType
 from autointent.metrics import METRIC_FN
 from autointent.modules import Module
 
 
 class NodeInfo:
     metrics_available: ClassVar[Mapping[str, METRIC_FN]]
     modules_available: ClassVar[Mapping[str, type[Module]]]
-    node_type: str
+    node_type: NodeType
diff --git a/autointent/nodes/nodes_info/prediction.py b/autointent/nodes/nodes_info/prediction.py
@@ -1,6 +1,7 @@
 from collections.abc import Mapping
 from typing import ClassVar
 
+from autointent.custom_types import NodeType
 from autointent.metrics import PREDICTION_METRICS_MULTICLASS, PREDICTION_METRICS_MULTILABEL, PredictionMetricFn
 from autointent.modules import PREDICTION_MODULES_MULTICLASS, PREDICTION_MODULES_MULTILABEL, Module
 
@@ -14,4 +15,4 @@ class PredictionNodeInfo(NodeInfo):
 
     modules_available: ClassVar[dict[str, type[Module]]] = PREDICTION_MODULES_MULTICLASS | PREDICTION_MODULES_MULTILABEL
 
-    node_type = "prediction"
+    node_type = NodeType.prediction
diff --git a/autointent/nodes/nodes_info/regexp.py b/autointent/nodes/nodes_info/regexp.py
@@ -1,6 +1,7 @@
 from collections.abc import Mapping
 from typing import ClassVar
 
+from autointent.custom_types import NodeType
 from autointent.metrics import regexp_partial_accuracy, regexp_partial_precision
 from autointent.metrics.regexp import RegexpMetricFn
 from autointent.modules import Module, RegExp
@@ -15,6 +16,6 @@ class RegExpNode(NodeInfo):
         regexp_partial_precision,
     )
 
-    modules_available: ClassVar[Mapping[str, type[Module]]] = {"regexp": RegExp}
+    modules_available: ClassVar[Mapping[str, type[Module]]] = {RegExp.name: RegExp}
 
-    node_type = "regexp"
+    node_type = NodeType.regexp
diff --git a/autointent/nodes/nodes_info/retrieval.py b/autointent/nodes/nodes_info/retrieval.py
@@ -1,6 +1,7 @@
 from collections.abc import Mapping
 from typing import ClassVar
 
+from autointent.custom_types import NodeType
 from autointent.metrics import (
     RETRIEVAL_METRICS_MULTICLASS,
     RETRIEVAL_METRICS_MULTILABEL,
@@ -20,4 +21,4 @@ class RetrievalNodeInfo(NodeInfo):
         RETRIEVAL_MODULES_MULTICLASS | RETRIEVAL_MODULES_MULTILABEL
     )
 
-    node_type = "retrieval"
+    node_type = NodeType.retrieval
diff --git a/autointent/nodes/nodes_info/scoring.py b/autointent/nodes/nodes_info/scoring.py
@@ -1,6 +1,7 @@
 from collections.abc import Mapping
 from typing import ClassVar
 
+from autointent.custom_types import NodeType
 from autointent.metrics import SCORING_METRICS_MULTICLASS, SCORING_METRICS_MULTILABEL, ScoringMetricFn
 from autointent.modules import SCORING_MODULES_MULTICLASS, SCORING_MODULES_MULTILABEL, ScoringModule
 
@@ -14,4 +15,4 @@ class ScoringNodeInfo(NodeInfo):
         SCORING_MODULES_MULTICLASS | SCORING_MODULES_MULTILABEL
     )
 
-    node_type = "scoring"
+    node_type = NodeType.scoring
diff --git a/autointent/pipeline/inference/inference_pipeline.py b/autointent/pipeline/inference/inference_pipeline.py
@@ -3,7 +3,7 @@
 from hydra.utils import instantiate
 
 from autointent.configs.inference_pipeline import InferencePipelineConfig
-from autointent.custom_types import LabelType
+from autointent.custom_types import LabelType, NodeType
 from autointent.nodes.inference import InferenceNode
 
 
@@ -16,8 +16,8 @@ def from_dict_config(cls, config: dict[str, Any]) -> "InferencePipeline":
         return instantiate(InferencePipelineConfig, **config)  # type: ignore[no-any-return]
 
     def predict(self, utterances: list[str]) -> list[LabelType]:
-        scores = self.nodes["scoring"].module.predict(utterances)
-        return self.nodes["prediction"].module.predict(scores)  # type: ignore[return-value]
+        scores = self.nodes[NodeType.scoring].module.predict(utterances)
+        return self.nodes[NodeType.prediction].module.predict(scores)  # type: ignore[return-value]
 
     def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
         pass
diff --git a/autointent/pipeline/optimization/pipeline_optimizer.py b/autointent/pipeline/optimization/pipeline_optimizer.py
@@ -9,6 +9,7 @@
 
 from autointent import Context
 from autointent.configs.pipeline_optimizer import PipelineOptimizerConfig
+from autointent.custom_types import NodeType
 from autointent.nodes import NodeOptimizer
 
 from .utils import NumpyEncoder
@@ -70,7 +71,7 @@ def dump(self, logs_dir: str | Path | None) -> None:
             yaml.dump(inference_config, file)
 
 
-def make_report(logs: dict[str, Any], nodes: list[str]) -> str:
+def make_report(logs: dict[str, Any], nodes: list[NodeType]) -> str:
     ids = [np.argmax(logs["metrics"][node]) for node in nodes]
     configs = []
     for i, node in zip(ids, nodes, strict=False):