AutoML libraries that use DatasetProfile instead of DatasetInfo (#2802)
justinxzhao authored Dec 5, 2022
1 parent 9f5dd65 commit ac9e556
Showing 10 changed files with 1,426 additions and 34 deletions.
6 changes: 5 additions & 1 deletion ludwig/automl/__init__.py
@@ -1 +1,5 @@
-from ludwig.automl.automl import auto_train, cli_init_config, create_auto_config, train_with_config  # noqa
+from ludwig.automl.automl import auto_train  # noqa
+from ludwig.automl.automl import cli_init_config  # noqa
+from ludwig.automl.automl import create_auto_config  # noqa
+from ludwig.automl.automl import create_auto_config_with_dataset_profile  # noqa
+from ludwig.automl.automl import train_with_config  # noqa
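
For context, the new entry point is exported at the package root, so callers can generate a config in one call. A minimal sketch (the dataset path and target column name are hypothetical):

    from ludwig.automl import create_auto_config_with_dataset_profile

    # Profile the dataset and derive a single-shot Ludwig config from it.
    config = create_auto_config_with_dataset_profile(target="label", dataset="my_dataset.csv")
    print(config)
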
93 changes: 81 additions & 12 deletions ludwig/automl/automl.py
@@ -19,9 +19,17 @@
 import yaml
 
 from ludwig.api import LudwigModel
-from ludwig.api_annotations import PublicAPI
+from ludwig.api_annotations import DeveloperAPI, PublicAPI
 from ludwig.automl.auto_tune_config import memory_tune_config
-from ludwig.automl.base_config import _create_default_config, _get_reference_configs, DatasetInfo, get_dataset_info
+from ludwig.automl.base_config import (
+    _create_default_config,
+    _get_reference_configs,
+    allocate_experiment_resources,
+    DatasetInfo,
+    get_dataset_info,
+    get_default_automl_hyperopt,
+    get_resource_aware_hyperopt_config,
+)
 from ludwig.backend import Backend, initialize_backend
 from ludwig.constants import (
     AUTOML_DEFAULT_IMAGE_ENCODER,
@@ -30,25 +38,30 @@
     ENCODER,
     HYPEROPT,
     IMAGE,
+    INPUT_FEATURES,
+    OUTPUT_FEATURES,
     TABULAR,
     TEXT,
     TYPE,
 )
 from ludwig.contrib import add_contrib_callback_args
 from ludwig.globals import LUDWIG_VERSION
 from ludwig.hyperopt.run import hyperopt
-from ludwig.utils.automl.ray_utils import _ray_init
-from ludwig.utils.automl.utils import (
-    _add_transfer_config,
-    get_model_type,
-    has_imbalanced_output,
-    set_output_feature_metric,
-)
+from ludwig.profiling import dataset_profile_pb2
+from ludwig.profiling.dataset_profile import (
+    get_column_profile_summaries_from_proto,
+    get_dataset_profile_proto,
+    get_dataset_profile_view,
+)
+from ludwig.profiling.type_inference import get_ludwig_type_map_from_column_profile_summaries
+from ludwig.utils.automl.ray_utils import _ray_init
+from ludwig.utils.automl.utils import _add_transfer_config, get_model_type, set_output_feature_metric
 from ludwig.utils.data_utils import load_dataset, use_credentials
 from ludwig.utils.defaults import default_random_seed
 from ludwig.utils.fs_utils import open_file
 from ludwig.utils.misc_utils import merge_dict
 from ludwig.utils.print_utils import print_ludwig
+from ludwig.utils.types import DataFrame
 
 try:
     import dask.dataframe as dd
@@ -144,6 +157,66 @@ def auto_train(
     return train_with_config(dataset, config, output_directory=output_directory, random_seed=random_seed, **kwargs)
 
 
+@DeveloperAPI
+def create_auto_config_with_dataset_profile(
+    target: str,
+    dataset: Optional[Union[str, DataFrame]] = None,
+    dataset_profile: dataset_profile_pb2.DatasetProfile = None,
+    random_seed: int = default_random_seed,
+    include_hyperopt: bool = False,
+    time_limit_s: Union[int, float] = None,
+    backend: Union[Backend, str] = None,
+) -> dict:
+    """Returns the best single-shot Ludwig config given a Ludwig dataset or dataset profile.
+
+    If only the dataset is provided, then a new profile is computed.
+    Only one of the dataset or dataset_profile should be specified, not both.
+
+    This function is intended to eventually replace create_auto_config().
+    """
+    if dataset is None and dataset_profile is None:
+        raise ValueError("Please specify either a dataset or a dataset_profile.")
+    if dataset is not None and dataset_profile is not None:
+        raise ValueError("Please specify either a dataset or a dataset_profile. It is an error to specify both.")
+
+    # Get the dataset profile.
+    if dataset_profile is None:
+        dataset_profile = get_dataset_profile_proto(get_dataset_profile_view(dataset))
+
+    # Use the dataset profile to get Ludwig types.
+    ludwig_type_map = get_ludwig_type_map_from_column_profile_summaries(
+        get_column_profile_summaries_from_proto(dataset_profile)
+    )
+
+    # Add features along with their profiled types.
+    automl_config = {}
+    automl_config[INPUT_FEATURES] = []
+    automl_config[OUTPUT_FEATURES] = []
+    for feature_name, ludwig_type in ludwig_type_map.items():
+        if feature_name == target:
+            automl_config[OUTPUT_FEATURES].append({"name": feature_name, "type": ludwig_type})
+        else:
+            automl_config[INPUT_FEATURES].append({"name": feature_name, "type": ludwig_type})
+
+    # Set the combiner to tabnet, by default.
automl_config.get("combiner", {})[TYPE] = "tabnet"

+    # Add hyperopt, if desired.
+    if include_hyperopt:
+        automl_config[HYPEROPT] = get_default_automl_hyperopt()
+
+    # Merge resource-sensitive settings.
+    backend = initialize_backend(backend)
+    resources = backend.get_available_resources()
+    experiment_resources = allocate_experiment_resources(resources)
+    automl_config = merge_dict(
+        automl_config, get_resource_aware_hyperopt_config(experiment_resources, time_limit_s, random_seed)
+    )
+
+    # TODO: Adjust preprocessing parameters according to output feature imbalance.
+    return automl_config
+
+
 @PublicAPI
 def create_auto_config(
     dataset: Union[str, pd.DataFrame, dd.core.DataFrame, DatasetInfo],
@@ -325,10 +398,6 @@ def _model_select(
             if param in user_config[config_section]:
                 del base_config["hyperopt"]["parameters"][hyperopt_params]
 
-    # check if any binary or category output feature has highly imbalanced minority vs majority values
-    # note: check is done after any relevant user_config has been applied
-    has_imbalanced_output(base_config, features_metadata)
-
     # if single output feature, set relevant metric and goal if not already set
     base_config = set_output_feature_metric(base_config)
45 changes: 43 additions & 2 deletions ludwig/automl/base_config.py
@@ -18,6 +18,7 @@
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
+import yaml
 from dataclasses_json import dataclass_json, LetterCase
 
 from ludwig.api_annotations import DeveloperAPI
@@ -34,6 +35,8 @@
     TEXT,
     TYPE,
 )
+from ludwig.profiling import dataset_profile_pb2
+from ludwig.profiling.dataset_profile import get_dataset_profile_proto, get_dataset_profile_view
 from ludwig.utils.automl.data_source import DataSource, wrap_data_source
 from ludwig.utils.automl.field_info import FieldConfig, FieldInfo, FieldMetadata
 from ludwig.utils.automl.type_inference import infer_type, should_exclude
@@ -93,7 +96,13 @@ def allocate_experiment_resources(resources: Resources) -> dict:
     return experiment_resources
 
 
-def _get_hyperopt_config(experiment_resources: Dict[str, Any], time_limit_s: Union[int, float], random_seed: int):
+def get_resource_aware_hyperopt_config(
+    experiment_resources: Dict[str, Any], time_limit_s: Union[int, float], random_seed: int
+) -> Dict[str, Any]:
+    """Returns a Ludwig config with the hyperopt section populated.
+
+    Hyperopt parameters are tailored to the given resources and time limit.
+    """
     executor = experiment_resources
     executor.update({"time_budget_s": time_limit_s})
     if time_limit_s is not None:
@@ -118,6 +127,33 @@ def _get_stratify_split_config(field_meta: FieldMetadata) -> dict:
     }


+def get_default_automl_hyperopt() -> Dict[str, Any]:
+    """Returns general, default settings for hyperopt.
+
+    For example:
+    - We set a random_state_seed for sample sequence repeatability.
+    - We use an increased reduction_factor to get more pruning/exploration.
+
+    TODO: If settings seem reasonable, consider building this into the hyperopt schema, directly.
+    """
+    return yaml.safe_load(
+        """
+        search_alg:
+            type: hyperopt
+        executor:
+            type: ray
+            num_samples: 10
+            time_budget_s: 3600
+            scheduler:
+                type: async_hyperband
+                time_attr: time_total_s
+                max_t: 3600
+                grace_period: 72
+                reduction_factor: 5
+        """
+    )
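
Since yaml.safe_load simply parses the literal above, the helper returns a plain nested dict that can be attached to a config as its hyperopt section. A small sketch of what a caller sees (key paths taken from the YAML above; `config` is a hypothetical dict under construction):

    hyperopt_defaults = get_default_automl_hyperopt()

    # The scheduler settings are nested under the Ray executor.
    assert hyperopt_defaults["executor"]["scheduler"]["type"] == "async_hyperband"
    assert hyperopt_defaults["executor"]["scheduler"]["reduction_factor"] == 5

    # Attach to a config draft as its hyperopt section.
    config["hyperopt"] = hyperopt_defaults
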


 def _create_default_config(
     dataset_info: DatasetInfo,
     target_name: Union[str, List[str]],
@@ -170,7 +206,7 @@ def _create_default_config(
     # update hyperopt config
     experiment_resources = allocate_experiment_resources(resources)
     base_automl_config = merge_dict(
-        base_automl_config, _get_hyperopt_config(experiment_resources, time_limit_s, random_seed)
+        base_automl_config, get_resource_aware_hyperopt_config(experiment_resources, time_limit_s, random_seed)
     )
 
     # add preprocessing section if single output feature is imbalanced
@@ -243,6 +279,11 @@ def is_field_boolean(source: DataSource, field: str) -> bool:
     return False
 
 
+@DeveloperAPI
+def get_dataset_profile_from_source(source: DataSource) -> dataset_profile_pb2.DatasetProfile:
+    return get_dataset_profile_proto(get_dataset_profile_view(source.df))
+
+
 @DeveloperAPI
 def get_dataset_info_from_source(source: DataSource) -> DatasetInfo:
     """Constructs FieldInfo objects for each feature in dataset. These objects are used for downstream type
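
The new helper mirrors get_dataset_info_from_source but emits the profile proto instead. A minimal sketch of the intended flow, assuming wrap_data_source accepts a pandas DataFrame as the imports in this module suggest (the CSV path is hypothetical):

    import pandas as pd

    from ludwig.automl.base_config import get_dataset_profile_from_source
    from ludwig.utils.automl.data_source import wrap_data_source

    source = wrap_data_source(pd.read_csv("my_dataset.csv"))
    profile = get_dataset_profile_from_source(source)  # a dataset_profile_pb2.DatasetProfile
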
166 changes: 166 additions & 0 deletions ludwig/profiling/proto/dataset_profile_pb2.py

Some generated files are not rendered by default.
