AutoML libraries that use DatasetProfile instead of DatasetInfo (#2802)
justinxzhao authored Dec 5, 2022
1 parent 9f5dd65 commit ac9e556
Showing 10 changed files with 1,426 additions and 34 deletions.
6 changes: 5 additions & 1 deletion ludwig/automl/__init__.py
@@ -1 +1,5 @@
-from ludwig.automl.automl import auto_train, cli_init_config, create_auto_config, train_with_config  # noqa
+from ludwig.automl.automl import auto_train  # noqa
+from ludwig.automl.automl import cli_init_config  # noqa
+from ludwig.automl.automl import create_auto_config  # noqa
+from ludwig.automl.automl import create_auto_config_with_dataset_profile  # noqa
+from ludwig.automl.automl import train_with_config  # noqa
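
For context, the new entry point is exported at the package root, so callers can generate a config in one call. A minimal sketch (the dataset path and target column name are hypothetical):

    from ludwig.automl import create_auto_config_with_dataset_profile

    # Profile the dataset and derive a single-shot Ludwig config from it.
    config = create_auto_config_with_dataset_profile(target="label", dataset="my_dataset.csv")
    print(config)
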
93 changes: 81 additions & 12 deletions ludwig/automl/automl.py
@@ -19,9 +19,17 @@
 import yaml
 
 from ludwig.api import LudwigModel
-from ludwig.api_annotations import PublicAPI
+from ludwig.api_annotations import DeveloperAPI, PublicAPI
 from ludwig.automl.auto_tune_config import memory_tune_config
-from ludwig.automl.base_config import _create_default_config, _get_reference_configs, DatasetInfo, get_dataset_info
+from ludwig.automl.base_config import (
+    _create_default_config,
+    _get_reference_configs,
+    allocate_experiment_resources,
+    DatasetInfo,
+    get_dataset_info,
+    get_default_automl_hyperopt,
+    get_resource_aware_hyperopt_config,
+)
 from ludwig.backend import Backend, initialize_backend
 from ludwig.constants import (
     AUTOML_DEFAULT_IMAGE_ENCODER,
@@ -30,25 +38,30 @@
     ENCODER,
     HYPEROPT,
     IMAGE,
+    INPUT_FEATURES,
+    OUTPUT_FEATURES,
     TABULAR,
     TEXT,
     TYPE,
 )
 from ludwig.contrib import add_contrib_callback_args
 from ludwig.globals import LUDWIG_VERSION
 from ludwig.hyperopt.run import hyperopt
-from ludwig.utils.automl.ray_utils import _ray_init
-from ludwig.utils.automl.utils import (
-    _add_transfer_config,
-    get_model_type,
-    has_imbalanced_output,
-    set_output_feature_metric,
-)
+from ludwig.profiling import dataset_profile_pb2
+from ludwig.profiling.dataset_profile import (
+    get_column_profile_summaries_from_proto,
+    get_dataset_profile_proto,
+    get_dataset_profile_view,
+)
+from ludwig.profiling.type_inference import get_ludwig_type_map_from_column_profile_summaries
+from ludwig.utils.automl.ray_utils import _ray_init
+from ludwig.utils.automl.utils import _add_transfer_config, get_model_type, set_output_feature_metric
 from ludwig.utils.data_utils import load_dataset, use_credentials
 from ludwig.utils.defaults import default_random_seed
 from ludwig.utils.fs_utils import open_file
 from ludwig.utils.misc_utils import merge_dict
 from ludwig.utils.print_utils import print_ludwig
+from ludwig.utils.types import DataFrame
 
 try:
     import dask.dataframe as dd
@@ -144,6 +157,66 @@ def auto_train(
     return train_with_config(dataset, config, output_directory=output_directory, random_seed=random_seed, **kwargs)
 
 
+@DeveloperAPI
+def create_auto_config_with_dataset_profile(
+    target: str,
+    dataset: Optional[Union[str, DataFrame]] = None,
+    dataset_profile: dataset_profile_pb2.DatasetProfile = None,
+    random_seed: int = default_random_seed,
+    include_hyperopt: bool = False,
+    time_limit_s: Union[int, float] = None,
+    backend: Union[Backend, str] = None,
+) -> dict:
+    """Returns the best single-shot Ludwig config given a Ludwig dataset or dataset profile.
+
+    If only the dataset is provided, then a new profile is computed.
+    Only one of the dataset or dataset_profile should be specified, not both.
+
+    This function is intended to eventually replace create_auto_config().
+    """
+    if dataset is None and dataset_profile is None:
+        raise ValueError("Please specify either a dataset or a dataset_profile.")
+    if dataset is not None and dataset_profile is not None:
+        raise ValueError("Please specify either a dataset or a dataset_profile. It is an error to specify both.")
+
+    # Get the dataset profile.
+    if dataset_profile is None:
+        dataset_profile = get_dataset_profile_proto(get_dataset_profile_view(dataset))
+
+    # Use the dataset profile to get Ludwig types.
+    ludwig_type_map = get_ludwig_type_map_from_column_profile_summaries(
+        get_column_profile_summaries_from_proto(dataset_profile)
+    )
+
+    # Add features along with their profiled types.
+    automl_config = {}
+    automl_config[INPUT_FEATURES] = []
+    automl_config[OUTPUT_FEATURES] = []
+    for feature_name, ludwig_type in ludwig_type_map.items():
+        if feature_name == target:
+            automl_config[OUTPUT_FEATURES].append({"name": feature_name, "type": ludwig_type})
+        else:
+            automl_config[INPUT_FEATURES].append({"name": feature_name, "type": ludwig_type})
+
+    # Set the combiner to tabnet, by default.
automl_config.get("combiner", {})[TYPE] = "tabnet"

+    # Add hyperopt, if desired.
+    if include_hyperopt:
+        automl_config[HYPEROPT] = get_default_automl_hyperopt()
+
+    # Merge resource-sensitive settings.
+    backend = initialize_backend(backend)
+    resources = backend.get_available_resources()
+    experiment_resources = allocate_experiment_resources(resources)
+    automl_config = merge_dict(
+        automl_config, get_resource_aware_hyperopt_config(experiment_resources, time_limit_s, random_seed)
+    )
+
+    # TODO: Adjust preprocessing parameters according to output feature imbalance.
+    return automl_config
+
+
 @PublicAPI
 def create_auto_config(
     dataset: Union[str, pd.DataFrame, dd.core.DataFrame, DatasetInfo],
@@ -325,10 +398,6 @@ def _model_select(
             if param in user_config[config_section]:
                 del base_config["hyperopt"]["parameters"][hyperopt_params]
 
-    # check if any binary or category output feature has highly imbalanced minority vs majority values
-    # note: check is done after any relevant user_config has been applied
-    has_imbalanced_output(base_config, features_metadata)
-
     # if single output feature, set relevant metric and goal if not already set
     base_config = set_output_feature_metric(base_config)
45 changes: 43 additions & 2 deletions ludwig/automl/base_config.py
@@ -18,6 +18,7 @@
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
+import yaml
 from dataclasses_json import dataclass_json, LetterCase
 
 from ludwig.api_annotations import DeveloperAPI
@@ -34,6 +35,8 @@
     TEXT,
     TYPE,
 )
+from ludwig.profiling import dataset_profile_pb2
+from ludwig.profiling.dataset_profile import get_dataset_profile_proto, get_dataset_profile_view
 from ludwig.utils.automl.data_source import DataSource, wrap_data_source
 from ludwig.utils.automl.field_info import FieldConfig, FieldInfo, FieldMetadata
 from ludwig.utils.automl.type_inference import infer_type, should_exclude
@@ -93,7 +96,13 @@ def allocate_experiment_resources(resources: Resources) -> dict:
     return experiment_resources
 
 
-def _get_hyperopt_config(experiment_resources: Dict[str, Any], time_limit_s: Union[int, float], random_seed: int):
+def get_resource_aware_hyperopt_config(
+    experiment_resources: Dict[str, Any], time_limit_s: Union[int, float], random_seed: int
+) -> Dict[str, Any]:
+    """Returns a Ludwig config with the hyperopt section populated.
+
+    Hyperopt parameters are tailored to the given resources and time limit.
+    """
     executor = experiment_resources
     executor.update({"time_budget_s": time_limit_s})
     if time_limit_s is not None:
@@ -118,6 +127,33 @@ def _get_stratify_split_config(field_meta: FieldMetadata) -> dict:
     }


+def get_default_automl_hyperopt() -> Dict[str, Any]:
+    """Returns general, default settings for hyperopt.
+
+    For example:
+    - We set a random_state_seed for sample sequence repeatability.
+    - We use an increased reduction_factor to get more pruning/exploration.
+
+    TODO: If settings seem reasonable, consider building this into the hyperopt schema, directly.
+    """
+    return yaml.safe_load(
+        """
+        search_alg:
+            type: hyperopt
+        executor:
+            type: ray
+            num_samples: 10
+            time_budget_s: 3600
+            scheduler:
+                type: async_hyperband
+                time_attr: time_total_s
+                max_t: 3600
+                grace_period: 72
+                reduction_factor: 5
+        """
+    )
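
Since yaml.safe_load simply parses the literal above, the helper returns a plain nested dict that can be attached to a config as its hyperopt section. A small sketch of what a caller sees (key paths taken from the YAML above; `config` is a hypothetical dict under construction):

    hyperopt_defaults = get_default_automl_hyperopt()

    # The scheduler settings are nested under the Ray executor.
    assert hyperopt_defaults["executor"]["scheduler"]["type"] == "async_hyperband"
    assert hyperopt_defaults["executor"]["scheduler"]["reduction_factor"] == 5

    # Attach to a config draft as its hyperopt section.
    config["hyperopt"] = hyperopt_defaults
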


 def _create_default_config(
     dataset_info: DatasetInfo,
     target_name: Union[str, List[str]],
@@ -170,7 +206,7 @@ def _create_default_config(
     # update hyperopt config
     experiment_resources = allocate_experiment_resources(resources)
     base_automl_config = merge_dict(
-        base_automl_config, _get_hyperopt_config(experiment_resources, time_limit_s, random_seed)
+        base_automl_config, get_resource_aware_hyperopt_config(experiment_resources, time_limit_s, random_seed)
     )
 
     # add preprocessing section if single output feature is imbalanced
@@ -243,6 +279,11 @@ def is_field_boolean(source: DataSource, field: str) -> bool:
     return False
 
 
+@DeveloperAPI
+def get_dataset_profile_from_source(source: DataSource) -> dataset_profile_pb2.DatasetProfile:
+    return get_dataset_profile_proto(get_dataset_profile_view(source.df))
+
+
 @DeveloperAPI
 def get_dataset_info_from_source(source: DataSource) -> DatasetInfo:
     """Constructs FieldInfo objects for each feature in dataset. These objects are used for downstream type
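
The new helper mirrors get_dataset_info_from_source but emits the profile proto instead. A minimal sketch of the intended flow, assuming wrap_data_source accepts a pandas DataFrame as the imports in this module suggest (the CSV path is hypothetical):

    import pandas as pd

    from ludwig.automl.base_config import get_dataset_profile_from_source
    from ludwig.utils.automl.data_source import wrap_data_source

    source = wrap_data_source(pd.read_csv("my_dataset.csv"))
    profile = get_dataset_profile_from_source(source)  # a dataset_profile_pb2.DatasetProfile
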
166 changes: 166 additions & 0 deletions ludwig/profiling/proto/dataset_profile_pb2.py

Some generated files are not rendered by default.
