[release-0.6] Cherry-pick upstream commits #2473

Merged · 3 commits · Sep 11, 2022
1 change: 1 addition & 0 deletions ludwig/constants.py
@@ -41,6 +41,7 @@
 INFER_IMAGE_MAX_WIDTH = "infer_image_max_width"
 INFER_IMAGE_SAMPLE_SIZE = "infer_image_sample_size"
 NUM_CHANNELS = "num_channels"
+CLASS_WEIGHTS = "class_weights"
 LOSS = "loss"
 ROC_AUC = "roc_auc"
 EVAL_LOSS = "eval_loss"
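For context: `CLASS_WEIGHTS` keys the per-class loss weights entry of an output feature's `loss` section, which the 0.4 config transformation later in this diff normalizes. A hypothetical config fragment (the feature name and weights are illustrative, not from this PR):

```python
# Illustrative output feature config; the 0.4 transformation below coerces the
# value under "class_weights" to a list, or to None if it has any other shape.
output_feature = {
    "name": "label",
    "type": "category",
    "loss": {"class_weights": [0.1, 0.9]},  # one weight per class
}
```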
12 changes: 11 additions & 1 deletion ludwig/data/dataset_synthesizer.py
@@ -20,9 +20,10 @@
 import string
 import sys
 import uuid
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 import numpy as np
+import pandas as pd
 import torch
 import torchaudio
 import yaml
@@ -37,8 +38,10 @@
     ENCODER,
     H3,
     IMAGE,
+    INPUT_FEATURES,
     NAME,
     NUMBER,
+    OUTPUT_FEATURES,
     PREPROCESSING,
     SEQUENCE,
     SET,
@@ -157,6 +160,13 @@ def build_feature_parameters(features):
     }


+def build_synthetic_dataset_df(dataset_size: int, config: Dict[str, Any]) -> pd.DataFrame:
+    features = config[INPUT_FEATURES] + config[OUTPUT_FEATURES]
+    df = build_synthetic_dataset(dataset_size, features)
+    data = [next(df) for _ in range(dataset_size + 1)]
+    return pd.DataFrame(data[1:], columns=data[0])
+
+
 def build_synthetic_dataset(dataset_size: int, features: List[dict], outdir: str = "."):
     """Synthesizes a dataset for testing purposes.
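A minimal usage sketch of the new helper (the feature names and types are illustrative; it assumes the synthesizer's per-type defaults are sufficient):

```python
import pandas as pd

from ludwig.data.dataset_synthesizer import build_synthetic_dataset_df

# Illustrative config: any feature types the synthesizer supports work here.
config = {
    "input_features": [
        {"name": "num_1", "type": "number"},
        {"name": "cat_1", "type": "category"},
    ],
    "output_features": [{"name": "binary_1", "type": "binary"}],
}

df = build_synthetic_dataset_df(100, config)  # DataFrame with 100 synthetic rows
assert isinstance(df, pd.DataFrame) and len(df) == 100
```

Note the `dataset_size + 1` and `data[1:]` in the implementation: `build_synthetic_dataset` is a generator whose first yield is the header row, so the helper draws one extra item and uses it as the column labels.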
9 changes: 7 additions & 2 deletions ludwig/data/preprocessing.py
@@ -56,6 +56,7 @@
 from ludwig.features.feature_registries import base_type_registry
 from ludwig.features.feature_utils import compute_feature_hash
 from ludwig.utils import data_utils, strings_utils
+from ludwig.utils.backward_compatibility import upgrade_metadata
 from ludwig.utils.config_utils import merge_config_preprocessing_with_feature_specific_defaults
 from ludwig.utils.data_utils import (
     CACHEABLE_FORMATS,

@@ -1498,9 +1499,13 @@ def shuffle(df):
     return training_set, test_set, validation_set


-def load_metadata(metadata_file_path):
+def load_metadata(metadata_file_path: str) -> Dict[str, Any]:
     logging.info(f"Loading metadata from: {metadata_file_path}")
-    return data_utils.load_json(metadata_file_path)
+    training_set_metadata = data_utils.load_json(metadata_file_path)
+    # TODO(travis): decouple config from training_set_metadata so we don't need to
+    # upgrade it over time.
+    training_set_metadata = upgrade_metadata(training_set_metadata)
+    return training_set_metadata


 def preprocess_for_training(
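The practical effect, sketched for a metadata file written by an older Ludwig version (the path is illustrative):

```python
from ludwig.data.preprocessing import load_metadata

# Illustrative path to metadata saved by a pre-0.6 run.
meta = load_metadata("results/experiment_run/model/training_set_metadata.json")

# Feature entries saved with the deprecated strategies are rewritten on load,
# via the upgrade_metadata call added above:
#   {"preprocessing": {"missing_value_strategy": "backfill"}} -> "bfill"
#   {"preprocessing": {"missing_value_strategy": "pad"}}      -> "ffill"
```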
131 changes: 102 additions & 29 deletions ludwig/utils/backward_compatibility.py
@@ -13,13 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================

 import copy
 import warnings
 from typing import Any, Callable, Dict, List, Union

 from ludwig.constants import (
     AUDIO,
     BIAS,
+    CLASS_WEIGHTS,
     COLUMN,
     CONV_BIAS,
     CONV_USE_BIAS,
@@ -32,6 +33,7 @@
     EXECUTOR,
     FORCE_SPLIT,
     INPUT_FEATURES,
+    LOSS,
     MISSING_VALUE_STRATEGY,
     NAME,
     NUM_SAMPLES,
@@ -55,6 +57,7 @@
 )
 from ludwig.features.feature_registries import base_type_registry
 from ludwig.globals import LUDWIG_VERSION
+from ludwig.utils.metric_utils import TrainerMetric
 from ludwig.utils.misc_utils import merge_dict
 from ludwig.utils.version_transformation import VersionTransformation, VersionTransformationRegistry
@@ -83,7 +86,7 @@ def wrap(fn: Callable[[Dict], Dict]):
     return wrap


-def upgrade_to_latest_version(config: Dict):
+def upgrade_to_latest_version(config: Dict) -> Dict:
     """Updates config from an older version of Ludwig to the current version. If config does not have a
     "ludwig_version" key, all updates are applied.
@@ -98,6 +101,49 @@ def upgrade_to_latest_version(config: Dict):
     )


+def upgrade_model_progress(model_progress: Dict) -> Dict:
+    """Updates model progress info to be compatible with latest ProgressTracker implementation.
+
+    Notably, we convert epoch-based stats to their step-based equivalents and reformat metrics into
+    `TrainerMetric` tuples.
+    """
+    ret = copy.deepcopy(model_progress)
+
+    if "last_improvement_epoch" in ret:
+        ret["last_improvement_steps"] = ret["last_improvement_epoch"] * ret["batch_size"]
+        del ret["last_improvement_epoch"]
+
+    if "last_learning_rate_reduction_epoch" in ret:
+        ret["last_learning_rate_reduction_steps"] = ret["last_learning_rate_reduction_epoch"] * ret["batch_size"]
+        del ret["last_learning_rate_reduction_epoch"]
+
+    if "last_increase_batch_size_epoch" in ret:
+        ret["last_increase_batch_size_steps"] = ret["last_increase_batch_size_epoch"] * ret["batch_size"]
+        del ret["last_increase_batch_size_epoch"]
+
+    if "vali_metrics" in ret:
+        ret["validation_metrics"] = ret["vali_metrics"]
+        del ret["vali_metrics"]
+
+    for metric_group in ("train_metrics", "test_metrics", "validation_metrics"):
+        for tgt in ret[metric_group]:
+            for metric in ret[metric_group][tgt]:
+                if len(ret[metric_group][tgt][metric]) == 0 or isinstance(
+                    ret[metric_group][tgt][metric][0], (tuple, list)
+                ):
+                    continue
+
+                ret[metric_group][tgt][metric] = [
+                    TrainerMetric(i + 1, (i + 1) * ret["batch_size"], val)
+                    for i, val in enumerate(ret[metric_group][tgt][metric])
+                ]
+
+    if "tune_checkpoint_num" not in ret:
+        ret["tune_checkpoint_num"] = 0
+
+    return ret
+
+
 def _traverse_dicts(config: Any, f: Callable[[Dict], None]):
     """Recursively applies function f to every dictionary contained in config.
@@ -112,6 +158,17 @@ def _traverse_dicts(config: Any, f: Callable[[Dict], None]):
             _traverse_dicts(v, f)


+@register_config_transformation("0.4", ["output_features"])
+def update_class_weights_in_features(feature: Dict[str, Any]) -> Dict[str, Any]:
+    if LOSS in feature:
+        class_weights = feature[LOSS].get(CLASS_WEIGHTS, None)
+        if not isinstance(class_weights, list):
+            class_weights = None
+        feature[LOSS][CLASS_WEIGHTS] = class_weights
+
+    return feature
+
+
 @register_config_transformation("0.5")
 def rename_training_to_trainer(config: Dict[str, Any]) -> Dict[str, Any]:
     if TRAINING in config:
@@ -466,35 +523,51 @@ def update_training(config: Dict[str, Any]) -> Dict[str, Any]:

 @register_config_transformation("0.6")
 def upgrade_missing_value_strategy(config: Dict[str, Any]) -> Dict[str, Any]:
-    def __is_old_missing_value_strategy(feature_config: Dict[str, Any]):
-        if PREPROCESSING not in feature_config:
-            return False
-        missing_value_strategy = feature_config.get(PREPROCESSING).get(MISSING_VALUE_STRATEGY, None)
-        if not missing_value_strategy or missing_value_strategy not in ("backfill", "pad"):
-            return False
-        return True
-
-    def __update_old_missing_value_strategy(feature_config: Dict[str, Any]):
-        missing_value_strategy = feature_config.get(PREPROCESSING).get(MISSING_VALUE_STRATEGY)
-        replacement_strategy = "bfill" if missing_value_strategy == "backfill" else "ffill"
-        feature_name = feature_config.get(NAME)
-        warnings.warn(
-            f"Using `{replacement_strategy}` instead of `{missing_value_strategy}` as the missing value strategy"
-            f" for `{feature_name}`. These are identical. `{missing_value_strategy}` will be removed in v0.8",
-            DeprecationWarning,
-        )
-        feature_config[PREPROCESSING].update({MISSING_VALUE_STRATEGY: replacement_strategy})
-
-    for input_feature in config.get(INPUT_FEATURES, {}):
-        if __is_old_missing_value_strategy(input_feature):
-            __update_old_missing_value_strategy(input_feature)
+    for input_feature in config.get(INPUT_FEATURES, []):
+        if _is_old_missing_value_strategy(input_feature):
+            _update_old_missing_value_strategy(input_feature)

-    for output_feature in config.get(OUTPUT_FEATURES, {}):
-        if __is_old_missing_value_strategy(output_feature):
-            __update_old_missing_value_strategy(output_feature)
+    for output_feature in config.get(OUTPUT_FEATURES, []):
+        if _is_old_missing_value_strategy(output_feature):
+            _update_old_missing_value_strategy(output_feature)

     for feature, feature_defaults in config.get(DEFAULTS, {}).items():
-        if __is_old_missing_value_strategy(feature_defaults):
-            __update_old_missing_value_strategy(config.get(DEFAULTS).get(feature))
+        if _is_old_missing_value_strategy(feature_defaults):
+            _update_old_missing_value_strategy(config.get(DEFAULTS).get(feature))

     return config
+
+
+def upgrade_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
+    # TODO(travis): stopgap solution, we should make it so we don't need to do this
+    # by decoupling config and metadata
+    metadata = copy.deepcopy(metadata)
+    _upgrade_metadata_mising_values(metadata)
+    return metadata
+
+
+def _upgrade_metadata_mising_values(metadata: Dict[str, Any]):
+    for k, v in metadata.items():
+        if isinstance(v, dict) and _is_old_missing_value_strategy(v):
+            _update_old_missing_value_strategy(v)
+
+
+def _update_old_missing_value_strategy(feature_config: Dict[str, Any]):
+    missing_value_strategy = feature_config.get(PREPROCESSING).get(MISSING_VALUE_STRATEGY)
+    replacement_strategy = "bfill" if missing_value_strategy == "backfill" else "ffill"
+    feature_name = feature_config.get(NAME)
+    warnings.warn(
+        f"Using `{replacement_strategy}` instead of `{missing_value_strategy}` as the missing value strategy"
+        f" for `{feature_name}`. These are identical. `{missing_value_strategy}` will be removed in v0.8",
+        DeprecationWarning,
+    )
+    feature_config[PREPROCESSING].update({MISSING_VALUE_STRATEGY: replacement_strategy})
+
+
+def _is_old_missing_value_strategy(feature_config: Dict[str, Any]):
+    if PREPROCESSING not in feature_config:
+        return False
+    missing_value_strategy = feature_config.get(PREPROCESSING).get(MISSING_VALUE_STRATEGY, None)
+    if not missing_value_strategy or missing_value_strategy not in ("backfill", "pad"):
+        return False
+    return True
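To make the epoch-to-step conversion concrete, here is a sketch of what `upgrade_model_progress` does to a minimal pre-0.6 progress dict (the field values are illustrative; a real file carries more keys):

```python
from ludwig.utils.backward_compatibility import upgrade_model_progress

old_progress = {
    "batch_size": 128,
    "last_improvement_epoch": 3,
    "vali_metrics": {"combined": {"loss": [0.9, 0.7, 0.6]}},
    "train_metrics": {"combined": {"loss": [1.0, 0.8, 0.7]}},
    "test_metrics": {"combined": {"loss": [1.1, 0.9, 0.8]}},
}

new_progress = upgrade_model_progress(old_progress)

assert new_progress["last_improvement_steps"] == 3 * 128  # epochs * batch_size
# Each bare metric value becomes a TrainerMetric (epoch, step, value) tuple:
assert tuple(new_progress["validation_metrics"]["combined"]["loss"][0]) == (1, 128, 0.9)
assert new_progress["tune_checkpoint_num"] == 0  # backfilled default
```

The step counts are reconstructed as `(i + 1) * batch_size`, mirroring the epoch-to-step conversions above; exact step counts are not recoverable from epoch-based stats.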
7 changes: 7 additions & 0 deletions ludwig/utils/trainer_utils.py
@@ -117,6 +117,11 @@ def save(self, filepath):
     @staticmethod
     def load(filepath):
         loaded = load_json(filepath)
+
+        from ludwig.utils.backward_compatibility import upgrade_model_progress
+
+        loaded = upgrade_model_progress(loaded)
+
         return ProgressTracker(**loaded)

     def log_metrics(self):
@@ -142,6 +147,8 @@ def log_metrics(self):
                    if metrics_tuples:
                        # For logging, get the latest metrics. The second "-1" indexes into the TrainerMetric
                        # namedtuple. The last element of the TrainerMetric namedtuple is the actual metric value.
+                        #
+                        # TODO: when loading an existing model, this loses metric values for all but the last epoch.
                        log_metrics[f"{metrics_dict_name}.{feature_name}.{metric_name}"] = metrics_tuples[-1][-1]

        return log_metrics
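With the hook above, resuming from an older run becomes transparent; a sketch (the path is illustrative):

```python
from ludwig.utils.trainer_utils import ProgressTracker

# Illustrative path to a progress file written by a pre-0.6 trainer.
tracker = ProgressTracker.load("results/experiment_run/model/training_progress.json")
# The loaded dict has already passed through upgrade_model_progress, so the
# step-based fields and TrainerMetric tuples exist regardless of the file's vintage.
```

The import inside `load` is local, presumably to avoid a circular dependency between `trainer_utils` and `backward_compatibility`.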