Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GBM: log intermediate progress #2421

Merged
merged 5 commits into from
Aug 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions ludwig/schema/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,15 +320,15 @@ class GBMTrainerConfig(BaseTrainerConfig):

# NOTE: Overwritten here to provide a default value. In many places, we fall back to eval_batch_size if batch_size
# is not specified. GBM does not have a value for batch_size, so we need to specify eval_batch_size here.
eval_batch_size: Union[None, int, str] = schema_utils.OneOfOptionsField(
default=128,
eval_batch_size: Union[None, int, str] = schema_utils.PositiveInteger(
default=1024,
description=("Size of batch to pass to the model for evaluation."),
allow_none=True,
parameter_metadata=TRAINER_METADATA["eval_batch_size"],
field_options=[
schema_utils.PositiveInteger(default=128, description=""),
schema_utils.StringOptions(options=["auto"], default="auto", allow_none=False),
],
)

boosting_round_log_frequency: int = schema_utils.PositiveInteger(
default=10, description="Number of boosting rounds per log of the training progress."
)

# LightGBM core parameters (https://lightgbm.readthedocs.io/en/latest/Parameters.html)
Expand Down
87 changes: 79 additions & 8 deletions ludwig/trainers/trainer_lightgbm.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def iter_feature_metrics(features: LudwigFeatureDict) -> Iterable[Tuple[str, str

@register_trainer("lightgbm_trainer", MODEL_GBM, default=True)
class LightGBMTrainer(BaseTrainer):
TRAIN_KEY = "training"
TRAIN_KEY = "train"
VALID_KEY = "validation"
TEST_KEY = "test"

Expand Down Expand Up @@ -78,6 +78,7 @@ def __init__(
self.boosting_type = config.boosting_type
self.tree_learner = config.tree_learner
self.num_boost_round = config.num_boost_round
self.boosting_round_log_frequency = config.boosting_round_log_frequency
self.max_depth = config.max_depth
self.num_leaves = config.num_leaves
self.min_data_in_leaf = config.min_data_in_leaf
Expand Down Expand Up @@ -246,7 +247,6 @@ def run_evaluation(
tables[COMBINED] = [[COMBINED, LOSS]]

# eval metrics on train
self.eval_batch_size = max(self.eval_batch_size, progress_tracker.batch_size)
if self.evaluate_training_set:
self.evaluation(
training_set, "train", progress_tracker.train_metrics, tables, self.eval_batch_size, progress_tracker
Expand Down Expand Up @@ -322,12 +322,58 @@ def run_evaluation(
# Trigger eval end callback after any model weights save for complete checkpoint
self.callback(lambda c: c.on_eval_end(self, progress_tracker, save_path))

def _train(
def _train_loop(
self,
params: Dict[str, Any],
lgb_train: lgb.Dataset,
eval_sets: List[lgb.Dataset],
eval_names: List[str],
progress_tracker: ProgressTracker,
save_path: str,
) -> lgb.Booster:
name_to_metrics_log = {
LightGBMTrainer.TRAIN_KEY: progress_tracker.train_metrics,
LightGBMTrainer.VALID_KEY: progress_tracker.validation_metrics,
LightGBMTrainer.TEST_KEY: progress_tracker.test_metrics,
}
tables = OrderedDict()
output_features = self.model.output_features
metrics_names = get_metric_names(output_features)
for output_feature_name, output_feature in output_features.items():
tables[output_feature_name] = [[output_feature_name] + metrics_names[output_feature_name]]
tables[COMBINED] = [[COMBINED, LOSS]]
booster = None

for epoch, steps in enumerate(range(0, self.num_boost_round, self.boosting_round_log_frequency), start=1):
progress_tracker.epoch = epoch

evals_result = {}
booster = self.train_step(
params, lgb_train, eval_sets, eval_names, booster, self.boosting_round_log_frequency, evals_result
)

progress_tracker.steps = steps + self.boosting_round_log_frequency
# log training progress
of_name = self.model.output_features.keys()[0]
for data_name in eval_names:
loss_name = params["metric"][0]
loss = evals_result[data_name][loss_name][-1]
metrics = {of_name: {"Survived": {LOSS: loss}}, COMBINED: {LOSS: loss}}
self.append_metrics(data_name, metrics, name_to_metrics_log[data_name], tables, progress_tracker)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we also logging these metrics to stdout (or could we), similar to how ECD logs intermediate training metrics?

Copy link
Contributor Author

@jppgks jppgks Aug 29, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, LightGBM is already logging the loss to stdout. This PR logs only the loss, but eventually we might want to log other intermediate metrics as well.

self.callback(lambda c: c.on_eval_end(self, progress_tracker, save_path))
self.callback(lambda c: c.on_epoch_end(self, progress_tracker, save_path))

return booster

def train_step(
self,
params: Dict[str, Any],
lgb_train: lgb.Dataset,
eval_sets: List[lgb.Dataset],
eval_names: List[str],
booster: lgb.Booster,
steps_per_epoch: int,
evals_result: Dict,
) -> lgb.Booster:
"""Trains a LightGBM model.

Expand All @@ -343,12 +389,14 @@ def _train(
gbm = lgb.train(
params,
lgb_train,
num_boost_round=self.num_boost_round,
init_model=booster,
num_boost_round=steps_per_epoch,
valid_sets=eval_sets,
valid_names=eval_names,
feature_name=list(self.model.input_features.keys()),
# NOTE: hummingbird does not support categorical features
# categorical_feature=categorical_features,
evals_result=evals_result,
callbacks=[
lgb.early_stopping(stopping_rounds=self.early_stop),
lgb.log_evaluation(),
Expand Down Expand Up @@ -386,7 +434,7 @@ def train(

params = self._construct_lgb_params()

lgb_train, eval_sets, eval_names = self._construct_lgb_datasets(training_set, validation_set)
lgb_train, eval_sets, eval_names = self._construct_lgb_datasets(training_set, validation_set, test_set)

# epoch init
start_time = time.time()
Expand All @@ -397,7 +445,7 @@ def train(
self.callback(lambda c: c.on_epoch_start(self, progress_tracker, save_path))
self.callback(lambda c: c.on_batch_start(self, progress_tracker, save_path))

gbm = self._train(params, lgb_train, eval_sets, eval_names)
gbm = self._train_loop(params, lgb_train, eval_sets, eval_names, progress_tracker, save_path)

self.callback(lambda c: c.on_batch_end(self, progress_tracker, save_path))
# ================ Post Training Epoch ================
Expand Down Expand Up @@ -549,6 +597,7 @@ def _construct_lgb_datasets(
self,
training_set: "Dataset", # noqa: F821
validation_set: Optional["Dataset"] = None, # noqa: F821
test_set: Optional["Dataset"] = None, # noqa: F821
) -> Tuple[lgb.Dataset, List[lgb.Dataset], List[str]]:
X_train = training_set.to_df(self.model.input_features.values())
y_train = training_set.to_df(self.model.output_features.values())
Expand All @@ -569,6 +618,13 @@ def _construct_lgb_datasets(
# TODO(joppe): take X% from train set as validation set
pass

if test_set is not None:
X_test = test_set.to_df(self.model.input_features.values())
y_test = test_set.to_df(self.model.output_features.values())
lgb_test = lgb.Dataset(X_test, label=y_test, reference=lgb_train)
eval_sets.append(lgb_test)
eval_names.append(LightGBMTrainer.TEST_KEY)

return lgb_train, eval_sets, eval_names

def _save(self, save_path: str):
Expand Down Expand Up @@ -670,12 +726,15 @@ def __init__(
def get_schema_cls() -> BaseTrainerConfig:
return GBMTrainerConfig

def _train(
def train_step(
self,
params: Dict[str, Any],
lgb_train: "RayDMatrix", # noqa: F821
eval_sets: List["RayDMatrix"], # noqa: F821
eval_names: List[str],
booster: lgb.Booster,
steps_per_epoch: int,
evals_result: Dict,
) -> lgb.Booster:
"""Trains a LightGBM model using ray.

Expand All @@ -693,10 +752,12 @@ def _train(
gbm = lgb_ray_train(
params,
lgb_train,
num_boost_round=self.num_boost_round,
init_model=booster,
num_boost_round=steps_per_epoch,
valid_sets=eval_sets,
valid_names=eval_names,
feature_name=list(self.model.input_features.keys()),
evals_result=evals_result,
# NOTE: hummingbird does not support categorical features
# categorical_feature=categorical_features,
callbacks=[
Expand Down Expand Up @@ -734,6 +795,7 @@ def _construct_lgb_datasets(
self,
training_set: "RayDataset", # noqa: F821
validation_set: Optional["RayDataset"] = None, # noqa: F821
test_set: Optional["RayDataset"] = None, # noqa: F821
) -> Tuple["RayDMatrix", List["RayDMatrix"], List[str]]: # noqa: F821
"""Prepares Ludwig RayDataset objects for use in LightGBM."""

Expand Down Expand Up @@ -762,4 +824,13 @@ def _construct_lgb_datasets(
eval_sets.append(lgb_val)
eval_names.append(LightGBMTrainer.VALID_KEY)

if test_set is not None:
lgb_test = RayDMatrix(
test_set.ds.map_batches(lambda df: df[feat_cols]),
label=label_col,
distributed=False,
)
eval_sets.append(lgb_test)
eval_names.append(LightGBMTrainer.TEST_KEY)

return lgb_train, eval_sets, eval_names