Add synthetic test data integration test utils, and use them for loss value decrease tests. #2789

Merged 3 commits on Nov 23, 2022
71 changes: 71 additions & 0 deletions tests/integration_tests/synthetic_test_data.py
@@ -0,0 +1,71 @@
"""Utilities for producing synthetic test data that is convergence-friendly."""

Comment on lines +1 to +2
Contributor: Should we move this into integration_tests/utils.py? Alternatively, we can rename this to util_synthetic_test_data.py. I'm personally in favor of the former.

Contributor (author): Since this is a self-encapsulated set of library functions, I'd actually prefer to keep it in a separate file, especially since integration_tests/utils.py is already quite large.

from collections import namedtuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42
NUMBER_OBSERVATIONS = 200

GeneratedData = namedtuple("GeneratedData", "train_df validation_df test_df")


def get_feature_configs():
    input_features = [
        {"name": "x", "type": "number"},
    ]
    output_features = [
        {
            "name": "y",
            "type": "number",
            "loss": {"type": "mean_squared_error"},
            "decoder": {
                "num_fc_layers": 2,
                "fc_output_size": 64,
            },
        }
    ]

    return input_features, output_features


def get_generated_data():
    # Generates simple training data that guarantees convergence
    # within 30 epochs for a suitable config.

    # generate data
    np.random.seed(RANDOM_SEED)
    x = np.array(range(NUMBER_OBSERVATIONS)).reshape(-1, 1)
    y = 2 * x + 1 + np.random.normal(size=x.shape[0]).reshape(-1, 1)
    raw_df = pd.DataFrame(np.concatenate((x, y), axis=1), columns=["x", "y"])

    # create training data
    train, valid_test = train_test_split(raw_df, train_size=0.7)

    # create validation and test data
    validation, test = train_test_split(valid_test, train_size=0.5)

    return GeneratedData(train, validation, test)


def get_generated_data_for_optimizer():
    # Generates simple training data that guarantees convergence
    # within 30 epochs for a suitable config.

    # generate data
    np.random.seed(RANDOM_SEED)
    x = np.array(range(NUMBER_OBSERVATIONS)).reshape(-1, 1)
    y = 2 * x + 1 + np.random.normal(size=x.shape[0]).reshape(-1, 1)
    raw_df = pd.DataFrame(np.concatenate((x, y), axis=1), columns=["x", "y"])
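    # min-max scale x and y to the [0, 1] range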
raw_df["x"] = (raw_df["x"] - raw_df["x"].min()) / (raw_df["x"].max() - raw_df["x"].min())
raw_df["y"] = (raw_df["y"] - raw_df["y"].min()) / (raw_df["y"].max() - raw_df["y"].min())

# create training data
train, valid_test = train_test_split(raw_df, train_size=0.7)

# create validation and test data
validation, test = train_test_split(valid_test, train_size=0.5)

return GeneratedData(train, validation, test)
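
For reference, a minimal usage sketch (not part of the diff) of how a test can consume these helpers. The test name and trainer settings are illustrative; the import path and helper calls mirror the updated tests below.

from ludwig.api import LudwigModel

from tests.integration_tests import synthetic_test_data


def test_synthetic_data_example(tmpdir):
    # Shared feature configs and pre-split frames come from the new module.
    input_features, output_features = synthetic_test_data.get_feature_configs()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "trainer": {"epochs": 2, "batch_size": 16},
    }
    generated_data = synthetic_test_data.get_generated_data()

    model = LudwigModel(config)
    # Train on the synthetic splits; the frames are plain pandas DataFrames.
    model.train(
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        output_directory=str(tmpdir),
        skip_save_processed_input=True,
    )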
15 changes: 6 additions & 9 deletions tests/integration_tests/test_gbm.py
@@ -11,6 +11,7 @@

from ludwig.api import LudwigModel
from ludwig.constants import INPUT_FEATURES, MODEL_TYPE, OUTPUT_FEATURES, TRAINER
from tests.integration_tests import synthetic_test_data
from tests.integration_tests.utils import binary_feature, category_feature, generate_data, number_feature, text_feature


@@ -267,23 +268,19 @@ def test_hummingbird_conversion_category(vocab_size, tmpdir, local_backend):


def test_loss_decreases(tmpdir, local_backend):
from ludwig.datasets import get_dataset, model_configs_for_dataset
input_features, output_features = synthetic_test_data.get_feature_configs()

default_config = model_configs_for_dataset("adult_census_income")["default"]
config = {
MODEL_TYPE: "gbm",
"input_features": default_config["input_features"][:3], # only use first 3 features
"output_features": default_config["output_features"],
"input_features": input_features,
"output_features": output_features,
TRAINER: {"num_boost_round": 2, "boosting_rounds_per_checkpoint": 1},
}

df = get_dataset("adult_census_income").load(split=False)
# reduce dataset size to speed up test
df = df.loc[:10, [f["name"] for f in config["input_features"] + config["output_features"]]]

generated_data = synthetic_test_data.get_generated_data_for_optimizer()
model = LudwigModel(config, backend=local_backend)
train_stats, _, _ = model.train(
dataset=df,
dataset=generated_data.train_df,
output_directory=tmpdir,
skip_save_processed_input=True,
skip_save_progress=True,
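
The rest of test_loss_decreases is collapsed in this view. As a hedged sketch of the kind of check it performs (the exact train_stats access path below is an assumption, not a quote from the PR): the test reads the per-checkpoint training loss series out of train_stats and asserts that the loss after the final boosting round is no greater than the loss after the first one.

    # Hedged sketch; assumes train_stats behaves like a nested dict of loss series.
    train_losses = train_stats["training"]["y"]["loss"]
    assert train_losses[-1] <= train_losses[0]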
99 changes: 20 additions & 79 deletions tests/integration_tests/test_model_training_options.py
@@ -2,13 +2,11 @@
import logging
import os.path
import re
from collections import namedtuple

import numpy as np
import pandas as pd
import pytest
import torch
from sklearn.model_selection import train_test_split

from ludwig import globals as global_vars
from ludwig.api import LudwigModel
@@ -31,80 +29,17 @@
from ludwig.utils.data_utils import load_json, replace_file_extension
from ludwig.utils.misc_utils import get_from_registry
from ludwig.utils.package_utils import LazyLoader
from tests.integration_tests import synthetic_test_data
from tests.integration_tests.utils import category_feature, generate_data, LocalTestBackend

mlflow = LazyLoader("mlflow", globals(), "mlflow")

RANDOM_SEED = 42
NUMBER_OBSERVATIONS = 500

GeneratedData = namedtuple("GeneratedData", "train_df validation_df test_df")


def get_feature_configs():
input_features = [
{"name": "x", "type": "number"},
]
output_features = [
{
"name": "y",
"type": "number",
"loss": {"type": "mean_squared_error"},
"decoder": {
"num_fc_layers": 5,
"fc_output_size": 64,
},
}
]

return input_features, output_features


@pytest.fixture(scope="module")
def generated_data():
# function generates simple training data that guarantee convergence
# within 30 epochs for suitable config

# generate data
np.random.seed(RANDOM_SEED)
x = np.array(range(NUMBER_OBSERVATIONS)).reshape(-1, 1)
y = 2 * x + 1 + np.random.normal(size=x.shape[0]).reshape(-1, 1)
raw_df = pd.DataFrame(np.concatenate((x, y), axis=1), columns=["x", "y"])

# create training data
train, valid_test = train_test_split(raw_df, train_size=0.7)

# create validation and test data
validation, test = train_test_split(valid_test, train_size=0.5)

return GeneratedData(train, validation, test)


@pytest.fixture(scope="module")
def generated_data_for_optimizer():
# function generates simple training data that guarantee convergence
# within 30 epochs for suitable config

# generate data
np.random.seed(RANDOM_SEED)
x = np.array(range(NUMBER_OBSERVATIONS)).reshape(-1, 1)
y = 2 * x + 1 + np.random.normal(size=x.shape[0]).reshape(-1, 1)
raw_df = pd.DataFrame(np.concatenate((x, y), axis=1), columns=["x", "y"])
raw_df["x"] = (raw_df["x"] - raw_df["x"].min()) / (raw_df["x"].max() - raw_df["x"].min())
raw_df["y"] = (raw_df["y"] - raw_df["y"].min()) / (raw_df["y"].max() - raw_df["y"].min())

# create training data
train, valid_test = train_test_split(raw_df, train_size=0.7)

# create validation and test data
validation, test = train_test_split(valid_test, train_size=0.5)

return GeneratedData(train, validation, test)


@pytest.mark.parametrize("early_stop", [3, 5])
def test_early_stopping(early_stop, generated_data, tmp_path):
input_features, output_features = get_feature_configs()
def test_early_stopping(early_stop, tmp_path):
input_features, output_features = synthetic_test_data.get_feature_configs()

config = {
"input_features": input_features,
@@ -118,6 +53,7 @@ def test_early_stopping(early_stop, generated_data, tmp_path):
results_dir.mkdir()

# run experiment
generated_data = synthetic_test_data.get_generated_data()
_, _, _, _, output_dir = experiment_cli(
training_set=generated_data.train_df,
validation_set=generated_data.validation_df,
@@ -157,8 +93,8 @@ def test_early_stopping(early_stop, generated_data, tmp_path):

@pytest.mark.parametrize("skip_save_progress", [False])
@pytest.mark.parametrize("skip_save_model", [False, True])
def test_model_progress_save(skip_save_progress, skip_save_model, generated_data, tmp_path):
input_features, output_features = get_feature_configs()
def test_model_progress_save(skip_save_progress, skip_save_model, tmp_path):
input_features, output_features = synthetic_test_data.get_feature_configs()

config = {
"input_features": input_features,
@@ -172,6 +108,7 @@ def test_model_progress_save(skip_save_progress, skip_save_model, generated_data
results_dir.mkdir()

# run experiment
generated_data = synthetic_test_data.get_generated_data()
_, _, _, _, output_dir = experiment_cli(
training_set=generated_data.train_df,
validation_set=generated_data.validation_df,
@@ -202,8 +139,8 @@ def test_model_progress_save(skip_save_progress, skip_save_model, generated_data


@pytest.mark.parametrize("optimizer", ["sgd", "adam"])
def test_resume_training(optimizer, generated_data, tmp_path):
input_features, output_features = get_feature_configs()
def test_resume_training(optimizer, tmp_path):
input_features, output_features = synthetic_test_data.get_feature_configs()
config = {
"input_features": input_features,
"output_features": output_features,
@@ -215,6 +152,7 @@ def test_resume_training(optimizer, generated_data, tmp_path):
results_dir = tmp_path / "results"
results_dir.mkdir()

generated_data = synthetic_test_data.get_generated_data()
_, _, _, _, output_dir1 = experiment_cli(
config,
training_set=generated_data.train_df,
@@ -255,8 +193,8 @@ def test_resume_training(optimizer, generated_data, tmp_path):


@pytest.mark.parametrize("optimizer", ["sgd", "adam"])
def test_resume_training_mlflow(optimizer, generated_data, tmp_path):
input_features, output_features = get_feature_configs()
def test_resume_training_mlflow(optimizer, tmp_path):
input_features, output_features = synthetic_test_data.get_feature_configs()
config = {
"input_features": input_features,
"output_features": output_features,
@@ -270,6 +208,7 @@ def test_resume_training_mlflow(optimizer, generated_data, tmp_path):
mlflow_uri = f"file://{tmp_path}/mlruns"
experiment_name = optimizer + "_experiment"

generated_data = synthetic_test_data.get_generated_data()
_, _, _, _, output_dir1 = experiment_cli(
config,
training_set=generated_data.train_df,
@@ -297,8 +236,8 @@ def test_resume_training_mlflow(optimizer, generated_data, tmp_path):


@pytest.mark.parametrize("optimizer_type", optimizer_registry)
def test_optimizers(optimizer_type, generated_data_for_optimizer, tmp_path):
input_features, output_features = get_feature_configs()
def test_optimizers(optimizer_type, tmp_path):
input_features, output_features = synthetic_test_data.get_feature_configs()

config = {
"input_features": input_features,
@@ -318,8 +257,9 @@ def test_optimizers(optimizer_type, generated_data_for_optimizer, tmp_path):
results_dir.mkdir()

# run experiment
generated_data = synthetic_test_data.get_generated_data_for_optimizer()
train_stats, preprocessed_data, output_directory = model.train(
training_set=generated_data_for_optimizer.train_df,
training_set=generated_data.train_df,
output_directory=str(results_dir),
config=config,
skip_save_processed_input=True,
@@ -337,8 +277,8 @@ def test_optimizers(optimizer_type, generated_data_for_optimizer, tmp_path):
assert train_losses[last_entry - 1] <= train_losses[0]


def test_regularization(generated_data, tmp_path):
input_features, output_features = get_feature_configs()
def test_regularization(tmp_path):
input_features, output_features = synthetic_test_data.get_feature_configs()

config = {
"input_features": input_features,
@@ -356,6 +296,7 @@ def test_regularization(generated_data, tmp_path):
results_dir.mkdir()

regularization_losses = []
generated_data = synthetic_test_data.get_generated_data()
for regularizer in [None, "l1", "l2", "l1_l2"]:
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)