
Commit 3542b07

Add identical markers to identify fit/inferencetime/predict stages (#548)
PGijsbers authored Jun 22, 2023
1 parent 363aedb commit 3542b07
Showing 13 changed files with 54 additions and 6 deletions.
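
Every integration below gains the same three `log.info` markers — "Finished fit in {...}s.", "Finished inference time measurements.", and "Finished predict in {...}s." — each emitted right after the corresponding stage. As a rough sketch of the shared pattern, assuming a `Timer` context manager that records a wall-clock `duration` (the benchmark ships its own `Timer` in its shared utilities; the stand-in below is an assumption for illustration only):

    # Sketch of the stage-marker pattern added to each exec.py.
    # This Timer is a hypothetical stand-in for the benchmark's own class.
    import logging
    import time

    log = logging.getLogger(__name__)

    class Timer:
        def __enter__(self):
            self.start = time.monotonic()
            return self

        def __exit__(self, *exc):
            self.duration = time.monotonic() - self.start

    def run_stages(fit, measure_inference, predict):
        with Timer() as training:
            fit()
        log.info(f"Finished fit in {training.duration}s.")

        measure_inference()
        log.info("Finished inference time measurements.")

        with Timer() as prediction:
            predict()
        log.info(f"Finished predict in {prediction.duration}s.")

Because the messages are identical across frameworks, log output from any run can be scanned for the same three strings (see the parsing sketch after the last file).
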
4 changes: 4 additions & 0 deletions frameworks/AutoGluon/exec.py
@@ -81,6 +81,8 @@ def run(dataset, config):
        **training_params
    )

+   log.info(f"Finished fit in {training.duration}s.")

    # Persist model in memory that is going to be predicting to get correct inference latency
    # max_memory=0.4 will be future default: https://github.com/autogluon/autogluon/pull/3338
    predictor.persist_models('best', max_memory=0.4)
@@ -100,6 +102,7 @@ def inference_time_regression(data: Union[str, pd.DataFrame]):
        infer,
        [(1, test_data.sample(1, random_state=i)) for i in range(100)],
    )
+   log.info(f"Finished inference time measurements.")

    test_data = TabularDataset(test_path)
    with Timer() as predict:
@@ -108,6 +111,7 @@ def inference_time_regression(data: Union[str, pd.DataFrame]):
    predictions = probabilities.idxmax(axis=1).to_numpy()

    prob_labels = probabilities.columns.values.astype(str).tolist() if probabilities is not None else None
+   log.info(f"Finished predict in {predict.duration}s.")

    _leaderboard_extra_info = config.framework_params.get('_leaderboard_extra_info', False)  # whether to get extra model info (very verbose)
    _leaderboard_test = config.framework_params.get('_leaderboard_test', False)  # whether to compute test scores in leaderboard (expensive)

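Most of the hunks in this commit call `measure_inference_times` the same way: a framework-specific `infer` callable plus a list of `(batch_size, data)` pairs, here 100 single-row samples drawn with fixed seeds. The real helper lives in the benchmark's shared code; purely as a hedged sketch of the shape such a helper could take (signature and return type are assumptions, not the benchmark's actual API):

    # Hypothetical sketch of a measure_inference_times-style helper: time one
    # call of `infer` per (batch_size, data) pair and group timings by size.
    import time
    from collections import defaultdict
    from typing import Any, Callable, Iterable, Tuple

    def measure_inference_times(
        infer: Callable[[Any], Any],
        batches: Iterable[Tuple[int, Any]],
    ) -> dict:
        timings = defaultdict(list)
        for batch_size, data in batches:
            start = time.monotonic()
            infer(data)  # e.g. a single-row predict() on the fitted model
            timings[batch_size].append(time.monotonic() - start)
        return dict(timings)

Sampling each row with `random_state=i` keeps the 100 measured rows identical across runs, so latency numbers are comparable between frameworks.
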
3 changes: 3 additions & 0 deletions frameworks/DecisionTree/exec.py
@@ -25,9 +25,12 @@ def run(dataset: Dataset, config: TaskConfig):

    with Timer() as training:
        predictor.fit(X_train, y_train)
+   log.info(f"Finished fit in {training.duration}s.")

    with Timer() as predict:
        predictions = predictor.predict(X_test)
        probabilities = predictor.predict_proba(X_test) if is_classification else None
+   log.info(f"Finished predict in {predict.duration}s.")

    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,

14 changes: 10 additions & 4 deletions frameworks/GAMA/exec.py
@@ -76,8 +76,10 @@ def run(dataset, config):
    gama_automl = estimator(**kwargs)

    X_train, y_train = dataset.train.X, dataset.train.y
-   with Timer() as training_timer:
+   with Timer() as training:
        gama_automl.fit(X_train, y_train)
+   log.info(f"Finished fit in {training.duration}s.")

    log.info('Predicting on the test set.')
    def infer(data: Union[str, pd.DataFrame]):
@@ -92,9 +94,13 @@ def infer(data: Union[str, pd.DataFrame]):
        infer,
        [(1, dataset.test.X.sample(1, random_state=i)) for i in range(100)],
    )
-   with Timer() as predict_timer:
+   log.info(f"Finished inference time measurements.")
+
+   with Timer() as predict:
        X_test, y_test = dataset.test.X, dataset.test.y
        predictions = gama_automl.predict(X_test)
+   log.info(f"Finished predict in {predict.duration}s.")

    probabilities = None
    if is_classification:
@@ -107,8 +113,8 @@ def infer(data: Union[str, pd.DataFrame]):
        truth=y_test,
        target_is_encoded=False,
        models_count=len(gama_automl._final_pop),
-       training_duration=training_timer.duration,
-       predict_duration=predict_timer.duration,
+       training_duration=training.duration,
+       predict_duration=predict.duration,
        inference_times=inference_times,
    )

4 changes: 4 additions & 0 deletions frameworks/H2OAutoML/exec.py
@@ -114,6 +114,8 @@ def run(dataset, config):
    with Timer() as training:
        with monitor:
            aml.train(y=dataset.target.index, training_frame=train)
+   log.info(f"Finished fit in {training.duration}s.")

    if not aml.leader:
        raise FrameworkError("H2O could not produce any model in the requested time.")
@@ -128,9 +130,11 @@ def infer(path: str):
    inference_times = {}
    if config.measure_inference_time:
        inference_times["file"] = measure_inference_times(infer, dataset.inference_subsample_files)
+       log.info(f"Finished inference time measurements.")

    with Timer() as predict:
        preds = aml.predict(test)
+   log.info(f"Finished predict in {predict.duration}s.")

    preds = extract_preds(preds, test, dataset=dataset)
    save_artifacts(aml, dataset=dataset, config=config)

6 changes: 4 additions & 2 deletions frameworks/MLNet/exec.py
@@ -61,6 +61,7 @@ def run(dataset: Dataset, config: TaskConfig):

        with Timer() as training:
            run_cmd(cmd)
+       log.info(f"Finished fit in {training.duration}s.")

        train_result_json = os.path.join(output_dir, '{}.mbconfig'.format(config.fold))
        if not os.path.exists(train_result_json):
@@ -75,8 +76,9 @@ def run(dataset: Dataset, config: TaskConfig):
        # predict
        predict_cmd = (f"{mlnet} predict --task-type {config.type}"
                       f" --model {model_path} --dataset {test_dataset_path} --label-col {dataset.target.name} > {output_prediction_path}")
-       with Timer() as prediction:
+       with Timer() as predict:
            run_cmd(predict_cmd)
+       log.info(f"Finished predict in {predict.duration}s.")
        if config.type == 'classification':
            prediction_df = pd.read_csv(output_prediction_path, dtype={'PredictedLabel': 'object'})

@@ -101,7 +103,7 @@ def run(dataset: Dataset, config: TaskConfig):
        return dict(
            models_count=models_count,
            training_duration=training.duration,
-           predict_duration=prediction.duration,
+           predict_duration=predict.duration,
        )
    finally:
        if 'logs' in artifacts:

1 change: 1 addition & 0 deletions frameworks/MLPlan/exec.py
@@ -79,6 +79,7 @@ def run(dataset, config):

    with Timer() as training:
        run_cmd(cmd, _live_output_=True)
+   log.info(f"Finished fit in {training.duration}s.")

    with open(statistics_file, 'r') as f:
        stats = json.load(f)

5 changes: 5 additions & 0 deletions frameworks/RandomForest/exec.py
@@ -83,10 +83,13 @@ def run(dataset, config):
        else:
            # https://stackoverflow.com/questions/42757892/how-to-use-warm-start/42763502
            rf.n_estimators += step_size
+   log.info(f"Finished fit in {training.duration}s.")

    with Timer() as predict:
        predictions = rf.predict(X_test)
        probabilities = rf.predict_proba(X_test) if is_classification else None
+   log.info(f"Finished predict in {predict.duration}s.")

    def infer(data):
        data = pd.read_parquet(data) if isinstance(data, str) else data
@@ -100,6 +103,8 @@ def infer(data):
        infer,
        [(1, test_data.sample(1, random_state=i)) for i in range(100)],
    )
+   log.info(f"Finished inference time measurements.")

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,

4 changes: 4 additions & 0 deletions frameworks/TPOT/exec.py
@@ -65,6 +65,8 @@ def run(dataset, config):

    with Timer() as training:
        tpot.fit(X_train, y_train)
+   log.info(f"Finished fit in {training.duration}s.")

    def infer(data):
        data = pd.read_parquet(data) if isinstance(data, str) else data
@@ -85,6 +87,7 @@ def infer(data):
            for i in range(100)
        ],
    )
+   log.info(f"Finished inference time measurements.")

    log.info('Predicting on the test set.')
    y_test = dataset.test.y
@@ -99,6 +102,7 @@ def infer(data):
        # does not support `predict_proba` (which one depends on the version).
        probabilities = "predictions"  # encoding is handled by caller in `__init__.py`

+   log.info(f"Finished predict in {predict.duration}s.")
    save_artifacts(tpot, config)

    return result(output_file=config.output_predictions_file,

3 changes: 3 additions & 0 deletions frameworks/autosklearn/exec.py
@@ -139,6 +139,7 @@ def run(dataset, config):
    auto_sklearn = estimator(**constr_params, **training_params)
    with Timer() as training:
        auto_sklearn.fit(X_train, y_train, **fit_extra_params)
+   log.info(f"Finished fit in {training.duration}s.")

    def infer(data: Union[str, pd.DataFrame]):
        test_data = pd.read_parquet(data) if isinstance(data, str) else data
@@ -157,13 +158,15 @@ def sample_one_test_row(seed: int):
        inference_times["df"] = measure_inference_times(
            infer, [(1, sample_one_test_row(seed=i)) for i in range(100)],
        )
+       log.info(f"Finished inference time measurements.")

    # Convert output to strings for classification
    log.info("Predicting on the test set.")
    with Timer() as predict:
        X_test = dataset.test.X if use_pandas else dataset.test.X_enc
        predictions = auto_sklearn.predict(X_test)
        probabilities = auto_sklearn.predict_proba(X_test) if is_classification else None
+   log.info(f"Finished predict in {predict.duration}s.")

    save_artifacts(auto_sklearn, config)

4 changes: 4 additions & 0 deletions frameworks/constantpredictor/exec.py
@@ -27,9 +27,12 @@ def run(dataset: Dataset, config: TaskConfig):

    with Timer() as training:
        predictor.fit(X_train, y_train)
+   log.info(f"Finished fit in {training.duration}s.")

    with Timer() as predict:
        predictions = predictor.predict(X_test)
        probabilities = predictor.predict_proba(X_test) if is_classification else None
+   log.info(f"Finished predict in {predict.duration}s.")

    def infer(data):
        data = pd.read_parquet(data) if isinstance(data, str) else data
@@ -43,6 +46,7 @@ def infer(data):
        infer,
        [(1, test_data.sample(1, random_state=i)) for i in range(100)],
    )
+   log.info(f"Finished inference time measurements.")

    save_predictions(dataset=dataset,
                     output_file=config.output_predictions_file,

4 changes: 4 additions & 0 deletions frameworks/flaml/exec.py
@@ -51,6 +51,7 @@ def run(dataset, config):
            n_jobs=n_jobs,
            log_file_name=flaml_log_file_name,
            time_budget=time_budget, **training_params)
+   log.info(f"Finished fit in {training.duration}s.")

    def infer(data: Union[str, pd.DataFrame]):
        data = pd.read_parquet(data) if isinstance(data, str) else data
@@ -64,6 +65,7 @@ def infer(data: Union[str, pd.DataFrame]):
        infer,
        [(1, dataset.test.X.sample(1, random_state=i)) for i in range(100)],
    )
+   log.info(f"Finished inference time measurements.")

    with Timer() as predict:
        X_test, y_test = dataset.test.X, dataset.test.y.squeeze()
@@ -72,6 +74,8 @@ def infer(data: Union[str, pd.DataFrame]):
    labels = None
    if is_classification:
        labels = aml.classes_ if isinstance(aml.classes_, list) else aml.classes_.tolist()
+   log.info(f"Finished predict in {predict.duration}s.")

    return result(
        output_file=config.output_predictions_file,
        probabilities=probabilities,

5 changes: 5 additions & 0 deletions frameworks/lightautoml/exec.py
@@ -40,6 +40,7 @@ def run(dataset, config):
    log.info("Training...")
    with Timer() as training:
        automl.fit_predict(train_data=df_train, roles={'target': label})
+   log.info(f"Finished fit in {training.duration}s.")

    def infer(data: Union[str, pd.DataFrame]):
        batch = pd.read_parquet(data) if isinstance(data, str) else data
@@ -52,6 +53,8 @@ def infer(data: Union[str, pd.DataFrame]):
        infer,
        [(1, dataset.test.X.sample(1, random_state=i)) for i in range(100)],
    )
+   log.info(f"Finished inference time measurements.")

    log.info("Predicting on the test set...")
    with Timer() as predict:
@@ -81,6 +84,8 @@ def infer(data: Union[str, pd.DataFrame]):

    log.debug(probabilities)
    log.debug(config.output_predictions_file)
+   log.info(f"Finished predict in {predict.duration}s.")

    save_artifacts(automl, config)

3 changes: 3 additions & 0 deletions frameworks/mljarsupervised/exec.py
@@ -58,6 +58,7 @@ def run(dataset, config):

    with Timer() as training:
        automl.fit(X_train, y_train)
+   log.info(f"Finished fit in {training.duration}s.")

    def infer(data: Union[str, pd.DataFrame]):
@@ -71,6 +72,7 @@ def infer(data: Union[str, pd.DataFrame]):
        infer,
        [(1, dataset.test.X.sample(1, random_state=i)) for i in range(100)],
    )
+   log.info(f"Finished inference time measurements.")

    with Timer() as predict:
        X_test, y_test = dataset.test.X, dataset.test.y.squeeze()
@@ -92,6 +94,7 @@ def infer(data: Union[str, pd.DataFrame]):
        probabilities = preds[probabilities_labels].values
    else:
        predictions = preds["prediction"].values
+   log.info(f"Finished predict in {predict.duration}s.")

    # clean the results
    if not config.framework_params.get("_save_artifacts", False):

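The payoff of identical markers: a run log from any of these thirteen frameworks can now be split into fit / inference-measurement / predict stages with the same pattern matching. An illustrative post-processing snippet — not part of this commit, and it only assumes the markers appear verbatim in the captured log:

    # Illustration: recover stage durations from a captured run log by
    # matching the markers this commit adds to every framework.
    import re

    FIT_RE = re.compile(r"Finished fit in (?P<s>[0-9.e+-]+)s\.")
    PREDICT_RE = re.compile(r"Finished predict in (?P<s>[0-9.e+-]+)s\.")
    INFERENCE_MARKER = "Finished inference time measurements."

    def stage_durations(log_text: str) -> dict:
        fit = FIT_RE.search(log_text)
        predict = PREDICT_RE.search(log_text)
        return {
            "fit_s": float(fit["s"]) if fit else None,
            "predict_s": float(predict["s"]) if predict else None,
            "measured_inference": INFERENCE_MARKER in log_text,
        }
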
