[run]
# Measure the package this plugin actually ships (its directory and
# entry point are dffml_model_autoh2o) together with its tests.
source =
    dffml_model_autoh2o
    tests
branch = True

[report]
# Lines matching any of these patterns are left out of the coverage report.
exclude_lines =
    no cov
    no qa
    noqa
    pragma: no cover
    if __name__ == .__main__.:
class AutoH2OModelContext(ModelContext):
    """
    H2O AutoML based model context.

    Trains the parent model's AutoML estimator (``parent.clf``) on records
    pulled from DFFML sources and predicts with ``parent.best_model``.
    """

    def __init__(self, parent):
        super().__init__(parent)
        # Names of the input features, resolved once from the parent config.
        self.features = self._get_feature_names()

    def _get_feature_names(self):
        """Return the configured input feature names as a list."""
        return list(self.parent.config.features.names())

    async def get_test_records(self, sources: SourcesContext):
        """Return a list of every record in ``sources`` yielded by
        ``with_features`` for the configured features."""
        return [
            record async for record in sources.with_features(self.features)
        ]

    async def get_predictions(self, data):
        """
        Predict on ``data`` with the parent's best model.

        ``data`` is converted to an ``h2o.H2OFrame``; the first column of the
        resulting prediction frame is returned as a plain list.
        """
        frame = self.parent.best_model.predict(h2o.H2OFrame(data))
        return frame.as_data_frame().iloc[:, 0].tolist()

    async def get_best_model(self):
        """
        Return the best model of the parent's AutoML estimator.

        Raises:
            ModelNotTrained: if the model has not been trained yet.
        """
        if not self.is_trained:
            raise ModelNotTrained(
                "Train the model first before getting predictions"
            )
        return self.parent.clf.get_best_model()

    async def accuracy_score(self, y_test, predictions):
        """Return the R^2 score of ``predictions`` against ``y_test``."""
        # Import the submodule explicitly: a bare ``import sklearn`` does not
        # guarantee that ``sklearn.metrics`` has been loaded.
        from sklearn.metrics import r2_score

        return r2_score(y_test, predictions)

    async def get_probabilities(self, data):
        """
        Return one placeholder confidence row per row of ``data``.

        No confidence is computed here, so every row is ``[nan]``. There must
        be exactly ``len(data)`` rows because ``predict`` zips this list with
        the records; a single row would truncate the output to one record.
        """
        return [[float("nan")] for _ in range(len(data))]

    async def train(self, sources: Sources):
        """
        Fit the parent's AutoML estimator on all records in ``sources``.

        Records must carry every input feature plus the feature to predict.
        Prints the AutoML leaderboard when ``show_leaderboard`` is set.
        """
        target = self.parent.config.predict.name
        all_data = [
            record.features()
            async for record in sources.with_features(
                self.features + [target]
            )
        ]

        df = pd.DataFrame(all_data)
        self.parent.clf.fit(df[self.features], pd.DataFrame(df[target]))
        estimator = self.parent.clf.estimator
        if self.parent.config.show_leaderboard:
            print(h2o.automl.get_leaderboard(estimator, extra_columns="ALL"))
        # Predict reads parent.best_model, which is otherwise only set when a
        # saved model is loaded. Point it at the freshly trained leader so a
        # predict in the same session does not use a stale (or missing) model.
        leader = getattr(estimator, "leader", None)
        if leader is not None:
            self.parent.best_model = leader
        self.is_trained = True

    async def predict(self, sources: SourcesContext) -> AsyncIterator[Record]:
        """
        Yield each record from ``sources`` with its prediction attached.

        Raises:
            ModelNotTrained: if the model has not been trained yet.
        """
        if not self.is_trained:
            raise ModelNotTrained(
                "Train the model first before getting predictions"
            )
        test_records = await self.get_test_records(sources)
        x_test = pd.DataFrame(
            [record.features(self.features) for record in test_records]
        )
        predictions = await self.get_predictions(x_test)
        probability = await self.get_probabilities(x_test)
        target = self.parent.config.predict.name
        for record, predict, prob in zip(
            test_records, predictions, probability
        ):
            record.predicted(target, predict, max(prob))
            yield record
+ + This is AutoML, it will train and tune hyperparameters from a list of models, + and return the best model. + + Implemented using ``H2O``'s AutoML Python API (https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html). + + In this version, you must specify the ML task that you wish to perform (either "regression" or "classification"). + + Here, we will show a small example using regression. First we create the training and testing datasets: + + **train.csv** + + .. code-block:: + :test: + :filepath: train.csv + + Feature1,Feature2,TARGET + 0.93,0.68,3.89 + 0.24,0.42,1.75 + 0.36,0.68,2.75 + 0.53,0.31,2.00 + 0.29,0.25,1.32 + 0.29,0.52,2.14 + + **test.csv** + + .. code-block:: + :test: + :filepath: test.csv + + Feature1,Feature2,TARGET + 0.57,0.84,3.65 + 0.95,0.19,2.46 + 0.23,0.15,0.93 + + Train the model + + .. code-block:: console + :test: + + $ dffml train \ + -model h2o \ + -model-predict TARGET:float:1 \ + -model-clstype int \ + -sources f=csv \ + -source-filename train.csv \ + -model-features \ + Feature1:float:1 \ + Feature2:float:1 \ + -model-location tempdir \ + -log debug + + Assess the accuracy + + .. code-block:: console + :test: + + $ dffml accuracy \ + -model h2o \ + -model-predict TARGET:float:1 \ + -model-location tempdir \ + -features TARGET:float:1 \ + -sources f=csv \ + -source-filename test.csv \ + -model-features \ + Feature1:float:1 \ + Feature2:float:1 \ + -scorer mse \ + -log critical + 0.9961211434899032 + + Make a file containing the data to predict on + + **predict.csv** + + .. code-block:: + :test: + :filepath: predict.csv + + Feature1,Feature2 + 0.57,0.84 + + Make a prediction + + .. 
async def __aexit__(self, exc_type, exc_value, traceback):
    """
    Save the AutoML leader (if any) as a MOJO, then exit the parent context.

    Nothing is saved when no estimator exists or it has no leader. The
    parent's ``__aexit__`` always runs, even if saving the model raises.
    """
    try:
        # self.clf starts out as None in __init__; guard against exiting
        # before load_model() has replaced it.
        estimator = self.clf.estimator if self.clf is not None else None
        if estimator and estimator.leader:
            estimator.leader.save_mojo(str(self.path))
    finally:
        await super().__aexit__(exc_type, exc_value, traceback)
@config
class AutoH2OConfig:
    # Configuration for the ``h2o`` AutoML model. Apart from ``features``,
    # ``predict``, ``location``, ``task`` and ``show_leaderboard``, each
    # option is passed by the same name to the H2O AutoML estimator.
    features: Features
    predict: Feature = field("Feature to predict")
    location: Path = field("Location where state should be saved")
    # TO-DO: split into separate regression/classification models
    task: str = field(
        "Task to perform, possible values are `classification`, `regression`",
        default="regression",
    )
    max_runtime_secs: int = field(
        "The maximum time that the AutoML process will run", default=0
    )
    max_models: int = field(
        "Maximum number of models to build in AutoML run", default=10
    )
    nfolds: int = field(
        "Number of folds for k-fold cross-validation", default=5
    )
    # Only passed to the classifier (see the model's load_model).
    balance_classes: bool = field(
        "Oversampling on minority classes should be performed or not",
        default=False,
    )
    # A relative size, so it is a float: H2O documents a default of 5.0 and
    # allows values below 1.0, which an ``int`` field could not express.
    max_after_balance_size: float = field(
        "Maximum relative size of training set after performing oversampling",
        default=5.0,
    )
    max_runtime_secs_per_model: int = field(
        "Maximum time to train individual model in AutoML", default=0
    )
    stopping_metric: str = field(
        "Metric used for stopping criteria", default="AUTO"
    )
    stopping_tolerance: float = field(
        "Specifies the relative tolerance for the metric-based stopping",
        default=0.001,
    )
    stopping_rounds: int = field(
        "Stop training when metric doesn't improve max of stopping_rounds",
        default=3,
    )
    sort_metric: str = field(
        "Metric used to sort the leaderboard", default="AUTO"
    )
    exclude_algos: List[str] = field(
        "Algorithm to skip during training", default=None
    )
    include_algos: List[str] = field(
        "Algorithm to be used during training", default=None
    )
    verbosity: str = field("Print the backend messages", default=None)
    show_leaderboard: bool = field(
        "Print the leaderboard after building the models in AutoML",
        default=True,
    )
# Download the 5k-row HIGGS sample that predict.sh reads as its source.
# The former `sed -i "" *.csv` step was dropped: it has an empty script, so it
# was a no-op with GNU sed and a hard error with BSD/macOS sed (which takes ""
# as the backup suffix and the first CSV name as the script).
wget --user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36" https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv
# Download the 5k-row HIGGS sample that accuracy.sh reads as its source.
# The former `sed -i "" *.csv` step was dropped: it has an empty script, so it
# was a no-op with GNU sed and a hard error with BSD/macOS sed (which takes ""
# as the backup suffix and the first CSV name as the script).
wget --user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36" https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv
# Download the 10k-row HIGGS sample that train.sh reads as its source.
# The former `sed -i "" *.csv` step was dropped: it has an empty script, so it
# was a no-op with GNU sed and a hard error with BSD/macOS sed (which takes ""
# as the backup suffix and the first CSV name as the script).
wget --user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36" https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv
+spec = importlib.util.spec_from_file_location( + "setup_common", os.path.join(os.path.dirname(__file__), "setup_common.py") +) +common = importlib.util.module_from_spec(spec) +spec.loader.exec_module(common) + +common.KWARGS["entry_points"] = { + "dffml.model": [ + "h2o = dffml_model_autoh2o.autoh2o:AutoH2OModel", + ] +} + +setup(**common.KWARGS) diff --git a/model/h2o/setup_common.py b/model/h2o/setup_common.py new file mode 100644 index 0000000000..9a02471672 --- /dev/null +++ b/model/h2o/setup_common.py @@ -0,0 +1,55 @@ +import os +import sys +import ast +from pathlib import Path + +ORG = "dffml" +NAME = "dffml-model-autoh2o" +DESCRIPTION = "DFFML model dffml-model-autoh2o" +AUTHOR_NAME = "Edison Siow" +AUTHOR_EMAIL = "edisonsiowxiong@gmail.com" + +IMPORT_NAME = ( + NAME + if "replace_package_name".upper() != NAME + else "replace_import_package_name".upper() +).replace("-", "_") + +SELF_PATH = Path(sys.argv[0]).parent.resolve() +if not (SELF_PATH / Path(IMPORT_NAME, "version.py")).is_file(): + SELF_PATH = os.path.dirname(os.path.realpath(__file__)) + +VERSION = ast.literal_eval( + Path(SELF_PATH, IMPORT_NAME, "version.py") + .read_text() + .split("=")[-1] + .strip() +) + +README = Path(SELF_PATH, "README.md").read_text() + +KWARGS = dict( + name=NAME, + version=VERSION, + description=DESCRIPTION, + long_description=README, + long_description_content_type="text/markdown", + author=AUTHOR_NAME, + author_email=AUTHOR_EMAIL, + maintainer=AUTHOR_NAME, + maintainer_email=AUTHOR_EMAIL, + url=f"https://github.com/{ORG}/{NAME}", + license="MIT", + keywords=["dffml"], + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: 
@contextlib.contextmanager
def directory_with_csv_files():
    """
    Yield a temporary directory holding the classification example CSVs.

    The directory becomes the working directory, then the example data
    scripts are run with bash in order: train, test, predict. The directory
    is yielded while it is still the working directory, because the example
    commands refer to the CSV files by relative name.
    """
    examples_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), "examples"
    )
    data_scripts = ("train_data.sh", "test_data.sh", "predict_data.sh")
    with tempfile.TemporaryDirectory() as tempdir:
        with chdir(tempdir):
            for script in data_scripts:
                subprocess.check_output(
                    ["bash", os.path.join(examples_dir, script)]
                )
            yield tempdir
class TestH2ORegressionModel(AsyncTestCase):
    """Run the regression example scripts end to end: train, accuracy, predict."""

    async def test_run(self):
        self.required_plugins("dffml-model-autoh2o")

        scripts_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)),
            "examples",
            "regression",
        )

        def clean_args(fd, directory):
            # Undo the shell line continuations, tokenize on whitespace and
            # point -model-location at the temporary directory.
            tokens = fd.read().replace("\\\n", " ").split()
            tokens[tokens.index("-model-location") + 1] = directory
            return tokens

        def load_command(script, directory):
            # Return the script's arguments without the leading "dffml".
            with open(os.path.join(scripts_dir, script), "r") as fd:
                return clean_args(fd, directory)[1:]

        with directory_with_csv_files() as tempdir:
            await CLI.cli(*load_command("train.sh", tempdir))
            await CLI.cli(*load_command("accuracy.sh", tempdir))
            records = await CLI._main(*load_command("predict.sh", tempdir))

            self.assertTrue(isinstance(records, list))
            self.assertTrue(records)
            exported = records[0].export()
            self.assertIn("prediction", exported)
            prediction = exported["prediction"]
            self.assertIn("TARGET", prediction)
            target = prediction["TARGET"]
            self.assertIn("value", target)
            self.assertAlmostEqual(target["value"], 3.639936099771113, 2)