intel · seraphimstreets · Apr 14, 2022 · Apr 14, 2022 · Apr 14, 2022
diff --git a/dffml/plugins.py b/dffml/plugins.py
@@ -32,6 +32,7 @@ def inpath(binary):
     ("model", "pytorch"),
     ("model", "spacy"),
     ("model", "daal4py"),
+    ("model", "h2o"),
 ]
 
 # Models which currently don't support Windows or MacOS

diff --git a/model/h2o/.coveragerc b/model/h2o/.coveragerc
@@ -0,0 +1,13 @@
+[run]
+source =
+    dffml_model_h2o
+    tests
+branch = True
+
+[report]
+exclude_lines =
+    no cov
+    no qa
+    noqa
+    pragma: no cover
+    if __name__ == .__main__.:
diff --git a/model/h2o/.gitignore b/model/h2o/.gitignore
@@ -0,0 +1,20 @@
+*.log
+*.pyc
+.cache/
+.coverage
+.idea/
+.vscode/
+*.egg-info/
+build/
+dist/
+docs/build/
+venv/
+wheelhouse/
+*.egss
+.mypy_cache/
+*.swp
+.venv/
+.eggs/
+*.modeldir
+*.db
+htmlcov/
diff --git a/model/h2o/LICENSE b/model/h2o/LICENSE
@@ -0,0 +1,21 @@
+Copyright (c) 2020 Intel
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/model/h2o/MANIFEST.in b/model/h2o/MANIFEST.in
@@ -0,0 +1,3 @@
+include README.md
+include LICENSE
+include setup_common.py
diff --git a/model/h2o/README.md b/model/h2o/README.md
@@ -0,0 +1,10 @@
+# DFFML Model for H2O-AutoML
+
+## About
+
+DFFML model plugin that uses H2O's AutoML to train and tune a set of candidate models and keep the best one.
+
+## License
+
+H2O-AutoML models are distributed under the terms of the
+[MIT License](LICENSE).
diff --git a/model/h2o/dffml_model_autoh2o/__init__.py b/model/h2o/dffml_model_autoh2o/__init__.py
@@ -0,0 +1 @@
+
diff --git a/model/h2o/dffml_model_autoh2o/autoh2o.py b/model/h2o/dffml_model_autoh2o/autoh2o.py
@@ -0,0 +1,285 @@
+from typing import AsyncIterator
+import pandas as pd
+import h2o
+import os
+import sklearn
+from h2o.sklearn import H2OAutoMLClassifier, H2OAutoMLRegressor
+from .config import AutoH2OConfig
+from dffml.model.model import Model
+from dffml import (
+    Sources,
+    Record,
+    SourcesContext,
+    ModelContext,
+    ModelNotTrained,
+)
+from dffml.util.entrypoint import entrypoint
+
+
+class AutoH2OModelContext(ModelContext):
+    """
+    Model context that trains and runs predictions through H2O AutoML.
+
+    The AutoML estimator (``clf``) and the model used for inference
+    (``best_model``) are stored on the parent model object; this context
+    only drives them.
+    """
+
+    def __init__(self, parent):
+        super().__init__(parent)
+        # Names of the input feature columns, read from the parent's config.
+        self.features = self._get_feature_names()
+
+    def _get_feature_names(self):
+        """Return the configured feature names as a list."""
+        return list(self.parent.config.features.names())
+
+    async def get_test_records(self, sources: SourcesContext):
+        """Return the records yielded by ``sources.with_features`` as a list."""
+        return [
+            record async for record in sources.with_features(self.features)
+        ]
+
+    async def get_predictions(self, data):
+        """
+        Run the parent's ``best_model`` on ``data`` and return the first
+        column of the resulting prediction frame as a list.
+        """
+        frame = h2o.H2OFrame(data)
+        predicted = self.parent.best_model.predict(frame).as_data_frame()
+        return predicted.iloc[:, 0].tolist()
+
+    async def get_best_model(self):
+        """
+        Return the estimator's best model.
+
+        Raises ``ModelNotTrained`` when the model has not been trained.
+        """
+        if not self.is_trained:
+            raise ModelNotTrained(
+                "Train the model first before getting predictions"
+            )
+        return self.parent.clf.get_best_model()
+
+    async def accuracy_score(self, y_test, predictions):
+        """Return the R^2 score of ``predictions`` against ``y_test``."""
+        # Import the submodule explicitly: a bare ``import sklearn`` does not
+        # guarantee that ``sklearn.metrics`` has been loaded.
+        from sklearn.metrics import r2_score
+
+        return r2_score(y_test, predictions)
+
+    async def get_probabilities(self, data):
+        """
+        Return one ``[nan]`` row per row of ``data``.
+
+        No confidence value is computed, so NaN is used as a placeholder.
+        There must be one row per record: ``predict`` zips this list with
+        the records, and a single outer row would stop the zip after the
+        first record.
+        """
+        return [[float("nan")] for _ in range(len(data))]
+
+    async def train(self, sources: Sources):
+        """
+        Fit the parent's AutoML estimator on the records from ``sources``.
+
+        The configured features are used as inputs and the configured
+        ``predict`` feature as the target. When ``show_leaderboard`` is set
+        in the config, the AutoML leaderboard is printed after fitting.
+        """
+        target = self.parent.config.predict.name
+        rows = [
+            record.features()
+            async for record in sources.with_features(
+                self.features + [target]
+            )
+        ]
+
+        df = pd.DataFrame(rows)
+        self.parent.clf.fit(df[self.features], pd.DataFrame(df[target]))
+        estimator = self.parent.clf.estimator
+        if self.parent.config.show_leaderboard:
+            print(h2o.automl.get_leaderboard(estimator, extra_columns="ALL"))
+        # Make the freshly trained leader available to get_predictions() so
+        # that predicting in the same session does not hit a ``None`` model.
+        # Keep any previously loaded model if no leader was produced.
+        leader = getattr(estimator, "leader", None)
+        if leader is not None:
+            self.parent.best_model = leader
+        self.is_trained = True
+
+    async def predict(self, sources: SourcesContext) -> AsyncIterator[Record]:
+        """
+        Yield each record from ``sources`` with a prediction attached.
+
+        Raises ``ModelNotTrained`` when the model has not been trained.
+        The confidence of every prediction is NaN (see get_probabilities).
+        """
+        if not self.is_trained:
+            raise ModelNotTrained(
+                "Train the model first before getting predictions"
+            )
+        test_records = await self.get_test_records(sources)
+        if not test_records:
+            # Nothing to predict on; avoid building an empty H2OFrame.
+            return
+        x_test = pd.DataFrame(
+            [record.features(self.features) for record in test_records]
+        )
+        predictions = await self.get_predictions(x_test)
+        probability = await self.get_probabilities(x_test)
+        target = self.parent.config.predict.name
+        for record, predict, prob in zip(
+            test_records, predictions, probability
+        ):
+            record.predicted(target, predict, max(prob))
+            yield record
+
+
+
+@entrypoint("h2o")
+class AutoH2OModel(Model):
+
+    r"""
+    ``h2o`` / ``AutoH2OModel`` will use ``H2O.ai's`` AutoML Python API
+    to train a model for you.
+
+    This is AutoML, it will train and tune hyperparameters from a list of models,
+    and return the best model.
+
+    Implemented using ``H2O``'s AutoML Python API (https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html).
+
+    In this version, you must specify the ML task that you wish to perform (either "regression" or "classification").
+
+    Here, we will show a small example using regression. First we create the training and testing datasets:
+
+    **train.csv**
+
+    .. code-block::
+        :test:
+        :filepath: train.csv
+
+        Feature1,Feature2,TARGET
+        0.93,0.68,3.89
+        0.24,0.42,1.75
+        0.36,0.68,2.75
+        0.53,0.31,2.00
+        0.29,0.25,1.32
+        0.29,0.52,2.14
+
+    **test.csv**
+
+    .. code-block::
+        :test:
+        :filepath: test.csv
+
+        Feature1,Feature2,TARGET
+        0.57,0.84,3.65
+        0.95,0.19,2.46
+        0.23,0.15,0.93
+
+    Train the model
+
+    .. code-block:: console
+        :test:
+
+        $ dffml train \
+            -model h2o \
+            -model-predict TARGET:float:1 \
+            -model-clstype int \
+            -sources f=csv \
+            -source-filename train.csv \
+            -model-features \
+              Feature1:float:1 \
+              Feature2:float:1 \
+            -model-location tempdir \
+            -log debug
+
+    Assess the accuracy
+
+    .. code-block:: console
+        :test:
+
+        $ dffml accuracy \
+            -model h2o \
+            -model-predict TARGET:float:1 \
+            -model-location tempdir \
+            -features TARGET:float:1 \
+            -sources f=csv \
+            -source-filename test.csv \
+            -model-features \
+              Feature1:float:1 \
+              Feature2:float:1 \
+            -scorer mse \
+            -log critical
+        0.9961211434899032
+
+    Make a file containing the data to predict on
+
+    **predict.csv**
+
+    .. code-block::
+        :test:
+        :filepath: predict.csv
+
+        Feature1,Feature2
+        0.57,0.84
+
+    Make a prediction
+
+    .. code-block:: console
+        :test:
+
+        $ dffml predict all \
+            -model h2o \
+            -model-location tempdir \
+            -model-predict TARGET:float:1 \
+            -sources iris=csv \
+            -model-features \
+              Feature1:float:1 \
+              Feature2:float:1 \
+            -source-filename predict.csv
+        [
+            {
+                "extra": {},
+                "features": {
+                    "Feature1": 0.57,
+                    "Feature2": 0.84
+                },
+                "key": "0",
+                "last_updated": "2020-11-23T05:52:13Z",
+                "prediction": {
+                    "TARGET": {
+                        "confidence": NaN,
+                        "value": 3.566799074411392
+                    }
+                }
+            }
+        ]
+
+    The model can be trained on large datasets to get better accuracy
+    output. The example shown above is to demonstrate the command line usage
+    of the model.
+
+
+    """
+
+    CONFIG = AutoH2OConfig
+    CONTEXT = AutoH2OModelContext
+
+    def __init__(self, config) -> None:
+        super().__init__(config)
+        h2o.init()
+        # AutoML estimator; built by load_model().
+        self.clf = None
+        # Model used for predictions; imported by load_model() when a saved
+        # model directory exists.
+        self.best_model = None
+
+    async def __aenter__(self) -> "AutoH2OModel":
+        """Resolve the saved-model path, then load/build the estimator."""
+        await super().__aenter__()
+        self.path = os.path.abspath(
+            self.filepath(self.location, "trained_model")
+        )
+        self.load_model()
+        return self
+
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        """Save the AutoML leader (if any) as a MOJO under ``self.path``."""
+        # Tolerate a missing estimator so that exiting never fails merely
+        # because nothing was trained in this session.
+        estimator = getattr(self.clf, "estimator", None)
+        leader = getattr(estimator, "leader", None) if estimator else None
+        if leader:
+            leader.save_mojo(str(self.path))
+        await super().__aexit__(exc_type, exc_value, traceback)
+
+    def filepath(self, location, file):
+        """Return ``location / file``."""
+        return location / file
+
+    def _common_automl_kwargs(self):
+        """Return the AutoML arguments shared by both task types."""
+        config = self.config
+        return dict(
+            max_models=config.max_models,
+            seed=1,
+            max_runtime_secs=config.max_runtime_secs,
+            nfolds=config.nfolds,
+            max_runtime_secs_per_model=config.max_runtime_secs_per_model,
+            stopping_metric=config.stopping_metric,
+            stopping_tolerance=config.stopping_tolerance,
+            stopping_rounds=config.stopping_rounds,
+            sort_metric=config.sort_metric,
+            exclude_algos=config.exclude_algos,
+            include_algos=config.include_algos,
+            verbosity=config.verbosity,
+        )
+
+    def load_model(self):
+        """
+        Import a previously saved model, if present, and build the estimator.
+
+        When ``self.path`` is an existing directory, the MOJO stored there is
+        imported as ``best_model`` and the model is marked as trained. A new
+        AutoML estimator is always created: a regressor when the configured
+        task is ``"regression"``, a classifier for any other value.
+        """
+        if os.path.isdir(self.path):
+            self.best_model = h2o.import_mojo(str(self.path))
+            self.is_trained = True
+
+        # TODO: split into separate regression/classification Models
+        common = self._common_automl_kwargs()
+        if self.config.task == "regression":
+            self.clf = H2OAutoMLRegressor(**common)
+        else:
+            # Class balancing options only apply to classification.
+            self.clf = H2OAutoMLClassifier(
+                balance_classes=self.config.balance_classes,
+                max_after_balance_size=self.config.max_after_balance_size,
+                **common,
+            )