From 9367f73e3dec075a1ee64df47f171d25581629f6 Mon Sep 17 00:00:00 2001 From: "pieths.dev@gmail.com" Date: Wed, 2 Oct 2019 17:01:13 -0700 Subject: [PATCH 1/5] Save predictor_model when pickling a pipeline. --- src/python/nimbusml/pipeline.py | 34 ++++++++++++--- .../nimbusml/tests/pipeline/test_load_save.py | 43 ++++++++++++++++++- 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py index 51c50ea2..899f8088 100644 --- a/src/python/nimbusml/pipeline.py +++ b/src/python/nimbusml/pipeline.py @@ -2482,7 +2482,7 @@ def load_model(self, src): self.steps = [] def __getstate__(self): - odict = {'export_version': 1} + odict = {'export_version': 2} if hasattr(self, 'steps'): odict['steps'] = self.steps @@ -2494,6 +2494,13 @@ def __getstate__(self): with open(self.model, "rb") as f: odict['modelbytes'] = f.read() + if (hasattr(self, 'predictor_model') and + self.predictor_model is not None and + os.path.isfile(self.predictor_model)): + + with open(self.predictor_model, "rb") as f: + odict['predictor_model_bytes'] = f.read() + return odict def __setstate__(self, state): @@ -2501,11 +2508,18 @@ def __setstate__(self, state): self.model = None self.random_state = None - for k, v in state.items(): - if k not in {'modelbytes', 'export_version'}: - setattr(self, k, v) + if state.get('export_version', 0) == 0: + # Pickled pipelines which were created + # before export_version was added used + # the default implementation which uses + # the instance’s __dict__. + if 'steps' in state: + self.steps = state['steps'] + + elif state.get('export_version', 0) in {1, 2}: + if 'steps' in state: + self.steps = state['steps'] - if state.get('export_version', 0) == 1: if 'modelbytes' in state: (fd, modelfile) = tempfile.mkstemp() fl = os.fdopen(fd, "wb") @@ -2513,6 +2527,16 @@ def __setstate__(self, state): fl.close() self.model = modelfile + if 'predictor_model_bytes' in state: + (fd, modelfile) = tempfile.mkstemp() + fl = os.fdopen(fd, "wb") + fl.write(state['predictor_model_bytes']) + fl.close() + self.predictor_model = modelfile + + else: + raise ValueError('Pipeline version not supported.') + @trace def score( self, diff --git a/src/python/nimbusml/tests/pipeline/test_load_save.py b/src/python/nimbusml/tests/pipeline/test_load_save.py index fc112fe5..1d514959 100644 --- a/src/python/nimbusml/tests/pipeline/test_load_save.py +++ b/src/python/nimbusml/tests/pipeline/test_load_save.py @@ -7,10 +7,13 @@ import pickle import unittest +import numpy as np +import pandas as pd + from nimbusml import Pipeline from nimbusml.datasets import get_dataset from nimbusml.feature_extraction.categorical import OneHotVectorizer -from nimbusml.linear_model import FastLinearBinaryClassifier +from nimbusml.linear_model import FastLinearBinaryClassifier, OnlineGradientDescentRegressor from nimbusml.utils import get_X_y from numpy.testing import assert_almost_equal @@ -326,5 +329,43 @@ def test_predictor_loaded_from_zip_has_feature_contributions(self): os.remove(model_filename) + def test_pickled_pipeline_with_predictor_model(self): + train_data = {'c1': [1, 2, 3, 4], 'c2': [2, 3, 4, 5]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + test_data = {'c1': [1.5, 2.3, 3.7], 'c2': [2.2, 4.9, 2.7]} + test_df = pd.DataFrame(test_data).astype({'c1': np.float64, + 'c2': np.float64}) + + # Create predictor model and use it to predict + pipeline = Pipeline([OnlineGradientDescentRegressor(label='c2')], random_state=0) + pipeline.fit(train_df, output_predictor_model=True) + result_1 = pipeline.predict(test_df) + + self.assertTrue(pipeline.model) + self.assertTrue(pipeline.predictor_model) + self.assertNotEqual(pipeline.model, pipeline.predictor_model) + + pickle_filename = 'nimbusml_model.p' + with open(pickle_filename, 'wb') as f: + pickle.dump(pipeline, f) + + os.remove(pipeline.model) + os.remove(pipeline.predictor_model) + + with open(pickle_filename, "rb") as f: + pipeline_pickle = pickle.load(f) + + os.remove(pickle_filename) + + # Load predictor pipeline and score data + predictor_pipeline = Pipeline() + predictor_pipeline.load_model(pipeline_pickle.predictor_model) + result_2 = predictor_pipeline.predict(test_df) + + self.assertTrue(result_1.equals(result_2)) + + if __name__ == '__main__': unittest.main() From b45e855d3f1275d4b00a42af90e8bdec0d7a6101 Mon Sep 17 00:00:00 2001 From: "pieths.dev@gmail.com" Date: Wed, 2 Oct 2019 19:01:31 -0700 Subject: [PATCH 2/5] Add whitespace to restart the ci run because the linux builds never completed. --- src/python/nimbusml/tests/pipeline/test_load_save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/nimbusml/tests/pipeline/test_load_save.py b/src/python/nimbusml/tests/pipeline/test_load_save.py index 1d514959..19bc26ce 100644 --- a/src/python/nimbusml/tests/pipeline/test_load_save.py +++ b/src/python/nimbusml/tests/pipeline/test_load_save.py @@ -338,7 +338,7 @@ def test_pickled_pipeline_with_predictor_model(self): test_df = pd.DataFrame(test_data).astype({'c1': np.float64, 'c2': np.float64}) - # Create predictor model and use it to predict + # Create predictor model and use it to predict pipeline = Pipeline([OnlineGradientDescentRegressor(label='c2')], random_state=0) pipeline.fit(train_df, output_predictor_model=True) result_1 = pipeline.predict(test_df) From 665f5f579901b9a281cc916a4ccc2db0537ca96a Mon Sep 17 00:00:00 2001 From: "pieths.dev@gmail.com" Date: Wed, 2 Oct 2019 19:27:35 -0700 Subject: [PATCH 3/5] Whitespace change to restart the ci run because a linux build failed unusually. --- src/python/nimbusml/tests/pipeline/test_load_save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/nimbusml/tests/pipeline/test_load_save.py b/src/python/nimbusml/tests/pipeline/test_load_save.py index 19bc26ce..1d514959 100644 --- a/src/python/nimbusml/tests/pipeline/test_load_save.py +++ b/src/python/nimbusml/tests/pipeline/test_load_save.py @@ -338,7 +338,7 @@ def test_pickled_pipeline_with_predictor_model(self): test_df = pd.DataFrame(test_data).astype({'c1': np.float64, 'c2': np.float64}) - # Create predictor model and use it to predict + # Create predictor model and use it to predict pipeline = Pipeline([OnlineGradientDescentRegressor(label='c2')], random_state=0) pipeline.fit(train_df, output_predictor_model=True) result_1 = pipeline.predict(test_df) From 9a288d3488b21fe8764105645c1bceb67aa3360e Mon Sep 17 00:00:00 2001 From: "pieths.dev@gmail.com" Date: Thu, 3 Oct 2019 09:41:43 -0700 Subject: [PATCH 4/5] Whitespace modification to restart ci build. --- src/python/nimbusml/tests/pipeline/test_load_save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/nimbusml/tests/pipeline/test_load_save.py b/src/python/nimbusml/tests/pipeline/test_load_save.py index 1d514959..19bc26ce 100644 --- a/src/python/nimbusml/tests/pipeline/test_load_save.py +++ b/src/python/nimbusml/tests/pipeline/test_load_save.py @@ -338,7 +338,7 @@ def test_pickled_pipeline_with_predictor_model(self): test_df = pd.DataFrame(test_data).astype({'c1': np.float64, 'c2': np.float64}) - # Create predictor model and use it to predict + # Create predictor model and use it to predict pipeline = Pipeline([OnlineGradientDescentRegressor(label='c2')], random_state=0) pipeline.fit(train_df, output_predictor_model=True) result_1 = pipeline.predict(test_df) From 8437cedf7409050ddc344f299a74e2144a5bb548 Mon Sep 17 00:00:00 2001 From: "pieths.dev@gmail.com" Date: Thu, 3 Oct 2019 12:18:16 -0700 Subject: [PATCH 5/5] Remove non-ascii character which was accidentally copy and pasted in to comment. --- src/python/nimbusml/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py index 899f8088..21983d4a 100644 --- a/src/python/nimbusml/pipeline.py +++ b/src/python/nimbusml/pipeline.py @@ -2512,7 +2512,7 @@ def __setstate__(self, state): # Pickled pipelines which were created # before export_version was added used # the default implementation which uses - # the instance’s __dict__. + # the instances __dict__. if 'steps' in state: self.steps = state['steps']