From ba3fe4fdea40e4b9c78fad12fd36e1bc619b1ccd Mon Sep 17 00:00:00 2001 From: Najeeb Kazmi Date: Fri, 4 Oct 2019 16:09:26 -0700 Subject: [PATCH] Fix bug in Pipeline.transform() (#294) * Remove unnecessary code from Pipeline.transform that was causing a bug * Update release-next.md * Remove y argument from transform() method * Update release-next.md * Fix test --- release-next.md | 9 ++++++- src/python/nimbusml.pyproj | 1 + .../examples_from_dataframe/LightLda_df.py | 7 +++-- .../NGramFeaturizer_df.py | 4 +-- src/python/nimbusml/pipeline.py | 14 +--------- .../test_pipeline_transform_method.py | 26 +++++++++++++++++++ 6 files changed, 41 insertions(+), 20 deletions(-) create mode 100644 src/python/nimbusml/tests/pipeline/test_pipeline_transform_method.py diff --git a/release-next.md b/release-next.md index 298add8f..8ad35e66 100644 --- a/release-next.md +++ b/release-next.md @@ -40,9 +40,16 @@ [PR#232](https://github.com/microsoft/NimbusML/pull/232) Enable passing python executable to dataprep package, so dataprep can execute python transformations +- **Fixed `Pipeline.transform()` failing in a transform-only `Pipeline` when the y column is provided** + + [PR#294](https://github.com/microsoft/NimbusML/pull/294) + Enable calling `.transform()` on a `Pipeline` containing only transforms when the y column is provided + ## **Breaking Changes** -None. +- **Removed `y` parameter from `Pipeline.transform()`** + [PR#294](https://github.com/microsoft/NimbusML/pull/294) + Removed `y` parameter from `Pipeline.transform()` as it is neither needed nor used for transforming data with a fitted `Pipeline`. 
## **Enhancements** diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index f21bc0c3..345e7ccf 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -686,6 +686,7 @@ + diff --git a/src/python/nimbusml/examples/examples_from_dataframe/LightLda_df.py b/src/python/nimbusml/examples/examples_from_dataframe/LightLda_df.py index fd4df05b..c4a35a8f 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/LightLda_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/LightLda_df.py @@ -2,9 +2,8 @@ # LightLda: cluster topics import pandas from nimbusml import Pipeline -from nimbusml.feature_extraction.text import LightLda -from nimbusml.feature_extraction.text import NGramFeaturizer -from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram +from nimbusml.feature_extraction.text import LightLda, NGramFeaturizer +from nimbusml.feature_extraction.text.extractor import Ngram # create the data topics = pandas.DataFrame(data=dict(review=[ @@ -19,7 +18,7 @@ # there are three main topics in our data. 
set num_topic=3 # and see if LightLDA vectors for topics look similar -pipeline = Pipeline([NGramFeaturizer(word_feature_extractor=n_gram( +pipeline = Pipeline([NGramFeaturizer(word_feature_extractor=Ngram( ), vector_normalizer='None') << 'review', LightLda(num_topic=3)]) y = pipeline.fit_transform(topics) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py index e87b8168..e6cc14d1 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py @@ -2,7 +2,7 @@ # Example with TextTransform and LogisticRegressionBinaryClassifier import pandas from nimbusml.feature_extraction.text import NGramFeaturizer -from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram +from nimbusml.feature_extraction.text.extractor import Ngram from nimbusml.linear_model import LogisticRegressionBinaryClassifier train_reviews = pandas.DataFrame( @@ -77,7 +77,7 @@ y = train_reviews['like'] X = train_reviews.loc[:, train_reviews.columns != 'like'] -ngram = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review' +ngram = NGramFeaturizer(word_feature_extractor=Ngram()) << 'review' X = ngram.fit_transform(X) # view the transformed numerical values and column names diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py index 9b1d7216..b3be72f8 100644 --- a/src/python/nimbusml/pipeline.py +++ b/src/python/nimbusml/pipeline.py @@ -2254,7 +2254,6 @@ def test( def transform( self, X, - y=None, verbose=0, as_binary_data_stream=False, **params): @@ -2275,18 +2274,7 @@ def transform( "Model is not fitted. 
Train or load a model before test(" ").") - if y is not None: - if len(self.steps) > 0: - last_node = self.last_node - if last_node.type == 'transform': - raise ValueError( - "Pipeline needs a trainer as last step for test()") - - X, y_temp, columns_renamed, feature_columns, label_column, \ - schema, weights, weight_column = self._preprocess_X_y(X, y) - - if not isinstance(y, (str, tuple)): - y = y_temp + X, _, _, _, _, schema, _, _ = self._preprocess_X_y(X) all_nodes = [] diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_transform_method.py b/src/python/nimbusml/tests/pipeline/test_pipeline_transform_method.py new file mode 100644 index 00000000..e16a1e99 --- /dev/null +++ b/src/python/nimbusml/tests/pipeline/test_pipeline_transform_method.py @@ -0,0 +1,26 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +import unittest + +import pandas +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.text import NGramFeaturizer + +path = get_dataset("wiki_detox_train").as_filepath() +data = FileDataStream.read_csv(path, sep='\t') +df = data.to_df().head() +X = df['SentimentText'] + +class TestPipelineTransformMethod(unittest.TestCase): + + def test_transform_only_pipeline_transform_method(self): + p = Pipeline([NGramFeaturizer(char_feature_extractor=None) << 'SentimentText']) + p.fit(X) + xf = p.transform(X) + assert 'SentimentText.==rude==' in xf.columns + +if __name__ == '__main__': + unittest.main()