From ba3fe4fdea40e4b9c78fad12fd36e1bc619b1ccd Mon Sep 17 00:00:00 2001
From: Najeeb Kazmi <najeeb.kazmi@gmail.com>
Date: Fri, 4 Oct 2019 16:09:26 -0700
Subject: [PATCH] Fix bug in Pipeline.transform() (#294)

* Remove unnecessary code from Pipeline.transform that was causing a bug

* Update release-next.md

* Remove y argument from transform() method

* Update release-next.md

* Fix test
---
 release-next.md                               |  9 ++++++-
 src/python/nimbusml.pyproj                    |  1 +
 .../examples_from_dataframe/LightLda_df.py    |  7 +++--
 .../NGramFeaturizer_df.py                     |  4 +--
 src/python/nimbusml/pipeline.py               | 14 +---------
 .../test_pipeline_transform_method.py         | 26 +++++++++++++++++++
 6 files changed, 41 insertions(+), 20 deletions(-)
 create mode 100644 src/python/nimbusml/tests/pipeline/test_pipeline_transform_method.py
diff --git a/release-next.md b/release-next.md
index 298add8f..8ad35e66 100644
--- a/release-next.md
+++ b/release-next.md
@@ -40,9 +40,16 @@
     [PR#232](https://github.com/microsoft/NimbusML/pull/232)
     Enable passing python executable to dataprep package, so dataprep can execute python transformations
 
+- **Fixed `Pipeline.transform()` failing in a transform-only `Pipeline` when the y column is provided**
+
+    [PR#294](https://github.com/microsoft/NimbusML/pull/294)
+    Enable calling `.transform()` on a `Pipeline` containing only transforms when the y column is provided 
+
 ## **Breaking Changes**
 
-None.
+- **Removed `y` parameter from `Pipeline.transform()`**
+    [PR#294](https://github.com/microsoft/NimbusML/pull/294)
+    Removed the `y` parameter from `Pipeline.transform()` as it is neither needed nor used for transforming data with a fitted `Pipeline`.
 
 ## **Enhancements**
 
diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index f21bc0c3..345e7ccf 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -686,6 +686,7 @@
     <Compile Include="nimbusml\tests\pipeline\test_pipeline_split_models.py" />
     <Compile Include="nimbusml\tests\pipeline\test_pipeline_combining.py" />
     <Compile Include="nimbusml\tests\pipeline\test_pipeline_subclassing.py" />
+    <Compile Include="nimbusml\tests\pipeline\test_pipeline_transform_method.py" />
     <Compile Include="nimbusml\tests\preprocessing\normalization\test_lpscaler.py" />
     <Compile Include="nimbusml\tests\preprocessing\normalization\test_meanvariancescaler.py" />
     <Compile Include="nimbusml\tests\preprocessing\schema\test_prefixcolumnconcatenator.py" />
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/LightLda_df.py b/src/python/nimbusml/examples/examples_from_dataframe/LightLda_df.py
index fd4df05b..c4a35a8f 100644
--- a/src/python/nimbusml/examples/examples_from_dataframe/LightLda_df.py
+++ b/src/python/nimbusml/examples/examples_from_dataframe/LightLda_df.py
@@ -2,9 +2,8 @@
 # LightLda: cluster topics
 import pandas
 from nimbusml import Pipeline
-from nimbusml.feature_extraction.text import LightLda
-from nimbusml.feature_extraction.text import NGramFeaturizer
-from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram
+from nimbusml.feature_extraction.text import LightLda, NGramFeaturizer
+from nimbusml.feature_extraction.text.extractor import Ngram
 
 # create the data
 topics = pandas.DataFrame(data=dict(review=[
@@ -19,7 +18,7 @@
 
 # there are three main topics in our data. set num_topic=3
 # and see if LightLDA vectors for topics look similar
-pipeline = Pipeline([NGramFeaturizer(word_feature_extractor=n_gram(
+pipeline = Pipeline([NGramFeaturizer(word_feature_extractor=Ngram(
 ), vector_normalizer='None') << 'review', LightLda(num_topic=3)])
 y = pipeline.fit_transform(topics)
 
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py
index e87b8168..e6cc14d1 100644
--- a/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py
+++ b/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py
@@ -2,7 +2,7 @@
 # Example with TextTransform and LogisticRegressionBinaryClassifier
 import pandas
 from nimbusml.feature_extraction.text import NGramFeaturizer
-from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram
+from nimbusml.feature_extraction.text.extractor import Ngram
 from nimbusml.linear_model import LogisticRegressionBinaryClassifier
 
 train_reviews = pandas.DataFrame(
@@ -77,7 +77,7 @@
 y = train_reviews['like']
 X = train_reviews.loc[:, train_reviews.columns != 'like']
 
-ngram = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
+ngram = NGramFeaturizer(word_feature_extractor=Ngram()) << 'review'
 X = ngram.fit_transform(X)
 
 # view the transformed numerical values and column names
diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py
index 9b1d7216..b3be72f8 100644
--- a/src/python/nimbusml/pipeline.py
+++ b/src/python/nimbusml/pipeline.py
@@ -2254,7 +2254,6 @@ def test(
     def transform(
             self,
             X,
-            y=None,
             verbose=0,
             as_binary_data_stream=False,
             **params):
@@ -2275,18 +2274,7 @@ def transform(
                 "Model is not fitted. Train or load a model before test("
                 ").")
 
-        if y is not None:
-            if len(self.steps) > 0:
-                last_node = self.last_node
-                if last_node.type == 'transform':
-                    raise ValueError(
-                        "Pipeline needs a trainer as last step for test()")
-
-        X, y_temp, columns_renamed, feature_columns, label_column, \
-            schema, weights, weight_column = self._preprocess_X_y(X, y)
-
-        if not isinstance(y, (str, tuple)):
-            y = y_temp
+        X, _, _, _, _, schema, _, _ = self._preprocess_X_y(X)
 
         all_nodes = []
 
diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_transform_method.py b/src/python/nimbusml/tests/pipeline/test_pipeline_transform_method.py
new file mode 100644
index 00000000..e16a1e99
--- /dev/null
+++ b/src/python/nimbusml/tests/pipeline/test_pipeline_transform_method.py
@@ -0,0 +1,26 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+import unittest
+
+import pandas
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.feature_extraction.text import NGramFeaturizer
+
+path = get_dataset("wiki_detox_train").as_filepath()
+data = FileDataStream.read_csv(path, sep='\t')
+df = data.to_df().head()
+X = df['SentimentText']
+
+class TestPipelineTransformMethod(unittest.TestCase):
+
+    def test_transform_only_pipeline_transform_method(self):
+        p = Pipeline([NGramFeaturizer(char_feature_extractor=None) << 'SentimentText'])
+        p.fit(X)
+        xf = p.transform(X)
+        assert 'SentimentText.==rude==' in xf.columns
+
+if __name__ == '__main__':
+    unittest.main()