Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

Commit

Permalink
Fix bug in Pipeline.transform() (#294)
Browse files Browse the repository at this point in the history
* Remove unnecessary code from Pipeline.transform that was causing a bug

* Update release-next.md

* Remove y argument from transform() method

* Update release-next.md

* Fix test
najeeb-kazmi authored Oct 4, 2019
1 parent 30c2cff commit ba3fe4f
Showing 6 changed files with 41 additions and 20 deletions.
9 changes: 8 additions & 1 deletion release-next.md
Original file line number Diff line number Diff line change
@@ -40,9 +40,16 @@
[PR#232](https://github.com/microsoft/NimbusML/pull/232)
Enable passing python executable to dataprep package, so dataprep can execute python transformations

- **Fixed `Pipeline.transform()` failing in a transform-only `Pipeline` when the y column is provided**

[PR#294](https://github.com/microsoft/NimbusML/pull/294)
Enable calling `.transform()` on a `Pipeline` containing only transforms when the y column is provided

## **Breaking Changes**

None.
- **Removed `y` parameter from `Pipeline.transform()`**
[PR#294](https://github.com/microsoft/NimbusML/pull/294)
Removed `y` parameter from `Pipeline.transform()` as it is neither needed nor used for transforming data with a fitted `Pipeline`.

## **Enhancements**

1 change: 1 addition & 0 deletions src/python/nimbusml.pyproj
Original file line number Diff line number Diff line change
@@ -686,6 +686,7 @@
<Compile Include="nimbusml\tests\pipeline\test_pipeline_split_models.py" />
<Compile Include="nimbusml\tests\pipeline\test_pipeline_combining.py" />
<Compile Include="nimbusml\tests\pipeline\test_pipeline_subclassing.py" />
<Compile Include="nimbusml\tests\pipeline\test_pipeline_transform_method.py" />
<Compile Include="nimbusml\tests\preprocessing\normalization\test_lpscaler.py" />
<Compile Include="nimbusml\tests\preprocessing\normalization\test_meanvariancescaler.py" />
<Compile Include="nimbusml\tests\preprocessing\schema\test_prefixcolumnconcatenator.py" />
Original file line number Diff line number Diff line change
@@ -2,9 +2,8 @@
# LightLda: cluster topics
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import LightLda
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram
from nimbusml.feature_extraction.text import LightLda, NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram

# create the data
topics = pandas.DataFrame(data=dict(review=[
@@ -19,7 +18,7 @@

# there are three main topics in our data. set num_topic=3
# and see if LightLDA vectors for topics look similar
pipeline = Pipeline([NGramFeaturizer(word_feature_extractor=n_gram(
pipeline = Pipeline([NGramFeaturizer(word_feature_extractor=Ngram(
), vector_normalizer='None') << 'review', LightLda(num_topic=3)])
y = pipeline.fit_transform(topics)

Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
# Example with TextTransform and LogisticRegressionBinaryClassifier
import pandas
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.linear_model import LogisticRegressionBinaryClassifier

train_reviews = pandas.DataFrame(
@@ -77,7 +77,7 @@
y = train_reviews['like']
X = train_reviews.loc[:, train_reviews.columns != 'like']

ngram = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
ngram = NGramFeaturizer(word_feature_extractor=Ngram()) << 'review'
X = ngram.fit_transform(X)

# view the transformed numerical values and column names
14 changes: 1 addition & 13 deletions src/python/nimbusml/pipeline.py
Original file line number Diff line number Diff line change
@@ -2254,7 +2254,6 @@ def test(
def transform(
self,
X,
y=None,
verbose=0,
as_binary_data_stream=False,
**params):
@@ -2275,18 +2274,7 @@ def transform(
"Model is not fitted. Train or load a model before test("
").")

if y is not None:
if len(self.steps) > 0:
last_node = self.last_node
if last_node.type == 'transform':
raise ValueError(
"Pipeline needs a trainer as last step for test()")

X, y_temp, columns_renamed, feature_columns, label_column, \
schema, weights, weight_column = self._preprocess_X_y(X, y)

if not isinstance(y, (str, tuple)):
y = y_temp
X, _, _, _, _, schema, _, _ = self._preprocess_X_y(X)

all_nodes = []

Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
import unittest

import pandas
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer

# Module-level fixture shared by the tests below; it is built once at import.
# Resolve the on-disk location of the "wiki_detox_train" dataset.
path = get_dataset("wiki_detox_train").as_filepath()
# The file is read as tab-separated values.
data = FileDataStream.read_csv(path, sep='\t')
# Materialize the stream and keep only the leading rows returned by head()
# (presumably a pandas DataFrame, to keep the test input small -- TODO confirm).
df = data.to_df().head()
# Only the 'SentimentText' column is used as pipeline input.
X = df['SentimentText']

class TestPipelineTransformMethod(unittest.TestCase):
    """Exercise ``Pipeline.transform()`` on a pipeline made only of transforms."""

    def test_transform_only_pipeline_transform_method(self):
        """Fit a featurizer-only pipeline, then transform the same input.

        The pipeline contains a single ``NGramFeaturizer`` and no trainer,
        so ``transform()`` must succeed and emit the n-gram feature columns.
        """
        p = Pipeline([NGramFeaturizer(char_feature_extractor=None) << 'SentimentText'])
        p.fit(X)
        xf = p.transform(X)
        # assertIn reports the member and container on failure and, unlike a
        # bare ``assert``, is not stripped when Python runs with -O.
        self.assertIn('SentimentText.==rude==', xf.columns)

if __name__ == '__main__':
unittest.main()

0 comments on commit ba3fe4f

Please sign in to comment.