23 changes: 19 additions & 4 deletions docs/ml-features.md
@@ -1212,7 +1212,7 @@ v_N
The example below demonstrates how to transform vectors with ElementwiseProduct, using a provided scaling ("weight") vector.

<div class="codetabs">
<div data-lang="scala">
<div data-lang="scala" markdown="1">
{% highlight scala %}
import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.mllib.linalg.Vectors
@@ -1229,12 +1229,12 @@ val transformer = new ElementwiseProduct()
.setOutputCol("transformedVector")

// Batch transform the vectors to create a new column:
-val transformedData = transformer.transform(dataFrame)
+transformer.transform(dataFrame).show()

{% endhighlight %}
</div>

<div data-lang="java">
<div data-lang="java" markdown="1">
{% highlight java %}
import com.google.common.collect.Lists;

@@ -1267,10 +1267,25 @@ ElementwiseProduct transformer = new ElementwiseProduct()
.setInputCol("vector")
.setOutputCol("transformedVector");
// Batch transform the vectors to create a new column:
-DataFrame transformedData = transformer.transform(dataFrame);
+transformer.transform(dataFrame).show();

{% endhighlight %}
</div>

<div data-lang="python" markdown="1">
{% highlight python %}
from pyspark.ml.feature import ElementwiseProduct
from pyspark.mllib.linalg import Vectors

data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
df = sqlContext.createDataFrame(data, ["vector"])
transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                 inputCol="vector", outputCol="transformedVector")
transformer.transform(df).show()
{% endhighlight %}

> **Review comment (Member):** Could you please update the Scala/Java examples to be the same? It's nice to call "show()" on the final result.
</div>

</div>
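
For reference, the expected values follow directly from the element-wise product: `[1.0, 2.0, 3.0] ∘ [0.0, 1.0, 2.0] = [0.0, 2.0, 6.0]` and `[4.0, 5.0, 6.0] ∘ [0.0, 1.0, 2.0] = [0.0, 5.0, 12.0]`. A minimal plain-Python sketch of the same arithmetic (a sanity check only, no Spark required):

{% highlight python %}
# Element-wise (Hadamard) product of each row vector with a fixed scaling
# vector; the same arithmetic ElementwiseProduct applies in the tabs above.
scaling = [0.0, 1.0, 2.0]
rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
for row in rows:
    print([x * w for x, w in zip(row, scaling)])
# prints [0.0, 2.0, 6.0] then [0.0, 5.0, 12.0]
{% endhighlight %}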

## VectorAssembler
67 changes: 62 additions & 5 deletions python/pyspark/ml/feature.py
@@ -26,11 +26,11 @@
from pyspark.mllib.common import inherit_doc
from pyspark.mllib.linalg import _convert_to_vector

-__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer',
-           'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler',
-           'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer',
-           'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 'PCA',
-           'PCAModel', 'RFormula', 'RFormulaModel']
+__all__ = ['Binarizer', 'Bucketizer', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
+           'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer',
+           'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel',
+           'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel',
+           'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']


@inherit_doc
@@ -166,6 +166,63 @@ def getSplits(self):
        return self.getOrDefault(self.splits)


@inherit_doc
class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
"""
Outputs the Hadamard product (i.e., the element-wise product) of each input vector
with a provided "weight" vector. In other words, it scales each column of the dataset
by a scalar multiplier.

>>> from pyspark.mllib.linalg import Vectors
>>> df = sqlContext.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"])
>>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
... inputCol="values", outputCol="eprod")
>>> ep.transform(df).head().eprod
DenseVector([2.0, 2.0, 9.0])
>>> ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0])).transform(df).head().eprod
DenseVector([4.0, 3.0, 15.0])
"""

    # a placeholder to make it appear in the generated doc
    scalingVec = Param(Params._dummy(), "scalingVec", "vector for the Hadamard product; " +
                       "it must be an MLlib Vector type.")

    @keyword_only
    def __init__(self, scalingVec=None, inputCol=None, outputCol=None):
        """
        __init__(self, scalingVec=None, inputCol=None, outputCol=None)
        """
        super(ElementwiseProduct, self).__init__()
        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ElementwiseProduct",
                                            self.uid)
        self.scalingVec = Param(self, "scalingVec", "vector for the Hadamard product; " +
                                "it must be an MLlib Vector type.")
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, scalingVec=None, inputCol=None, outputCol=None):
        """
        setParams(self, scalingVec=None, inputCol=None, outputCol=None)
        Sets params for this ElementwiseProduct.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def setScalingVec(self, value):
        """
        Sets the value of :py:attr:`scalingVec`.
        """
        self._paramMap[self.scalingVec] = value
        return self

    def getScalingVec(self):
        """
        Gets the value of scalingVec or its default value.
        """
        return self.getOrDefault(self.scalingVec)
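
# An illustrative usage sketch (an aside for exposition, not part of this
# patch): ElementwiseProduct is a pure Transformer, so it composes with other
# feature stages, and Pipeline.fit() over transformer-only stages learns
# nothing. Assumes the `sqlContext` and `df` from the doctest above.
#
# >>> from pyspark.ml import Pipeline
# >>> from pyspark.ml.feature import Normalizer
# >>> from pyspark.mllib.linalg import Vectors
# >>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
# ...                         inputCol="values", outputCol="scaled")
# >>> normalizer = Normalizer(p=2.0, inputCol="scaled", outputCol="features")
# >>> Pipeline(stages=[ep, normalizer]).fit(df).transform(df).columns
# ['values', 'scaled', 'features']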


@inherit_doc
class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures):
"""