@@ -52,13 +52,39 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
5252 StatFunctions .calculateCov(df, Seq (col1, col2))
5353 }
5454
55+ /**
56+ * Calculate the sample covariance between columns of a DataFrame.
57+ *
58+ * @return a covariance matrix as a DataFrame
59+ *
60+ * {{{
61+ * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
62+ * .withColumn("rand2", rand(seed=27))
63+ * val covmatrix = df.stat.cov()
64+ * covmatrix.show()
65+ * +---------+------------------+-------------------+-------------------+
66+ * |FieldName| id| rand1| rand2|
67+ * +---------+------------------+-------------------+-------------------+
68+ * | id| 9.166666666666666| 0.4131594565676311| 0.7012982830955725|
69+ * | rand1|0.4131594565676311|0.11982701890603772|0.06500805072758595|
70+ * | rand2|0.7012982830955725|0.06500805072758595|0.09383550706974164|
71+ * +---------+------------------+-------------------+-------------------+
72+ * }}}
73+ *
74+ * @since 1.6.0
75+ */
76+ def cov (): DataFrame = {
77+ StatFunctions .calculateCov(df)
78+ }
79+
5580 /**
5681 * Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson
5782 * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in
5883 * MLlib's Statistics.
5984 *
6085 * @param col1 the name of the column
6186 * @param col2 the name of the column to calculate the correlation against
87+ * @param method the name of the correlation method
6288 * @return The Pearson Correlation Coefficient as a Double.
6389 *
6490 * {{{
@@ -96,6 +122,63 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
96122 corr(col1, col2, " pearson" )
97123 }
98124
125+ /**
126+ * Calculates the correlation of columns in the DataFrame. Currently only supports the Pearson
127+ * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in
128+ * MLlib's Statistics.
129+ *
130+ * @param method the name of the correlation method
131+ * @return The Pearson Correlation matrix as a DataFrame.
132+ *
133+ * {{{
134+ * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
135+ * .withColumn("rand2", rand(seed=27))
136+ * val corrmatrix = df.stat.corr()
137+ * corrmatrix.show()
138+ * +---------+------------------+------------------+------------------+
139+ * |FieldName| id| rand1| rand2|
140+ * +---------+------------------+------------------+------------------+
141+ * | id| 1.0| 0.3942163209095|0.7561595709319909|
142+ * | rand1| 0.3942163209095| 1.0|0.6130644931298477|
143+ * | rand2|0.7561595709319909|0.6130644931298477| 1.0|
144+ * +---------+------------------+------------------+------------------+
145+ * }}}
146+ *
147+ * @since 1.6.0
148+ */
149+ def corr (method : String ): DataFrame = {
150+ require(method == " pearson" , " Currently only the calculation of the Pearson Correlation " +
151+ " coefficient is supported." )
152+ StatFunctions .pearsonCorrelation(df)
153+ }
154+
155+ /**
156+ * Calculates the correlation of columns in the DataFrame. Currently only supports the Pearson
157+ * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in
158+ * MLlib's Statistics.
159+ *
160+ * @return The Pearson Correlation matrix as a DataFrame.
161+ *
162+ * {{{
163+ * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
164+ * .withColumn("rand2", rand(seed=27))
165+ * val corrmatrix = df.stat.corr()
166+ * corrmatrix.show()
167+ * +---------+------------------+------------------+------------------+
168+ * |FieldName| id| rand1| rand2|
169+ * +---------+------------------+------------------+------------------+
170+ * | id| 1.0| 0.3942163209095|0.7561595709319909|
171+ * | rand1| 0.3942163209095| 1.0|0.6130644931298477|
172+ * | rand2|0.7561595709319909|0.6130644931298477| 1.0|
173+ * +---------+------------------+------------------+------------------+
174+ * }}}
175+ *
176+ * @since 1.6.0
177+ */
178+ def corr (): DataFrame = {
179+ corr(" pearson" )
180+ }
181+
99182 /**
100183 * Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
101184 * The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero
0 commit comments