Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param col2 the name of the second column
* @return the covariance of the two columns.
*
* {{{
* import org.apache.spark.sql.functions._
* val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
[Inline review comment from the contributor author:]

Had to change this to sc.parallelize(...).toDF(...) because sqlContext.createDataFrame(0 until 10) doesn't work — createDataFrame requires the row elements to be tuples (Products).

* .withColumn("rand2", rand(seed=27))
* df.stat.cov("rand1", "rand2")
* }}}
*
* @since 1.4.0
*/
def cov(col1: String, col2: String): Double = {
Expand All @@ -50,6 +57,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param col2 the name of the column to calculate the correlation against
* @return The Pearson Correlation Coefficient as a Double.
*
* {{{
* import org.apache.spark.sql.functions._
* val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
* .withColumn("rand2", rand(seed=27))
* df.stat.corr("rand1", "rand2", "pearson")
* }}}
*
* @since 1.4.0
*/
def corr(col1: String, col2: String, method: String): Double = {
Expand All @@ -65,6 +79,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param col2 the name of the column to calculate the correlation against
* @return The Pearson Correlation Coefficient as a Double.
*
* {{{
* import org.apache.spark.sql.functions._
* val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10))
* .withColumn("rand2", rand(seed=27))
* df.stat.corr("rand1", "rand2")
* }}}
*
* @since 1.4.0
*/
def corr(col1: String, col2: String): Double = {
Expand All @@ -85,6 +106,13 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* of the DataFrame.
 * @return A DataFrame containing the contingency table.
*
* {{{
* val df = sqlContext.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2),
* (3, 3))).toDF("key", "value")
* val ct = df.stat.crosstab("key", "value")
* ct.show()
* }}}
*
* @since 1.4.0
*/
def crosstab(col1: String, col2: String): DataFrame = {
Expand All @@ -105,6 +133,18 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* than 1e-4.
* @return A Local DataFrame with the Array of frequent items for each column.
*
* {{{
* // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
* // "a" and "b"
* val freqSingles = df.stat.freqItems(Array("a", "b"), 0.4)
* freqSingles.show()
* // find the pair of items with a frequency greater than 0.1 in columns "a" and "b"
* val pairDf = df.select(struct("a", "b").as("a-b"))
* val freqPairs = pairDf.stat.freqItems(Array("a-b"), 0.1)
* freqPairs.show()
* }}}
 *
* @since 1.4.0
*/
def freqItems(cols: Array[String], support: Double): DataFrame = {
Expand Down Expand Up @@ -140,6 +180,21 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
* @param cols the names of the columns to search frequent items in.
* @return A Local DataFrame with the Array of frequent items for each column.
*
* {{{
* val rows = Seq.tabulate(100) { i =>
* if (i % 2 == 0) (1, -1.0) else (i, i * -1.0)
* }
* val df = sqlContext.createDataFrame(rows).toDF("a", "b")
* // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns
* // "a" and "b"
* val freqSingles = df.stat.freqItems(Seq("a", "b"), 0.4)
* freqSingles.show()
* // find the pair of items with a frequency greater than 0.1 in columns "a" and "b"
* val pairDf = df.select(struct("a", "b").as("a-b"))
* val freqPairs = pairDf.stat.freqItems(Seq("a-b"), 0.1)
* freqPairs.show()
* }}}
*
* @since 1.4.0
*/
def freqItems(cols: Seq[String], support: Double): DataFrame = {
Expand Down