Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions dataset/src/main/scala/frameless/TypedDataset.scala
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import org.apache.spark.sql.catalyst.plans.{Inner, LeftOuter}
import org.apache.spark.sql._
import shapeless._
import shapeless.ops.hlist.{Prepend, ToTraversable, Tupler}
import shapeless.ops.record.{Remover, Values}

/** [[TypedDataset]] is a safer interface for working with `Dataset`.
*
Expand Down Expand Up @@ -605,6 +606,51 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val
}
}

/**
  * Drops a single column and returns the remaining columns as a tuple-typed
  * [[TypedDataset]].
  *
  * Does not allow for dropping from a single column TypedDataset.
  *
  * {{{
  *   val d: TypedDataset[Foo(a: String, b: Int...)] = ???
  *   val result: TypedDataset[(Int, ...)] = d.drop('a)
  * }}}
  *
  * @param column  column to drop, specified as a Symbol
  * @param genOfT  LabelledGeneric derived for T
  * @param remover Remover derived for TRep and column
  * @param values  values of T with column removed
  * @param tupler  tupler of those values
  * @param encoder evidence of an encoder for the tupled values
  * @tparam Out               tupled return type
  * @tparam TRep              shapeless' record representation of T
  * @tparam Removed           record of T with column removed
  * @tparam ValuesFromRemoved values of T with column removed, as an HList
  * @tparam V                 value type of column in T
  * @return a [[TypedDataset]] of `Out`, built from the underlying DataFrame
  *         with the named column dropped
  */
def drop[
  Out,
  TRep <: HList,
  Removed <: HList,
  ValuesFromRemoved <: HList,
  V
](
  column: Witness.Lt[Symbol]
)(implicit
  genOfT: LabelledGeneric.Aux[T, TRep],
  remover: Remover.Aux[TRep, column.T, (V, Removed)],
  values: Values.Aux[Removed, ValuesFromRemoved],
  tupler: Tupler.Aux[ValuesFromRemoved, Out],
  encoder: TypedEncoder[Out]
): TypedDataset[Out] = {
  // genOfT / remover / values / tupler are not referenced below: they exist
  // only so the compiler can derive `Out`. The removal itself is done by name
  // on the untyped DataFrame.
  val columnName: String = column.value.name
  val withoutColumn: Dataset[Out] =
    dataset.toDF().drop(columnName).as[Out](TypedExpressionEncoder[Out])

  TypedDataset.create[Out](withoutColumn)
}

/** Prepends a new column to the Dataset.
*
* {{{
Expand Down
69 changes: 69 additions & 0 deletions dataset/src/test/scala/frameless/DropTest.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

class DropTest extends TypedDatasetSuite {
test("drop five columns") {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe you could add a test dropping only the first column, and another one dropping only the last column.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I can make it more explicit by adding separate tests, but those two cases are already covered in this test: it drops `'a`, which is the first column, and then `'_4` etc., which are the last ones.

// Starts from five columns holding the same value and drops them one at a
// time until a single column is left.
def prop[A: TypedEncoder](value: A): Prop = {
  val fiveColumns = TypedDataset.create(X5(value, value, value, value, value) :: Nil)
  val singleColumn = fiveColumns
    .drop('a)  // first column
    .drop('_4) // last column of the resulting 4-tuple
    .drop('_2) // middle column of the resulting 3-tuple
    .drop('_2) // second column of the resulting pair

  Tuple1(value) ?= singleColumn.collect().run().head
}

check(prop[Int] _)
check(prop[Long] _)
check(prop[String] _)
check(prop[SQLDate] _)
check(prop[Option[X1[Boolean]]] _)
}

// Dropping 'a from a three-column dataset must leave a pair.
// NOTE(review): all three columns carry the same value, so this verifies the
// shape of the result rather than which column was removed.
test("drop first column") {
  def prop[A: TypedEncoder](value: A): Prop = {
    val threeColumns = TypedDataset.create(X3(value, value, value) :: Nil)
    val firstRow = threeColumns.drop('a).collect().run().head

    (value, value) ?= firstRow
  }

  check(prop[Int] _)
  check(prop[Long] _)
  check(prop[String] _)
  check(prop[SQLDate] _)
  check(prop[Option[X1[Boolean]]] _)
}

// Dropping 'b from a three-column dataset must leave a pair.
// NOTE(review): all three columns carry the same value, so this verifies the
// shape of the result rather than which column was removed.
test("drop middle column") {
  def prop[A: TypedEncoder](value: A): Prop = {
    val threeColumns = TypedDataset.create(X3(value, value, value) :: Nil)
    val firstRow = threeColumns.drop('b).collect().run().head

    (value, value) ?= firstRow
  }

  check(prop[Int] _)
  check(prop[Long] _)
  check(prop[String] _)
  check(prop[SQLDate] _)
  check(prop[Option[X1[Boolean]]] _)
}

// Dropping 'c from a three-column dataset must leave a pair.
// NOTE(review): all three columns carry the same value, so this verifies the
// shape of the result rather than which column was removed.
test("drop last column") {
  def prop[A: TypedEncoder](value: A): Prop = {
    val threeColumns = TypedDataset.create(X3(value, value, value) :: Nil)
    val firstRow = threeColumns.drop('c).collect().run().head

    (value, value) ?= firstRow
  }

  check(prop[Int] _)
  check(prop[Long] _)
  check(prop[String] _)
  check(prop[SQLDate] _)
  check(prop[Option[X1[Boolean]]] _)
}
}