diff --git a/dataset/src/main/scala/frameless/TypedDataset.scala b/dataset/src/main/scala/frameless/TypedDataset.scala
index 18406f926..7dc92ee6d 100644
--- a/dataset/src/main/scala/frameless/TypedDataset.scala
+++ b/dataset/src/main/scala/frameless/TypedDataset.scala
@@ -9,6 +9,7 @@ import org.apache.spark.sql.catalyst.plans.{Inner, LeftOuter}
 import org.apache.spark.sql._
 import shapeless._
 import shapeless.ops.hlist.{Prepend, ToTraversable, Tupler}
+import shapeless.ops.record.{Remover, Values}
 
 /** [[TypedDataset]] is a safer interface for working with `Dataset`.
   *
@@ -605,6 +606,51 @@ class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val
     }
   }
 
+  /**
+    * Returns a new Dataset as a tuple with the specified
+    * column dropped.
+    * Does not allow for dropping from a single column TypedDataset
+    *
+    * {{{
+    *   val d: TypedDataset[Foo(a: String, b: Int...)] = ???
+    *   val result: TypedDataset[(Int, ...)] = d.drop('a)
+    * }}}
+    * @param column column to drop specified as a Symbol
+    * @param genOfT LabelledGeneric derived for T
+    * @param remover Remover derived for TRep and column
+    * @param values values of T with column removed
+    * @param tupler tupler of values
+    * @param encoder evidence of encoder of the tupled values
+    * @tparam Out Tupled return type
+    * @tparam TRep shapeless' record representation of T
+    * @tparam Removed record of T with column removed
+    * @tparam ValuesFromRemoved values of T with column removed as an HList
+    * @tparam V value type of column in T
+    * @return a new [[TypedDataset]] of `Out`, the tuple of the columns left after dropping `column`
+    */
+  def drop[
+    Out,
+    TRep <: HList,
+    Removed <: HList,
+    ValuesFromRemoved <: HList,
+    V
+  ](
+    column: Witness.Lt[Symbol]
+  )(implicit
+    genOfT: LabelledGeneric.Aux[T, TRep],
+    remover: Remover.Aux[TRep, column.T, (V, Removed)],
+    values: Values.Aux[Removed, ValuesFromRemoved],
+    tupler: Tupler.Aux[ValuesFromRemoved, Out],
+    encoder: TypedEncoder[Out]
+  ): TypedDataset[Out] = {
+    val dropped = dataset
+      .toDF()
+      .drop(column.value.name)
+      .as[Out](TypedExpressionEncoder[Out])
+
+    TypedDataset.create[Out](dropped)
+  }
+
   /** Prepends a new column to the Dataset.
     *
     * {{{
diff --git a/dataset/src/test/scala/frameless/DropTest.scala b/dataset/src/test/scala/frameless/DropTest.scala
new file mode 100644
index 000000000..5c030b273
--- /dev/null
+++ b/dataset/src/test/scala/frameless/DropTest.scala
@@ -0,0 +1,69 @@
+package frameless
+
+import org.scalacheck.Prop
+import org.scalacheck.Prop._
+
+class DropTest extends TypedDatasetSuite {
+  test("drop five columns") { //starts from five columns and drops four, one per call
+    def prop[A: TypedEncoder](value: A): Prop = {
+      val d5 = TypedDataset.create(X5(value, value, value, value, value) :: Nil)
+      val d4 = d5.drop('a) //drops first column
+      val d3 = d4.drop('_4) //drops last column
+      val d2 = d3.drop('_2) //drops middle column
+      val d1 = d2.drop('_2) //drops the second of the two remaining columns
+
+      Tuple1(value) ?= d1.collect().run().head //a single column is left
+    }
+
+    check(prop[Int] _)
+    check(prop[Long] _)
+    check(prop[String] _)
+    check(prop[SQLDate] _)
+    check(prop[Option[X1[Boolean]]] _)
+  }
+
+  test("drop first column") {
+    def prop[A: TypedEncoder](value: A): Prop = {
+      val d3 = TypedDataset.create(X3(value, value, value) :: Nil)
+      val d2 = d3.drop('a)
+
+      (value, value) ?= d2.collect().run().head //two columns are left
+    }
+
+    check(prop[Int] _)
+    check(prop[Long] _)
+    check(prop[String] _)
+    check(prop[SQLDate] _)
+    check(prop[Option[X1[Boolean]]] _)
+  }
+
+  test("drop middle column") {
+    def prop[A: TypedEncoder](value: A): Prop = {
+      val d3 = TypedDataset.create(X3(value, value, value) :: Nil)
+      val d2 = d3.drop('b)
+
+      (value, value) ?= d2.collect().run().head //two columns are left
+    }
+
+    check(prop[Int] _)
+    check(prop[Long] _)
+    check(prop[String] _)
+    check(prop[SQLDate] _)
+    check(prop[Option[X1[Boolean]]] _)
+  }
+
+  test("drop last column") {
+    def prop[A: TypedEncoder](value: A): Prop = {
+      val d3 = TypedDataset.create(X3(value, value, value) :: Nil)
+      val d2 = d3.drop('c)
+
+      (value, value) ?= d2.collect().run().head //two columns are left
+    }
+
+    check(prop[Int] _)
+    check(prop[Long] _)
+    check(prop[String] _)
+    check(prop[SQLDate] _)
+    check(prop[Option[X1[Boolean]]] _)
+  }
+}