-
Notifications
You must be signed in to change notification settings - Fork 13
Add support for a Flatten annotation #20
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
36d5eb5
d94f27b
eeb2ffc
ec68115
6cc0220
b3e7555
e3f84b0
c3b66de
e55ea9d
291f9ae
b468719
3a230fc
924104d
b829140
62188a6
c230e97
df8bcb0
f9eb2e6
a771b36
f876e91
a783f15
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,220 @@ | ||
| /** | ||
| * Copyright (c) 2017-2018, Benjamin Fradet, and other contributors. | ||
| * | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package ste | ||
|
|
||
| import org.apache.spark.sql.{ Column, DataFrame, Dataset, Encoder } | ||
| import org.apache.spark.sql.functions._ | ||
| import org.apache.spark.sql.types._ | ||
| import scala.annotation.tailrec | ||
| import scala.collection.generic.IsTraversableOnce | ||
| import scala.collection.breakOut | ||
| import shapeless._ | ||
| import shapeless.ops.hlist._ | ||
| import shapeless.syntax.std.tuple._ | ||
| import shapeless.labelled.FieldType | ||
|
|
||
| /** | ||
| * Dot-separated column path wrapping the raw string `p`. | ||
| */ | ||
| case class Prefix(p: String) { | ||
| /** The path split on literal dots. */ | ||
| private def segments: Array[String] = p.split("\\.") | ||
| /** Path made of this one, a dot, and the string form of `s`. */ | ||
| def addSuffix(s: Any): Prefix = Prefix(s"$p.$s") | ||
| /** Path with its last segment removed. */ | ||
| def getParent: Prefix = Prefix(segments.dropRight(1).mkString(".")) | ||
| /** Last segment of the path. */ | ||
| def getSuffix: String = segments.last | ||
| /** True when `other` starts with this path followed by a dot. */ | ||
| def isParentOf(other: Prefix): Boolean = other.toString.startsWith(s"$p.") | ||
| /** Mirror image of `isParentOf`. */ | ||
| def isChildrenOf(other: Prefix): Boolean = other.isParentOf(this) | ||
| /** The whole path wrapped in back-quotes. */ | ||
| def quotedString: String = s"`$p`" | ||
| override def toString: String = p | ||
| } | ||
|
|
||
| /** | ||
| * Type class exposing, for a type `A`, a `select` function of type `DataTypeSelector.Select`. | ||
| */ | ||
| @annotation.implicitNotFound(""" | ||
| Type ${A} does not have a DataTypeSelector defined in the library. | ||
| You need to define one yourself. | ||
| """) | ||
| sealed trait DataTypeSelector[A] { | ||
| /** DataFrame transformation associated with `A`. */ | ||
| val select: DataTypeSelector.Select | ||
| } | ||
|
|
||
| /** | ||
| * Type aliases and constructors for `DataTypeSelector` instances. | ||
| */ | ||
| object DataTypeSelector { | ||
| type Prefixes = List[Prefix] | ||
| /** Takes a DataFrame and optional prefixes, returns a DataFrame. */ | ||
| type Select = (DataFrame, Option[Prefixes]) => DataFrame | ||
|
|
||
| /** Wraps the function `s` as the `select` of a new instance. */ | ||
| def pure[A](s: Select): DataTypeSelector[A] = | ||
| new DataTypeSelector[A] { | ||
| val select: Select = s | ||
| } | ||
|
|
||
| /** Instance whose `select` returns the DataFrame it is given and ignores the prefixes. */ | ||
| def identityDF[A]: DataTypeSelector[A] = | ||
| pure[A]((unchanged, _) => unchanged) | ||
| } | ||
|
|
||
| /** | ||
| * `DataTypeSelector` specialisation for which `dfSelector` derives instances generically. | ||
| */ | ||
| @annotation.implicitNotFound(""" | ||
| Type ${A} does not have a StructTypeSelector defined in the library. | ||
| You need to define one yourself. | ||
| """) | ||
| sealed trait StructTypeSelector[A] extends DataTypeSelector[A] { | ||
| /** DataFrame transformation associated with `A`. */ | ||
| val select: DataTypeSelector.Select | ||
| } | ||
|
|
||
| /** | ||
| * Summoner and constructor for `StructTypeSelector`; inherits the implicits of `SelectorImplicits`. | ||
| */ | ||
| object StructTypeSelector extends SelectorImplicits { | ||
| /** Returns the implicit instance in scope for `A`. */ | ||
| def apply[A](implicit s: StructTypeSelector[A]): StructTypeSelector[A] = s | ||
|
|
||
| /** Wraps the function `s` as the `select` of a new instance. */ | ||
| def pure[A](s: DataTypeSelector.Select): StructTypeSelector[A] = | ||
| new StructTypeSelector[A] { | ||
| val select: DataTypeSelector.Select = s | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Type class whose `select` additionally receives one optional `Flatten` per field. | ||
| */ | ||
| @annotation.implicitNotFound(""" | ||
| Type ${A} does not have a AnnotatedStructTypeSelector defined in the library. | ||
| You need to define one yourself. | ||
| """) | ||
| sealed trait AnnotatedStructTypeSelector[A] { | ||
| /** DataFrame transformation associated with `A`. */ | ||
| val select: AnnotatedStructTypeSelector.Select | ||
| } | ||
|
|
||
| /** | ||
| * Type alias and constructor for `AnnotatedStructTypeSelector`; inherits the implicits of `SelectorImplicits`. | ||
| */ | ||
| object AnnotatedStructTypeSelector extends SelectorImplicits { | ||
| /** Takes a DataFrame, optional prefixes and the per-field `Flatten` annotations, returns a DataFrame. */ | ||
| type Select = (DataFrame, Option[DataTypeSelector.Prefixes], Seq[Option[Flatten]]) => DataFrame | ||
|
|
||
| /** Wraps the function `s` as the `select` of a new instance. */ | ||
| def pure[A](s: Select): AnnotatedStructTypeSelector[A] = | ||
| new AnnotatedStructTypeSelector[A] { | ||
| val select: Select = s | ||
| } | ||
| } | ||
|
|
||
| trait SelectorImplicits { | ||
| /** Selector for the empty record: returns the DataFrame it is given. */ | ||
| implicit val hnilSelector: AnnotatedStructTypeSelector[HNil] = | ||
| AnnotatedStructTypeSelector.pure { (unchanged, _, _) => unchanged } | ||
|
|
||
| /** | ||
| * Selector for a record whose first field is named by `K` with type `H`, followed by the fields `T`. | ||
| * | ||
| * The produced function rewrites the columns of the head field and then passes the resulting | ||
| * DataFrame to the tail selector together with `flatten.tail`. `flatten.head` is read | ||
| * unconditionally, so `flatten` must hold an entry for the head field. | ||
| */ | ||
| implicit def hconsSelector[K <: Symbol, H, T <: HList]( | ||
| implicit | ||
| witness: Witness.Aux[K], | ||
| hSelector: Lazy[DataTypeSelector[H]], | ||
| tSelector: AnnotatedStructTypeSelector[T] | ||
| ): AnnotatedStructTypeSelector[FieldType[K, H] :: T] = AnnotatedStructTypeSelector.pure { (df, parentPrefixes, flatten) => | ||
| val fieldName = witness.value.name | ||
| // Append the field name to every parent prefix; without a parent the field name is the only prefix. | ||
| val prefixes = parentPrefixes.map(_.map(_.addSuffix(fieldName))).getOrElse(List(Prefix(fieldName))) | ||
| // A Flatten annotation on the field may expand each prefix into indexed or keyed children. | ||
| val childPrefixes = getChildPrefixes(prefixes, flatten.head) | ||
| val dfHead = hSelector.value.select(df, Some(childPrefixes)) | ||
| // Re-nesting happens only when the field carries an annotation; otherwise dfHead is used as is. | ||
| val dfNested = flatten.head.map { fl => | ||
| val fields = dfHead.schema.fields.map(f => Prefix(f.name)).toList | ||
| // Columns that are not under any child prefix are carried over unchanged. | ||
| val restCols = fields.filter(f => !childPrefixes.exists(_.isParentOf(f))).map(f => dfHead(f.quotedString)) | ||
| // Columns under a child prefix are packed into one struct named after that prefix, | ||
| // each aliased to the last segment of its name. | ||
| val structs = childPrefixes.map { p => | ||
| val cols = fields.filter(_.isChildrenOf(p)).map(f => dfHead(f.quotedString).as(f.getSuffix)) | ||
| struct(cols :_*).as(p.toString) | ||
| } | ||
| val dfStruct = dfHead.select((structs ++ restCols) :_*) | ||
| val nestedCols = getNestedColumns(childPrefixes, dfStruct, fl) | ||
| // Select again following the column order captured in `fields`. | ||
| orderedSelect(dfStruct, nestedCols, fields) | ||
| }.getOrElse(dfHead) | ||
| tSelector.select(dfNested, parentPrefixes, flatten.tail) | ||
| } | ||
|
|
||
| /** | ||
| * Expands `prefixes` according to an optional `Flatten` annotation. | ||
| * | ||
| * With `times > 1` every prefix gets the suffixes `0 until times`; otherwise, with non-empty | ||
| * `keys`, every prefix gets each key as suffix; in all remaining cases `prefixes` is returned as is. | ||
| */ | ||
| private def getChildPrefixes(prefixes: List[Prefix], flatten: Option[Flatten]): List[Prefix] = | ||
| flatten match { | ||
| case Some(Flatten(times, _)) if times > 1 => | ||
| (for { index <- 0 until times; parent <- prefixes } yield parent.addSuffix(index)).toList | ||
| case Some(Flatten(_, keys)) if keys.nonEmpty => | ||
| (for { key <- keys; parent <- prefixes } yield parent.addSuffix(key)).toList | ||
| case Some(Flatten(_, _)) => prefixes | ||
| case None => prefixes | ||
| } | ||
|
|
||
| /** | ||
| * Builds one column per group of `prefixes` sharing the same parent. | ||
| * | ||
| * With `times > 1` the group becomes an array column keyed and aliased by the parent; otherwise, | ||
| * with non-empty `keys`, a map column built from the keys interleaved with the group's columns; | ||
| * in the remaining case the first prefix of the group is mapped to its own column. | ||
| */ | ||
| private def getNestedColumns(prefixes: List[Prefix], df: DataFrame, flatten: Flatten): Map[Prefix, Column] = { | ||
| val byParent = prefixes.groupBy(_.getParent) | ||
| byParent.map { case (parent, children) => | ||
| val childCols = children.map(child => df(child.quotedString)) | ||
| val alias = parent.toString | ||
| flatten match { | ||
| case Flatten(times, _) if times > 1 => (parent, array(childCols :_*).as(alias)) | ||
| case Flatten(_, keys) if keys.nonEmpty => (parent, map(interleave(keys.map(lit), childCols) :_*).as(alias)) | ||
| case Flatten(_, _) => (children.head, childCols.head) | ||
| } | ||
| }(breakOut) | ||
| } | ||
|
|
||
| /** | ||
| * Selects columns of `df` in the order of `fields`, substituting the entries of `nestedCols`. | ||
| * | ||
| * While walking `fields`, a field lying under a key of `nestedCols` emits that key's column once | ||
| * and the leading run of fields under the same key is skipped; any other field is selected from | ||
| * `df` by its back-quoted name. | ||
| */ | ||
| private def orderedSelect(df: DataFrame, nestedCols: Map[Prefix, Column], fields: List[Prefix]): DataFrame = { | ||
| @tailrec | ||
| def loop(nestedCols: Map[Prefix, Column], fields: List[Prefix], cols: List[Column]): List[Column] = fields match { | ||
| // Columns are accumulated by prepending, hence the reverse at the end. | ||
| case Nil => cols.reverse | ||
| case hd +: tail => nestedCols.find { case (p, _) => p.isParentOf(hd) } match { | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. since you have lists you can do
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good idea, unfortunately shapeless overrides the
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah, too bad :( |
||
| // Emit the nested column, remove its key from the map and skip the fields it covers. | ||
| case Some((p, c)) => loop(nestedCols - p, fields.dropWhile(_.isChildrenOf(p)), c +: cols) | ||
| case None => loop(nestedCols, tail, df(hd.quotedString) +: cols) | ||
| } | ||
| } | ||
| val cols = loop(nestedCols, fields, List[Column]()) | ||
| df.select(cols :_*) | ||
| } | ||
|
|
||
| /** Alternates the elements of `a` and `b` pairwise; the result stops at the shorter of the two inputs. */ | ||
| private def interleave[T](a: Seq[T], b: Seq[T]): Seq[T] = | ||
| a.zip(b).flatMap { case (left, right) => List(left, right) } | ||
|
|
||
| /** | ||
| * Derives a `StructTypeSelector` for `A` from its labelled generic representation `H`. | ||
| * | ||
| * The `Flatten` annotations found on `A` are turned into a list and handed, together with the | ||
| * DataFrame and the prefixes, to the selector of `H`. | ||
| */ | ||
| implicit def dfSelector[A, H <: HList, HF <: HList]( | ||
| implicit | ||
| generic: LabelledGeneric.Aux[A, H], | ||
| flattenAnnotations: Annotations.Aux[Flatten, A, HF], | ||
| hSelector: Lazy[AnnotatedStructTypeSelector[H]], | ||
| flattenToList: ToList[HF, Option[Flatten]] | ||
| ): StructTypeSelector[A] = StructTypeSelector.pure { (frame, parentPrefixes) => | ||
| hSelector.value.select(frame, parentPrefixes, flattenAnnotations().toList[Option[Flatten]]) | ||
| } | ||
|
|
||
| // Selectors for the types below are all DataTypeSelector.identityDF instances. | ||
| implicit val binarySelector: DataTypeSelector[Array[Byte]] = DataTypeSelector.identityDF | ||
| implicit val booleanSelector: DataTypeSelector[Boolean] = DataTypeSelector.identityDF | ||
| implicit val byteSelector: DataTypeSelector[Byte] = DataTypeSelector.identityDF | ||
| implicit val dateSelector: DataTypeSelector[java.sql.Date] = DataTypeSelector.identityDF | ||
| implicit val decimalSelector: DataTypeSelector[BigDecimal] = DataTypeSelector.identityDF | ||
| implicit val doubleSelector: DataTypeSelector[Double] = DataTypeSelector.identityDF | ||
| implicit val floatSelector: DataTypeSelector[Float] = DataTypeSelector.identityDF | ||
| implicit val intSelector: DataTypeSelector[Int] = DataTypeSelector.identityDF | ||
| implicit val longSelector: DataTypeSelector[Long] = DataTypeSelector.identityDF | ||
| implicit val nullSelector: DataTypeSelector[Unit] = DataTypeSelector.identityDF | ||
| implicit val shortSelector: DataTypeSelector[Short] = DataTypeSelector.identityDF | ||
| implicit val stringSelector: DataTypeSelector[String] = DataTypeSelector.identityDF | ||
| implicit val timestampSelector: DataTypeSelector[java.sql.Timestamp] = DataTypeSelector.identityDF | ||
| // Options are handled the same way, whatever their element type. | ||
| implicit def optionSelector[T]: DataTypeSelector[Option[T]] = DataTypeSelector.identityDF | ||
|
|
||
| /** Selector for a collection `C[A0]`: forwards both arguments to the selector of the element type. */ | ||
| implicit def traversableOnceSelector[A0, C[_]]( | ||
| implicit | ||
| s: DataTypeSelector[A0], | ||
| is: IsTraversableOnce[C[A0]] { type A = A0 } | ||
| ): DataTypeSelector[C[A0]] = | ||
| DataTypeSelector.pure { (frame, parentPrefixes) => s.select(frame, parentPrefixes) } | ||
|
|
||
| /** Selector for `Map[K, V]`: forwards both arguments to the selector of the value type. */ | ||
| implicit def mapSelector[K, V]( | ||
| implicit s: DataTypeSelector[V] | ||
| ): DataTypeSelector[Map[K, V]] = | ||
| DataTypeSelector.pure { (frame, parentPrefixes) => s.select(frame, parentPrefixes) } | ||
| } | ||
|
|
||
| /** | ||
| * Extension methods on `DataFrame` driven by a `StructTypeSelector`. | ||
| */ | ||
| object DFUtils { | ||
| implicit class FlattenedDataFrame(df: DataFrame) { | ||
| /** Applies the selector of `A` to the wrapped DataFrame, passing no prefixes. */ | ||
| def selectNested[A](implicit s: StructTypeSelector[A]): DataFrame = s.select(df, None) | ||
|
|
||
| /** Result of `selectNested` for `A`, converted to a `Dataset[A]`. */ | ||
| def asNested[A : Encoder : StructTypeSelector]: Dataset[A] = selectNested[A].as[A] | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
could we add the license header in this file and the associated spec?