feat: UNIC-705 add HashTransformation traits (#232)
zoemcl authored Jul 15, 2024
1 parent e707111 commit 4a1b6b2
Showing 5 changed files with 15 additions and 6 deletions.
@@ -1,10 +1,15 @@
 package bio.ferlab.datalake.spark3.transformation
 
-import org.apache.spark.sql.Column
+import org.apache.spark.sql.{Column, DataFrame}
 import org.apache.spark.sql.functions.lit
 import org.apache.spark.sql.types.StringType
 
-trait HashTransformation[A] extends Transformation {self =>
+sealed trait HashTransformation[A] extends Transformation {self =>
   val columns: A
   val nullValues: Column = lit(null).cast(StringType)
 }
+
+object HashTransformation {
+  trait SimpleHashTransformation extends HashTransformation[Seq[String]]
+  trait DynamicHashTransformation extends HashTransformation[DataFrame => Seq[String]]
+}
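
For context, not part of the commit itself: the hierarchy is now sealed at the top, while the two column strategies exposed in the companion object stay open, so concrete transformations in other files extend SimpleHashTransformation (a fixed list of column names) or DynamicHashTransformation (column names computed from the DataFrame). A minimal sketch of how an additional implementation would plug in, assuming only what the diff shows; the Md5Hash name and the use of Spark's md5 function are illustrative:

package bio.ferlab.datalake.spark3.transformation

import bio.ferlab.datalake.spark3.transformation.HashTransformation.SimpleHashTransformation
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, md5, when}

// Hypothetical example (not in this commit): a simple hash transformation over a
// fixed list of column names, mirroring the SHA1/SHA256 implementations below.
case class Md5Hash(override val columns: String*) extends SimpleHashTransformation {
  override def transform: DataFrame => DataFrame = { df =>
    columns.foldLeft(df) { case (d, column) =>
      // nullValues is inherited from HashTransformation, so null inputs stay null.
      d.withColumn(column, when(col(column).isNull, nullValues).otherwise(md5(col(column))))
    }
  }
}
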
@@ -1,5 +1,6 @@
 package bio.ferlab.datalake.spark3.transformation
 
+import bio.ferlab.datalake.spark3.transformation.HashTransformation.SimpleHashTransformation
 import bio.ferlab.datalake.spark3.transformation.PBKDF2.pbkdf2Udf
 import com.roundeights.hasher.Implicits._
 import org.apache.spark.sql._
@@ -17,7 +18,7 @@ import scala.language.postfixOps
  * @param keyLength length of the resulting hash
  * @param columns names of the columns to hash
  */
-case class PBKDF2(salt: String, iteration: Int, keyLength: Int, override val columns: String*) extends HashTransformation[Seq[String]] {
+case class PBKDF2(salt: String, iteration: Int, keyLength: Int, override val columns: String*) extends SimpleHashTransformation {
 
   override def transform: DataFrame => DataFrame = { df =>
 
@@ -1,10 +1,11 @@
 package bio.ferlab.datalake.spark3.transformation
 
+import bio.ferlab.datalake.spark3.transformation.HashTransformation.SimpleHashTransformation
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.StringType
 
-case class SHA1(salt: String, override val columns: String*) extends HashTransformation[Seq[String]] {
+case class SHA1(salt: String, override val columns: String*) extends SimpleHashTransformation {
   override def transform: DataFrame => DataFrame = { df =>
     columns.foldLeft(df){ case (d, column) =>
       d.withColumn(column,
@@ -1,10 +1,11 @@
 package bio.ferlab.datalake.spark3.transformation
 
+import bio.ferlab.datalake.spark3.transformation.HashTransformation.DynamicHashTransformation
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions.{col, concat_ws, lit, sha1, when}
 import org.apache.spark.sql.types.StringType
 
-case class SHA1Dynamic(salt: String, override val columns: DataFrame => Seq[String]) extends HashTransformation[DataFrame => Seq[String]] {
+case class SHA1Dynamic(salt: String, override val columns: DataFrame => Seq[String]) extends DynamicHashTransformation {
 
   override def transform: DataFrame => DataFrame = { df =>
     columns(df).foldLeft(df){ case (d, column) =>
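
A hedged usage sketch for the dynamic variant: DynamicHashTransformation takes a DataFrame => Seq[String], so the columns to hash can be derived from the schema at runtime rather than listed up front. SHA1Dynamic and its (salt, columns) parameters come from the diff above; hashAllStringColumns, inputDf and the string-column selector are illustrative only:

import bio.ferlab.datalake.spark3.transformation.SHA1Dynamic
import org.apache.spark.sql.types.StringType

// Illustrative only: hash every string-typed column of whatever DataFrame comes in.
val hashAllStringColumns = SHA1Dynamic(
  salt = "some-salt",
  columns = df => df.schema.fields.collect { case f if f.dataType == StringType => f.name }.toSeq
)
// hashAllStringColumns.transform(inputDf) then applies the salted SHA-1 to each selected column.
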
@@ -1,10 +1,11 @@
 package bio.ferlab.datalake.spark3.transformation
 
+import bio.ferlab.datalake.spark3.transformation.HashTransformation.SimpleHashTransformation
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.StringType
 
-case class SHA256(salt: String, override val columns: String*) extends HashTransformation[Seq[String]] {
+case class SHA256(salt: String, override val columns: String*) extends SimpleHashTransformation {
   override def transform: DataFrame => DataFrame = { df =>
     columns.foldLeft(df){ case (d, column) =>
       d.withColumn(column,