diff --git a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample1.scala b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample1.scala
index b369932e4..58fe2d223 100644
--- a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample1.scala
+++ b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample1.scala
@@ -16,7 +16,7 @@
package za.co.absa.enceladus.examples
import org.apache.spark.sql.{DataFrame, SparkSession}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.dao.auth.MenasKerberosCredentials
@@ -41,7 +41,7 @@ object CustomRuleSample1 {
// scalastyle:off magic.number
val menasBaseUrls = List("http://localhost:8080/menas")
val meansCredentials = MenasKerberosCredentials("user@EXAMPLE.COM", "src/main/resources/user.keytab.example")
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules)
+ implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules)
implicit val dao: MenasDAO = RestDaoFactory.getInstance(meansCredentials, menasBaseUrls) // you may have to hard-code your own implementation here (if not working with menas)
val experimentalMR = true
diff --git a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample2.scala b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample2.scala
index 6eef8ed86..de79ace35 100644
--- a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample2.scala
+++ b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample2.scala
@@ -17,7 +17,7 @@ package za.co.absa.enceladus.examples
import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.{DataFrame, SparkSession}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.dao.auth.MenasKerberosCredentials
@@ -43,7 +43,7 @@ object CustomRuleSample2 {
val conf = ConfigFactory.load()
val menasBaseUrls = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri"))
val meansCredentials = MenasKerberosCredentials("user@EXAMPLE.COM", "src/main/resources/user.keytab.example")
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules)
+ implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules)
implicit val dao: MenasDAO = RestDaoFactory.getInstance(meansCredentials, menasBaseUrls) // you may have to hard-code your own implementation here (if not working with menas)
val experimentalMR = true
diff --git a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample3.scala b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample3.scala
index 45ef541bf..932fa9fac 100644
--- a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample3.scala
+++ b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample3.scala
@@ -17,7 +17,7 @@ package za.co.absa.enceladus.examples
import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.{DataFrame, SparkSession}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.dao.auth.MenasKerberosCredentials
@@ -38,7 +38,7 @@ object CustomRuleSample3 {
val conf = ConfigFactory.load()
val menasBaseUrls = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri"))
val meansCredentials = MenasKerberosCredentials("user@EXAMPLE.COM", "src/main/resources/user.keytab.example")
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules)
+ implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules)
implicit val dao: MenasDAO = RestDaoFactory.getInstance(meansCredentials, menasBaseUrls) // you may have to hard-code your own implementation here (if not working with menas)
val experimentalMR = true
diff --git a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample4.scala b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample4.scala
index d88607cb8..fcae9619e 100644
--- a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample4.scala
+++ b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample4.scala
@@ -19,7 +19,7 @@ import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.functions.{col, concat, concat_ws, lit}
import org.apache.spark.sql.{DataFrame, DataFrameReader, SparkSession}
import scopt.OptionParser
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.dao.auth.MenasKerberosCredentials
@@ -142,7 +142,7 @@ object CustomRuleSample4 {
val conf = ConfigFactory.load()
val menasBaseUrls = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri"))
val meansCredentials = MenasKerberosCredentials("user@EXAMPLE.COM", "src/main/resources/user.keytab.example")
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules)
+ implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules)
implicit val dao: MenasDAO = RestDaoFactory.getInstance(meansCredentials, menasBaseUrls) // you may have to hard-code your own implementation here (if not working with menas)
val dfReader: DataFrameReader = {
diff --git a/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRule.scala b/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRule.scala
index 934a1414f..9c4b41a8c 100644
--- a/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRule.scala
+++ b/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRule.scala
@@ -16,7 +16,7 @@ package za.co.absa.enceladus.examples.interpreter.rules.custom
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, Row, SparkSession}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.ExplosionState
import za.co.absa.enceladus.conformance.interpreter.rules.RuleInterpreter
import za.co.absa.enceladus.conformance.interpreter.rules.custom.CustomConformanceRule
@@ -40,7 +40,7 @@ case class UppercaseCustomRuleInterpreter(rule: UppercaseCustomConformanceRule)
override def conformanceRule: Option[ConformanceRule] = Some(rule)
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
handleArrays(rule.outputColumn, df) { flattened =>
// we have to do this if this rule is to support arrays
diff --git a/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRule.scala b/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRule.scala
index 7fe746606..c2b061076 100644
--- a/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRule.scala
+++ b/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRule.scala
@@ -16,7 +16,7 @@ package za.co.absa.enceladus.examples.interpreter.rules.custom
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Column, Dataset, Row, SparkSession}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.ExplosionState
import za.co.absa.enceladus.conformance.interpreter.rules.RuleInterpreter
import za.co.absa.enceladus.conformance.interpreter.rules.custom.CustomConformanceRule
@@ -40,7 +40,7 @@ case class StringFuncInterpreter(rule: ColumnFunctionCustomConformanceRule) exte
override def conformanceRule: Option[ConformanceRule] = Some(rule)
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
handleArrays(rule.outputColumn, df) { flattened =>
// we have to do this if this rule is to support arrays
diff --git a/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRuleSuite.scala b/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRuleSuite.scala
index de836828c..fb0202ad2 100644
--- a/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRuleSuite.scala
+++ b/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRuleSuite.scala
@@ -19,7 +19,7 @@ import org.apache.spark.sql
import org.apache.spark.sql.DataFrame
import org.scalatest.FunSuite
import org.scalatest.mockito.MockitoSugar
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.Dataset
@@ -35,7 +35,7 @@ object TestOutputRow {
class UppercaseCustomConformanceRuleSuite extends FunSuite with SparkTestBase with MockitoSugar {
import spark.implicits._
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules)
+ implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules)
implicit val dao: MenasDAO = mock[MenasDAO] // you may have to hard-code your own implementation here (if not working with menas)
val experimentalMR = true
diff --git a/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRuleSuite.scala b/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRuleSuite.scala
index 16c71401e..0716dd4a4 100644
--- a/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRuleSuite.scala
+++ b/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRuleSuite.scala
@@ -20,7 +20,7 @@ import org.apache.spark.sql
import org.apache.spark.sql.DataFrame
import org.scalatest.FunSuite
import org.scalatest.mockito.MockitoSugar
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.dao.auth.MenasKerberosCredentials
@@ -37,7 +37,7 @@ object XPadTestOutputRow {
class LpadCustomConformanceRuleSuite extends FunSuite with SparkTestBase with MockitoSugar {
import spark.implicits._
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules)
+ implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules)
implicit val dao: MenasDAO = mock[MenasDAO] // you may have to hard-code your own implementation here (if not working with menas)
val experimentalMR = true
@@ -185,7 +185,7 @@ class RpadCustomConformanceRuleSuite extends FunSuite with SparkTestBase {
private val conf = ConfigFactory.load()
private val menasBaseUrls = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri"))
private val meansCredentials = MenasKerberosCredentials("user@EXAMPLE.COM", "src/test/resources/user.keytab.example")
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules)
+ implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules)
implicit val dao: MenasDAO = RestDaoFactory.getInstance(meansCredentials, menasBaseUrls) // you may have to hard-code your own implementation here (if not working with menas)
val experimentalMR = true
diff --git a/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginImpl.scala b/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginImpl.scala
index d79331e98..449bb5d42 100644
--- a/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginImpl.scala
+++ b/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginImpl.scala
@@ -18,7 +18,7 @@ package za.co.absa.enceladus.plugins.builtin.errorsender.mq
import org.apache.log4j.LogManager
import org.apache.spark.sql.functions.{col, explode, lit, size, struct}
import org.apache.spark.sql.types.DataTypes
-import org.apache.spark.sql.{DataFrame, Encoders}
+import org.apache.spark.sql.{DataFrame, Encoder, Encoders}
import za.co.absa.enceladus.plugins.api.postprocessor.PostProcessor
import za.co.absa.enceladus.plugins.builtin.common.mq.kafka.KafkaConnectionParams
import za.co.absa.enceladus.plugins.builtin.errorsender.DceError
@@ -27,8 +27,8 @@ import za.co.absa.enceladus.utils.schema.SchemaUtils
import KafkaErrorSenderPluginImpl._
import za.co.absa.enceladus.plugins.builtin.errorsender.mq.kafka.KafkaErrorSenderPlugin
import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams
-import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams.ErrorSourceId
import za.co.absa.enceladus.utils.error.ErrorMessage.ErrorCodes
+import za.co.absa.enceladus.utils.modules._
import scala.util.{Failure, Success, Try}
@@ -87,8 +87,8 @@ case class KafkaErrorSenderPluginImpl(connectionParams: KafkaConnectionParams,
* @return DF with exploded errors and corresponding to the given error source
*/
def getIndividualErrors(dataFrame: DataFrame, params: ErrorSenderPluginParams): DataFrame = {
- implicit val singleErrorStardardizedEncoder = Encoders.product[SingleErrorStardardized]
- implicit val dceErrorEncoder = Encoders.product[DceError]
+ implicit val singleErrorStardardizedEncoder: Encoder[SingleErrorStardardized] = Encoders.product[SingleErrorStardardized]
+ implicit val dceErrorEncoder: Encoder[DceError] = Encoders.product[DceError]
val allowedErrorCodes = KafkaErrorSenderPluginImpl.errorCodesForSource(params.sourceId)
@@ -168,7 +168,7 @@ object KafkaErrorSenderPluginImpl {
informationDate = Some(reportDate.toLocalDate.toEpochDay.toInt),
outputFileName = Some(additionalParams.outputPath),
recordId = recordId,
- errorSourceId = additionalParams.sourceId.toString,
+ errorSourceId = additionalParams.sourceId.value,
errorType = singleError.errType,
errorCode = singleError.errCode,
errorDescription = singleError.errMsg,
@@ -184,9 +184,9 @@ object KafkaErrorSenderPluginImpl {
}
}
- def errorCodesForSource(sourceId: ErrorSourceId.Value): Seq[String] = sourceId match {
- case ErrorSourceId.Standardization => ErrorCodes.standardizationErrorCodes
- case ErrorSourceId.Conformance => ErrorCodes.conformanceErrorCodes
+ def errorCodesForSource(sourceId: SourcePhase): Seq[String] = sourceId match {
+ case SourcePhase.Standardization => ErrorCodes.standardizationErrorCodes
+ case SourcePhase.Conformance => ErrorCodes.conformanceErrorCodes
}
}
diff --git a/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParams.scala b/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParams.scala
index 6e9df9309..9c8deb84b 100644
--- a/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParams.scala
+++ b/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParams.scala
@@ -17,14 +17,14 @@ package za.co.absa.enceladus.plugins.builtin.errorsender.params
import java.time.Instant
-import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams.ErrorSourceId
+import za.co.absa.enceladus.utils.modules.SourcePhase
case class ErrorSenderPluginParams(datasetName: String,
datasetVersion: Int,
reportDate: String,
reportVersion: Int,
outputPath: String,
- sourceId: ErrorSourceId.Value,
+ sourceId: SourcePhase,
sourceSystem: String,
runUrls: Option[String],
runId: Option[Int],
@@ -37,11 +37,6 @@ case class ErrorSenderPluginParams(datasetName: String,
object ErrorSenderPluginParams {
- object ErrorSourceId extends Enumeration {
- val Standardization = Value("standardizaton")
- val Conformance = Value("conformance")
- }
-
object FieldNames {
val datasetName = "datasetName"
val datasetVersion = "datasetVersion"
@@ -65,7 +60,7 @@ object ErrorSenderPluginParams {
reportDate -> params.reportDate,
reportVersion -> params.reportVersion.toString,
outputPath -> params.outputPath,
- sourceId -> params.sourceId.toString,
+ sourceId -> params.sourceId.asIdentifier,
sourceSystem -> params.sourceSystem,
processingTimestamp -> params.processingTimestamp.toString
) ++
@@ -80,7 +75,7 @@ object ErrorSenderPluginParams {
reportDate = params(reportDate),
reportVersion = params(reportVersion).toInt,
outputPath = params(outputPath),
- sourceId = ErrorSourceId.withName(params(sourceId)),
+ sourceId = SourcePhase.withIdentifier(params(sourceId)),
sourceSystem = params(sourceSystem),
runUrls = params.get(runUrls),
runId = params.get(runId).map(_.toInt),
diff --git a/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginSuite.scala b/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginSuite.scala
index b1b01aa37..de95d10e4 100644
--- a/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginSuite.scala
+++ b/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginSuite.scala
@@ -29,7 +29,7 @@ import za.co.absa.enceladus.plugins.builtin.errorsender.DceError
import za.co.absa.enceladus.plugins.builtin.errorsender.mq.KafkaErrorSenderPluginSuite.{TestingErrCol, TestingRecord}
import za.co.absa.enceladus.plugins.builtin.errorsender.mq.kafka.KafkaErrorSenderPlugin
import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams
-import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams.ErrorSourceId
+import za.co.absa.enceladus.utils.modules.SourcePhase
import za.co.absa.enceladus.utils.testUtils.SparkTestBase
@@ -65,17 +65,17 @@ class KafkaErrorSenderPluginSuite extends FlatSpec with SparkTestBase with Match
import spark.implicits._
- val testDataDf = testData.toDF
- val testNow = Instant.now()
+ private val testDataDf = testData.toDF
+ private val testNow = Instant.now()
- val defaultPluginParams = ErrorSenderPluginParams(
+ private val defaultPluginParams = ErrorSenderPluginParams(
"datasetName1", datasetVersion = 1, "2020-03-30", reportVersion = 1, "output/Path1", null,
"sourceSystem1", Some("http://runUrls1"), runId = Some(1), Some("uniqueRunId"), testNow)
"ErrorSenderPluginParams" should "getIndividualErrors (exploding, filtering by source for Standardization)" in {
val plugin = KafkaErrorSenderPluginImpl(null, Map(), Map())
- plugin.getIndividualErrors(testDataDf, defaultPluginParams.copy(sourceId = ErrorSourceId.Standardization))
+ plugin.getIndividualErrors(testDataDf, defaultPluginParams.copy(sourceId = SourcePhase.Standardization))
.as[DceError].collect.map(entry => (entry.errorType, entry.errorCode)) should contain theSameElementsAs Seq(
("stdCastError", "E00000"),
("stdNullError", "E00002"),
@@ -87,7 +87,7 @@ class KafkaErrorSenderPluginSuite extends FlatSpec with SparkTestBase with Match
it should "getIndividualErrors (exploding, filtering by source for Conformance)" in {
val plugin = KafkaErrorSenderPluginImpl(null, Map(), Map())
- plugin.getIndividualErrors(testDataDf, defaultPluginParams.copy(sourceId = ErrorSourceId.Conformance))
+ plugin.getIndividualErrors(testDataDf, defaultPluginParams.copy(sourceId = SourcePhase.Conformance))
.as[DceError].collect.map(entry => (entry.errorType, entry.errorCode)) should contain theSameElementsAs Seq(
("confMapError", "E00001"),
("confCastError", "E00003"),
@@ -101,7 +101,7 @@ class KafkaErrorSenderPluginSuite extends FlatSpec with SparkTestBase with Match
val testKafkaUrl = "http://example.com:9092"
val testSchemaRegUrl = "http://example.com:8081"
- val testConfig = ConfigFactory.empty()
+ private val testConfig = ConfigFactory.empty()
.withValue("kafka.error.client.id", ConfigValueFactory.fromAnyRef(testClientId))
.withValue("kafka.error.topic.name", ConfigValueFactory.fromAnyRef(testTopicName))
.withValue("kafka.bootstrap.servers", ConfigValueFactory.fromAnyRef(testKafkaUrl))
@@ -143,7 +143,7 @@ class KafkaErrorSenderPluginSuite extends FlatSpec with SparkTestBase with Match
// onlyConformanceErrorsDataDf should result in 0 std errors
val onlyConformanceErrorsDataDf = Seq(testData(1)).toDF
- errorKafkaPlugin.onDataReady(onlyConformanceErrorsDataDf, defaultPluginParams.copy(sourceId = ErrorSourceId.Standardization).toMap)
+ errorKafkaPlugin.onDataReady(onlyConformanceErrorsDataDf, defaultPluginParams.copy(sourceId = SourcePhase.Standardization).toMap)
assert(sendErrorsToKafkaWasCalled == false, "KafkaErrorSenderPluginImpl.sentErrorToKafka should not be called for 0 errors")
}
@@ -160,11 +160,11 @@ class KafkaErrorSenderPluginSuite extends FlatSpec with SparkTestBase with Match
}
Seq(
- ErrorSourceId.Standardization -> Seq(
+ SourcePhase.Standardization -> Seq(
"standardizaton,stdCastError,E00000,Standardization Error - Type cast",
"standardizaton,stdNullError,E00002,Standardization Error - Null detected in non-nullable attribute"
),
- ErrorSourceId.Conformance -> Seq(
+ SourcePhase.Conformance -> Seq(
"conformance,confNegErr,E00004,Conformance Negation Error",
"conformance,confLitErr,E00005,Conformance Literal Error"
)
diff --git a/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParamsSuite.scala b/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParamsSuite.scala
index 0c4513bd7..07c03eb6f 100644
--- a/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParamsSuite.scala
+++ b/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParamsSuite.scala
@@ -18,17 +18,17 @@ package za.co.absa.enceladus.plugins.builtin.errorsender.params
import java.time.Instant
import org.scalatest.{FlatSpec, Matchers}
-import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams.ErrorSourceId
+import za.co.absa.enceladus.utils.modules.SourcePhase
class ErrorSenderPluginParamsSuite extends FlatSpec with Matchers {
- val params = ErrorSenderPluginParams(
+ private val params = ErrorSenderPluginParams(
datasetName = "datasetName1",
datasetVersion = 1,
reportDate = "2020-03-30",
reportVersion = 1,
outputPath = "output/Path1",
- sourceId = ErrorSourceId.Conformance,
+ sourceId = SourcePhase.Conformance,
sourceSystem = "sourceSystem1",
runUrls = Some("http://runUrls1"),
runId = Some(1),
diff --git a/pom.xml b/pom.xml
index 0b7997f48..50bd4e562 100644
--- a/pom.xml
+++ b/pom.xml
@@ -159,7 +159,7 @@
3.6.4
2.10.0
0.5.0
- 3.7.0
+ 4.0.0-RC2
0-10
3.1.1
2.0.0.RELEASE
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/common/CommonJobExecution.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/CommonJobExecution.scala
new file mode 100644
index 000000000..38d9eebac
--- /dev/null
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/CommonJobExecution.scala
@@ -0,0 +1,225 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.common
+
+import java.text.MessageFormat
+import java.time.Instant
+
+import com.typesafe.config.{Config, ConfigFactory}
+import org.apache.spark.SPARK_VERSION
+import org.apache.spark.sql.SparkSession
+import org.slf4j.{Logger, LoggerFactory}
+import za.co.absa.atum.AtumImplicits
+import za.co.absa.atum.core.Atum
+import za.co.absa.enceladus.common.config.{JobConfigParser, PathConfig}
+import za.co.absa.enceladus.common.plugin.PostProcessingService
+import za.co.absa.enceladus.common.plugin.menas.{MenasPlugin, MenasRunUrl}
+import za.co.absa.enceladus.common.version.SparkVersionGuard
+import za.co.absa.enceladus.dao.MenasDAO
+import za.co.absa.enceladus.dao.rest.MenasConnectionStringParser
+import za.co.absa.enceladus.model.Dataset
+import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams
+import za.co.absa.enceladus.utils.config.SecureConfig
+import za.co.absa.enceladus.utils.fs.FileSystemVersionUtils
+import za.co.absa.enceladus.utils.general.ProjectMetadataTools
+import za.co.absa.enceladus.utils.modules.SourcePhase
+import za.co.absa.enceladus.utils.performance.PerformanceMeasurer
+import za.co.absa.enceladus.utils.time.TimeZoneNormalizer
+
+import scala.util.control.NonFatal
+import scala.util.{Failure, Success, Try}
+
+trait CommonJobExecution {
+
+ protected case class PreparationResult(
+ dataset: Dataset,
+ reportVersion: Int,
+ pathCfg: PathConfig,
+ performance: PerformanceMeasurer
+ )
+
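+ // Trait initialization: normalize the JVM time zone and verify that the current Spark version
+ // is compatible before any job logic runs.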
+ TimeZoneNormalizer.normalizeJVMTimeZone()
+ SparkVersionGuard.fromDefaultSparkCompatibilitySettings.ensureSparkVersionCompatibility(SPARK_VERSION)
+
+ protected val log: Logger = LoggerFactory.getLogger(this.getClass)
+ protected val conf: Config = ConfigFactory.load()
+ protected val menasBaseUrls: List[String] = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri"))
+
+ protected def obtainSparkSession[T]()(implicit cmd: JobConfigParser[T]): SparkSession = {
+ val enceladusVersion = ProjectMetadataTools.getEnceladusVersion
+ log.info(s"Enceladus version $enceladusVersion")
+ val reportVersion = cmd.reportVersion.map(_.toString).getOrElse("")
+ val spark = SparkSession.builder()
+ .appName(s"Standardisation $enceladusVersion ${cmd.datasetName} ${cmd.datasetVersion} ${cmd.reportDate} $reportVersion")
+ .getOrCreate()
+ TimeZoneNormalizer.normalizeSessionTimeZone(spark)
+ spark
+ }
+
+ protected def initialValidation(): Unit = {
+ // This should be the first thing the app does to make secure Kafka work with our CA.
+ // After Spring activates JavaX, it will be too late.
+ SecureConfig.setSecureKafkaProperties(conf)
+ }
+
+ protected def prepareJob[T]()
+ (implicit dao: MenasDAO,
+ cmd: JobConfigParser[T],
+ fsUtils: FileSystemVersionUtils,
+ spark: SparkSession): PreparationResult = {
+ dao.authenticate()
+ val dataset = dao.getDataset(cmd.datasetName, cmd.datasetVersion)
+ val reportVersion = getReportVersion(cmd, dataset)
+ val pathCfg = getPathCfg(cmd, dataset, reportVersion)
+
+ log.info(s"input path: ${pathCfg.inputPath}")
+ log.info(s"output path: ${pathCfg.outputPath}")
+ // die if the output path exists
+ validateForExistingOutputPath(fsUtils, pathCfg)
+
+ val performance = initPerformanceMeasurer(pathCfg.inputPath)
+
+ // Enable Spline
+ import za.co.absa.spline.core.SparkLineageInitializer._
+ spark.enableLineageTracking()
+
+ // Enable non-default persistence storage level if provided in the command line
+ cmd.persistStorageLevel.foreach(Atum.setCachingStorageLevel)
+
+ PreparationResult(dataset, reportVersion, pathCfg, performance)
+ }
+
+ protected def runPostProcessing[T](sourceId: SourcePhase, preparationResult: PreparationResult, jobCmdConfig: JobConfigParser[T])
+ (implicit spark: SparkSession, fileSystemVersionUtils: FileSystemVersionUtils): Unit = {
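+ // Re-read the written output and pass it, together with run metadata collected from Atum and Menas,
+ // to the post-processing plugins (e.g. the error-sender plugin).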
+ val df = spark.read.parquet(preparationResult.pathCfg.outputPath)
+ val runId = MenasPlugin.runNumber
+
+ if (runId.isEmpty) {
+ log.warn("No run number found, the Run URL cannot be properly reported!")
+ }
+
+ // Report the UI URL(s); if there is more than one, they are comma-separated
+ val runUrl: Option[String] = runId.map { runNumber =>
+ menasBaseUrls.map { menasBaseUrl =>
+ MenasRunUrl.getMenasUiRunUrl(menasBaseUrl, jobCmdConfig.datasetName, jobCmdConfig.datasetVersion, runNumber)
+ }.mkString(",")
+ }
+
+ val sourceSystem = Atum.getControlMeasure.metadata.sourceApplication
+ val uniqueRunId = Atum.getControlMeasure.runUniqueId
+
+ val params = ErrorSenderPluginParams(jobCmdConfig.datasetName,
+ jobCmdConfig.datasetVersion, jobCmdConfig.reportDate, preparationResult.reportVersion, preparationResult.pathCfg.outputPath,
+ sourceId, sourceSystem, runUrl, runId, uniqueRunId, Instant.now)
+ val postProcessingService = PostProcessingService(conf, params)
+ postProcessingService.onSaveOutput(df)
+ }
+
+ protected def finishJob[T](jobConfig: JobConfigParser[T]): Unit = {
+ val name = jobConfig.datasetName
+ val version = jobConfig.datasetVersion
+ MenasPlugin.runNumber.foreach { runNumber =>
+ menasBaseUrls.foreach { menasBaseUrl =>
+ val apiUrl = MenasRunUrl.getMenasApiRunUrl(menasBaseUrl, name, version, runNumber)
+ val uiUrl = MenasRunUrl.getMenasUiRunUrl(menasBaseUrl, name, version, runNumber)
+
+ log.info(s"Menas API Run URL: $apiUrl")
+ log.info(s"Menas UI Run URL: $uiUrl")
+ }
+ }
+ }
+
+ protected def getPathCfg[T](cmd: JobConfigParser[T], dataset: Dataset, reportVersion: Int): PathConfig
+
+ protected def getStandardizationPath[T](jobConfig: JobConfigParser[T], reportVersion: Int): String = {
+ MessageFormat.format(conf.getString("standardized.hdfs.path"),
+ jobConfig.datasetName,
+ jobConfig.datasetVersion.toString,
+ jobConfig.reportDate,
+ reportVersion.toString)
+ }
+
+ protected def handleControlInfoValidation(): Unit = {
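+ // The 'control.info.validation' setting decides how a failed raw/source record-count check is handled:
+ // "strict" fails the job, "warning" logs the problem, "none" ignores it.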
+ ControlInfoValidation.addRawAndSourceRecordCountsToMetadata() match {
+ case Failure(ex: za.co.absa.enceladus.utils.validation.ValidationException) =>
+ val confEntry = "control.info.validation"
+ conf.getString(confEntry) match {
+ case "strict" => throw ex
+ case "warning" => log.warn(ex.msg)
+ case "none" =>
+ case _ => throw new RuntimeException(s"Invalid $confEntry value")
+ }
+ case Failure(ex) => throw ex
+ case Success(_) =>
+ }
+ }
+
+ protected def validateForExistingOutputPath(fsUtils: FileSystemVersionUtils, pathCfg: PathConfig): Unit = {
+ if (fsUtils.hdfsExists(pathCfg.outputPath)) {
+ throw new IllegalStateException(
+ s"Path ${pathCfg.outputPath} already exists. Increment the run version, or delete ${pathCfg.outputPath}"
+ )
+ }
+ }
+
+ protected def writePerformanceMetrics[T](performance: PerformanceMeasurer, jobCmdConfig: JobConfigParser[T]): Unit = {
+ jobCmdConfig.performanceMetricsFile.foreach(fileName => try {
+ performance.writeMetricsToFile(fileName)
+ } catch {
+ case NonFatal(e) => log.error(s"Unable to write performance metrics to file '$fileName': ${e.getMessage}")
+ })
+ }
+
+ protected def handleEmptyOutput(job: SourcePhase)(implicit spark: SparkSession): Unit = {
+ import za.co.absa.atum.core.Constants._
+
+ val areCountMeasurementsAllZero = Atum.getControlMeasure.checkpoints
+ .flatMap(checkpoint =>
+ checkpoint.controls.filter(control =>
+ control.controlName.equalsIgnoreCase(controlTypeRecordCount)))
+ .forall(m => Try(m.controlValue.toString.toDouble).toOption.contains(0D))
+
+ if (areCountMeasurementsAllZero) {
+ log.warn(s"Empty output after running $job. Previous checkpoints show this is correct.")
+ } else {
+ val errMsg = s"Empty output after running $job, while previous checkpoints show a non-zero record count"
+ AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError(job.toString, errMsg, "")
+ throw new IllegalStateException(errMsg)
+ }
+ }
+
+ private def getReportVersion[T](jobConfig: JobConfigParser[T], dataset: Dataset)(implicit fsUtils: FileSystemVersionUtils): Int = {
+ jobConfig.reportVersion match {
+ case Some(version) => version
+ case None =>
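+ // Report version not supplied: infer the next version from the latest one already present
+ // under the dataset's publish path for this report date.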
+ val newVersion = fsUtils.getLatestVersion(dataset.hdfsPublishPath, jobConfig.reportDate) + 1
+ log.warn(s"Report version not provided, inferred report version: $newVersion")
+ log.warn("This is an EXPERIMENTAL feature.")
+ log.warn(" -> It can lead to issues when running multiple jobs on a dataset concurrently.")
+ log.warn(" -> It may not work as desired when there are gaps in the versions of the data being landed.")
+ newVersion
+ }
+ }
+
+ private def initPerformanceMeasurer(path: String)
+ (implicit spark: SparkSession, fsUtils: FileSystemVersionUtils): PerformanceMeasurer = {
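+ // The measurement starts from the size of the job's input directory.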
+ val performance = new PerformanceMeasurer(spark.sparkContext.appName)
+ val stdDirSize = fsUtils.getDirectorySize(path)
+ performance.startMeasurement(stdDirSize)
+ performance
+ }
+}
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/ConfigError.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/ConfigError.scala
new file mode 100644
index 000000000..3a8570d3e
--- /dev/null
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/ConfigError.scala
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.common.config
+
+case class ConfigError(message: String) extends Exception(message)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/JobConfigParser.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/JobConfigParser.scala
new file mode 100644
index 000000000..03b217588
--- /dev/null
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/JobConfigParser.scala
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.common.config
+
+import org.apache.spark.storage.StorageLevel
+import scopt.OParser
+import za.co.absa.enceladus.dao.auth.{InvalidMenasCredentialsFactory, MenasCredentialsFactory, MenasKerberosCredentialsFactory, MenasPlainCredentialsFactory}
+
+import scala.util.matching.Regex
+
+
+trait JobConfigParser[R] {
+ def withDatasetName(value: String): R
+ def withDatasetVersion(value: Int): R
+ def withReportDate(value: String): R
+ def withReportVersion(value: Option[Int]): R
+ def withPerformanceMetricsFile(value: Option[String]): R
+ def withFolderPrefix(value: Option[String]): R
+ def withCredsFile(value: Option[String], menasCredentialsFactory: MenasCredentialsFactory): R
+ def withAuthKeytab(value: Option[String], menasCredentialsFactory: MenasCredentialsFactory): R
+ def withPersistStorageLevel(value: Option[StorageLevel]): R
+
+ def datasetName: String
+ def reportDate: String
+ def menasCredentialsFactory: MenasCredentialsFactory
+ def datasetVersion: Int
+ def reportVersion: Option[Int]
+ def performanceMetricsFile: Option[String]
+ def folderPrefix: Option[String]
+ def persistStorageLevel: Option[StorageLevel]
+ def credsFile: Option[String]
+ def keytabFile: Option[String]
+}
+
+object JobConfigParser {
+
+ //scalastyle:off method.length the length is legit for parsing input parameters
+ def jobConfigParser[R <: JobConfigParser[R]]: OParser[_, R] = {
+ val builder = OParser.builder[R]
+ import builder._
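+ // Common scopt options shared by the Enceladus jobs; concrete job configs implement
+ // JobConfigParser[R] and are expected to compose this parser with their job-specific options.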
+ OParser.sequence(head("Job Parameters"),
+ opt[String]('D', "dataset-name").required().action((value, config) =>
+ config.withDatasetName(value)).text("Dataset name"),
+
+ opt[Int]('d', "dataset-version").required().action((value, config) =>
+ config.withDatasetVersion(value)).text("Dataset version")
+ .validate(value =>
+ if (value > 0) {
+ success
+ } else {
+ failure("Option --dataset-version must be > 0")
+ }),
+
+ opt[String]('R', "report-date").required().action((value, config) =>
+ config.withReportDate(value)).text("Report date in 'yyyy-MM-dd' format")
+ .validate(value => {
+ val reportDateMatcher: Regex = "^\\d{4}-\\d{2}-\\d{2}$".r
+ reportDateMatcher.findFirstIn(value) match {
+ case None => failure(s"Match error in '$value'. Option --report-date expects a date in 'yyyy-MM-dd' format")
+ case _ => success
+ }
+ }),
+
+ opt[Int]('r', "report-version").optional().action((value, config) =>
+ config.withReportVersion(Some(value)))
+ .text("Report version. If not provided, it is inferred based on the publish path (it's an EXPERIMENTAL feature)")
+ .validate(value =>
+ if (value > 0) {
+ success
+ } else {
+ failure("Option --report-version must be >0")
+ }),
+
+ opt[String]("menas-credentials-file").hidden.optional().action({ (file, config) =>
+ config.withCredsFile(Option(file), new MenasPlainCredentialsFactory(file))
+ }).text("Path to Menas credentials config file."),
+
+ opt[String]("menas-auth-keytab").optional().action({ (file, config) => {
+ config.withAuthKeytab(Option(file), new MenasKerberosCredentialsFactory(file))
+ }
+ }).text("Path to keytab file used for authenticating to Menas"),
+
+
+ opt[String]("performance-file").optional().action((value, config) =>
+ config.withPerformanceMetricsFile(Option(value)))
+ .text("Produce a performance metrics file at the given location (local filesystem)"),
+
+ opt[String]("folder-prefix").optional().action((value, config) =>
+ config.withFolderPrefix(Option(value))).text("Adds a folder prefix before the infoDateColumn"),
+
+ opt[String]("persist-storage-level").optional().action((value, config) =>
+ config.withPersistStorageLevel(Some(StorageLevel.fromString(value))))
+ .text("Specifies persistence storage level to use when processing data. Spark's default is MEMORY_AND_DISK."),
+
+ checkConfig { config =>
+ config.menasCredentialsFactory match {
+ case InvalidMenasCredentialsFactory => failure("No authentication method specified (e.g. --menas-auth-keytab)")
+ case _ if config.credsFile.isDefined && config.keytabFile.isDefined =>
+ failure("Only one authentication method is allowed at a time")
+ case _ => success
+ }
+ }
+ )
+ }
+ //scalastyle:on method.length
+}
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/PathConfig.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/PathConfig.scala
new file mode 100644
index 000000000..36f40c83e
--- /dev/null
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/PathConfig.scala
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.common.config
+
+case class PathConfig(inputPath: String, outputPath: String)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/common/plugin/PostProcessingService.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/plugin/PostProcessingService.scala
index 3fd3dd83a..983c20474 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/common/plugin/PostProcessingService.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/plugin/PostProcessingService.scala
@@ -15,51 +15,12 @@
package za.co.absa.enceladus.common.plugin
-import java.time.Instant
-
import com.typesafe.config.Config
import org.apache.log4j.LogManager
import org.apache.spark.sql.{DataFrame, SparkSession}
-import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams.ErrorSourceId._
import za.co.absa.enceladus.plugins.api.postprocessor.PostProcessor
import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams
-
-object PostProcessingService {
- //scalastyle:off parameter.number
- def forStandardization(config: Config,
- datasetName: String,
- datasetVersion: Int,
- reportDate: String,
- reportVersion: Int,
- outputPath: String,
- sourceSystem: String,
- runUrls: Option[String],
- runId: Option[Int],
- uniqueRunId: Option[String],
- processingTimestamp: Instant): PostProcessingService = {
- val params = ErrorSenderPluginParams(datasetName, datasetVersion, reportDate, reportVersion, outputPath,
- Standardization, sourceSystem, runUrls, runId, uniqueRunId, processingTimestamp)
- PostProcessingService(config, params)
- }
-
- def forConformance(config: Config,
- datasetName: String,
- datasetVersion: Int,
- reportDate: String,
- reportVersion: Int,
- outputPath: String,
- sourceSystem: String,
- runUrls: Option[String],
- runId: Option[Int],
- uniqueRunId: Option[String],
- processingTimestamp: Instant): PostProcessingService = {
- val params = ErrorSenderPluginParams(datasetName, datasetVersion, reportDate, reportVersion, outputPath,
- Conformance, sourceSystem, runUrls, runId, uniqueRunId, processingTimestamp)
- PostProcessingService(config, params)
- //scalastyle:on parameter.number
- }
-
-}
+import za.co.absa.enceladus.utils.modules.SourcePhase._
case class PostProcessingService private(config: Config, additionalParams: ErrorSenderPluginParams) {
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConfCmdConfig.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConfCmdConfig.scala
deleted file mode 100644
index 6e96039e7..000000000
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConfCmdConfig.scala
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright 2018 ABSA Group Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package za.co.absa.enceladus.conformance
-
-import org.apache.spark.storage.StorageLevel
-import scopt.OptionParser
-import za.co.absa.enceladus.dao.auth._
-
-import scala.util.matching.Regex
-
-/**
- * This is a class for configuration provided by the command line parameters
- *
- * Note: scopt requires all fields to have default values.
- * Even if a field is mandatory it needs a default value.
- */
-case class ConfCmdConfig(datasetName: String = "",
- datasetVersion: Int = 1,
- reportDate: String = "",
- reportVersion: Option[Int] = None,
- menasCredentialsFactory: MenasCredentialsFactory = InvalidMenasCredentialsFactory,
- performanceMetricsFile: Option[String] = None,
- publishPathOverride: Option[String] = None,
- folderPrefix: Option[String] = None,
- experimentalMappingRule: Option[Boolean] = None,
- isCatalystWorkaroundEnabled: Option[Boolean] = None,
- autocleanStandardizedFolder: Option[Boolean] = None,
- persistStorageLevel: Option[StorageLevel] = None)
-
-object ConfCmdConfig {
-
- def getCmdLineArguments(args: Array[String]): ConfCmdConfig = {
- val parser = new CmdParser("spark-submit [spark options] ConformanceBundle.jar")
-
- val optionCmd = parser.parse(args, ConfCmdConfig())
- if (optionCmd.isEmpty) {
- // Wrong arguments provided, the message is already displayed
- System.exit(1)
- }
- optionCmd.get
- }
-
- private class CmdParser(programName: String) extends OptionParser[ConfCmdConfig](programName) {
- head("Dynamic Conformance", "")
-
- opt[String]('D', "dataset-name").required().action((value, config) =>
- config.copy(datasetName = value)).text("Dataset name")
-
- opt[Int]('d', "dataset-version").required().action((value, config) =>
- config.copy(datasetVersion = value)).text("Dataset version")
- .validate(value =>
- if (value > 0) {
- success
- } else {
- failure("Option --dataset-version must be >0")
- })
-
- val reportDateMatcher: Regex = "^\\d{4}-\\d{2}-\\d{2}$".r
- opt[String]('R', "report-date").required().action((value, config) =>
- config.copy(reportDate = value)).text("Report date in 'yyyy-MM-dd' format")
- .validate(value =>
- reportDateMatcher.findFirstIn(value) match {
- case None => failure(s"Match error in '$value'. Option --report-date expects a date in 'yyyy-MM-dd' format")
- case _ => success
- })
-
- opt[Int]('r', "report-version").optional().action((value, config) =>
- config.copy(reportVersion = Some(value)))
- .text("Report version. If not provided, it is inferred based on the publish path (it's an EXPERIMENTAL feature)")
- .validate(value =>
- if (value > 0) {
- success
- } else {
- failure("Option --report-version must be >0")
- })
-
- private var credsFile: Option[String] = None
- private var keytabFile: Option[String] = None
- opt[String]("menas-credentials-file").hidden.optional().action({ (file, config) =>
- credsFile = Some(file)
- config.copy(menasCredentialsFactory = new MenasPlainCredentialsFactory(file))
- }).text("Path to Menas credentials config file.").validate(path =>
- if (keytabFile.isDefined) {
- failure("Only one authentication method is allow at a time")
- } else {
- success
- })
-
- opt[String]("menas-auth-keytab").optional().action({ (file, config) =>
- keytabFile = Some(file)
- config.copy(menasCredentialsFactory = new MenasKerberosCredentialsFactory(file))
- }).text("Path to keytab file used for authenticating to menas").validate({ file =>
- if (credsFile.isDefined) {
- failure("Only one authentication method is allowed at a time")
- } else {
- success
- }
- })
-
- opt[String]("performance-file").optional().action((value, config) =>
- config.copy(performanceMetricsFile = Option(value))).text("Produce a performance metrics file at the given location (local filesystem)")
-
- opt[String]("debug-set-publish-path").optional().hidden().action((value, config) =>
- config.copy(publishPathOverride = Option(value))).text("override the path of the published data (used internally for testing)")
-
- opt[String]("folder-prefix").optional().action((value, config) =>
- config.copy(folderPrefix = Option(value))).text("Adds a folder prefix before the infoDateColumn")
-
- opt[Boolean]("experimental-mapping-rule").optional().action((value, config) =>
- config.copy(experimentalMappingRule = Option(value))).text("Use experimental optimized mapping conformance rule")
-
- opt[Boolean]("catalyst-workaround").optional().action((value, config) =>
- config.copy(isCatalystWorkaroundEnabled = Option(value))).text("Turn on or off Catalyst workaround feature. " +
- "This overrides 'conformance.catalyst.workaround' configuration value provided in 'application.conf'.")
-
- opt[Boolean]("autoclean-std-folder").optional().action((value, config) =>
- config.copy(autocleanStandardizedFolder = Option(value))).text("Deletes standardized data from HDFS once " +
- "it is successfully conformed. This overrides 'conformance.autoclean.standardized.hdfs.folder' configuration " +
- " value provided in 'application.conf'.")
-
- opt[String]("persist-storage-level").optional().action((value, config) =>
- config.copy(persistStorageLevel = Some(StorageLevel.fromString(value))))
- .text("Specifies persistence storage level to use when processing data. Spark's default is MEMORY_AND_DISK.")
-
- help("help").text("prints this usage text")
-
- checkConfig { config =>
- config.menasCredentialsFactory match {
- case InvalidMenasCredentialsFactory => failure("No authentication method specified (e.g. --menas-auth-keytab)")
- case _ => success
- }
- }
- }
-
-}
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConformanceExecution.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConformanceExecution.scala
new file mode 100644
index 000000000..ae271768c
--- /dev/null
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConformanceExecution.scala
@@ -0,0 +1,180 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.conformance
+
+import java.io.{PrintWriter, StringWriter}
+
+import org.apache.spark.sql.functions.{lit, to_date}
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import za.co.absa.atum.AtumImplicits
+import za.co.absa.atum.AtumImplicits._
+import za.co.absa.atum.core.Atum
+import za.co.absa.enceladus.common.Constants.{InfoDateColumn, InfoDateColumnString, InfoVersionColumn, ReportDateFormat}
+import za.co.absa.enceladus.common.RecordIdGeneration._
+import za.co.absa.enceladus.common.config.{JobConfigParser, PathConfig}
+import za.co.absa.enceladus.common.plugin.menas.MenasPlugin
+import za.co.absa.enceladus.common.{CommonJobExecution, Constants, RecordIdGeneration}
+import za.co.absa.enceladus.conformance.config.{ConformanceConfig, ConformanceParser}
+import za.co.absa.enceladus.conformance.interpreter.rules.ValidationException
+import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches}
+import za.co.absa.enceladus.dao.MenasDAO
+import za.co.absa.enceladus.dao.auth.MenasCredentials
+import za.co.absa.enceladus.model.Dataset
+import za.co.absa.enceladus.utils.fs.FileSystemVersionUtils
+import za.co.absa.enceladus.utils.implicits.DataFrameImplicits.DataFrameEnhancements
+import za.co.absa.enceladus.utils.modules.SourcePhase
+import za.co.absa.enceladus.utils.performance.PerformanceMetricTools
+import za.co.absa.enceladus.utils.schema.SchemaUtils
+
+import scala.util.control.NonFatal
+import scala.util.{Failure, Success, Try}
+
+trait ConformanceExecution extends CommonJobExecution {
+ private val conformanceReader = new ConformancePropertiesProvider
+ private val sourceId = SourcePhase.Conformance
+
+ protected def prepareConformance[T](preparationResult: PreparationResult)
+ (implicit dao: MenasDAO,
+ cmd: ConformanceParser[T],
+ fsUtils: FileSystemVersionUtils,
+ spark: SparkSession
+ ): Unit = {
+ // Enable Control Framework
+ import za.co.absa.atum.AtumImplicits.SparkSessionWrapper
+
+ spark.enableControlMeasuresTracking(s"${preparationResult.pathCfg.inputPath}/_INFO")
+ .setControlMeasuresWorkflow(sourceId.toString)
+
+ // Enable control framework performance optimization for pipeline-like jobs
+ Atum.setAllowUnpersistOldDatasets(true)
+
+ // Enable Menas plugin for Control Framework
+ MenasPlugin.enableMenas(
+ conf,
+ cmd.datasetName,
+ cmd.datasetVersion,
+ cmd.reportDate,
+ preparationResult.reportVersion)
+ }
+
+ protected def readConformanceInputData(pathCfg: PathConfig)(implicit spark: SparkSession): DataFrame = {
+ spark.read.parquet(pathCfg.inputPath)
+ }
+
+ protected def conform(inputData: DataFrame, preparationResult: PreparationResult)
+ (implicit spark: SparkSession, cmd: ConformanceConfig, dao: MenasDAO): DataFrame = {
+ val recordIdGenerationStrategy = getRecordIdGenerationStrategyFromConfig(conf)
+
+ implicit val featureSwitcher: FeatureSwitches = conformanceReader.readFeatureSwitches()
+
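+ // Run the dynamic conformance interpreter; validation and unexpected failures are recorded
+ // as Atum control-measurement errors before being rethrown.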
+ Try {
+ handleControlInfoValidation()
+ DynamicInterpreter.interpret(preparationResult.dataset, inputData)
+ } match {
+ case Failure(e: ValidationException) =>
+ AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError(sourceId.toString, e.getMessage, e.techDetails)
+ throw e
+ case Failure(NonFatal(e)) =>
+ val sw = new StringWriter
+ e.printStackTrace(new PrintWriter(sw))
+ AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError(sourceId.toString, e.getMessage, sw.toString)
+ throw e
+ case Success(conformedDF) =>
+ if (SchemaUtils.fieldExists(Constants.EnceladusRecordId, conformedDF.schema)) {
+ conformedDF // no new id regeneration
+ } else {
+ RecordIdGeneration.addRecordIdColumnByStrategy(conformedDF, Constants.EnceladusRecordId, recordIdGenerationStrategy)
+ }
+ }
+ }
+
+ protected def processConformanceResult(args: Array[String],
+ result: DataFrame,
+ preparationResult: PreparationResult,
+ menasCredentials: MenasCredentials)
+ (implicit spark: SparkSession,
+ cmd: ConformanceConfig,
+ fsUtils: FileSystemVersionUtils): Unit = {
+ val cmdLineArgs: String = args.mkString(" ")
+
+ PerformanceMetricTools.addJobInfoToAtumMetadata(
+ "conform",
+ preparationResult.pathCfg.inputPath,
+ preparationResult.pathCfg.outputPath,
+ menasCredentials.username, cmdLineArgs
+ )
+
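+ // Add the information date and version partition columns unless conformance already produced them.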
+ val withPartCols = result
+ .withColumnIfDoesNotExist(InfoDateColumn, to_date(lit(cmd.reportDate), ReportDateFormat))
+ .withColumnIfDoesNotExist(InfoDateColumnString, lit(cmd.reportDate))
+ .withColumnIfDoesNotExist(InfoVersionColumn, lit(preparationResult.reportVersion))
+
+ val recordCount = result.lastCheckpointRowCount match {
+ case None => withPartCols.count
+ case Some(p) => p
+ }
+ if (recordCount == 0) {
+ handleEmptyOutput(SourcePhase.Conformance)
+ }
+
+ // ensure the whole path but version exists
+ fsUtils.createAllButLastSubDir(preparationResult.pathCfg.outputPath)
+
+ withPartCols.write.parquet(preparationResult.pathCfg.outputPath)
+
+ val publishDirSize = fsUtils.getDirectorySize(preparationResult.pathCfg.outputPath)
+ preparationResult.performance.finishMeasurement(publishDirSize, recordCount)
+ PerformanceMetricTools.addPerformanceMetricsToAtumMetadata(
+ spark,
+ "conform",
+ preparationResult.pathCfg.inputPath,
+ preparationResult.pathCfg.outputPath,
+ menasCredentials.username, cmdLineArgs
+ )
+
+ withPartCols.writeInfoFile(preparationResult.pathCfg.outputPath)
+ writePerformanceMetrics(preparationResult.performance, cmd)
+
+ if (conformanceReader.isAutocleanStdFolderEnabled()) {
+ fsUtils.deleteDirectoryRecursively(preparationResult.pathCfg.inputPath)
+ }
+ log.info(s"$sourceId finished successfully")
+ }
+
+ override protected def getPathCfg[T](cmd: JobConfigParser[T], conformance: Dataset, reportVersion: Int): PathConfig = {
+ val confCmd = cmd.asInstanceOf[ConformanceParser[T]]
+ PathConfig(
+ outputPath = buildPublishPath(confCmd, conformance, reportVersion),
+ inputPath = getStandardizationPath(cmd, reportVersion)
+ )
+ }
+
+ def buildPublishPath[T](cmd: ConformanceParser[T],
+ ds: Dataset,
+ reportVersion: Int): String = {
+ val infoDateCol: String = InfoDateColumn
+ val infoVersionCol: String = InfoVersionColumn
+
+ (cmd.publishPathOverride, cmd.folderPrefix) match {
+ case (None, None) =>
+ s"${ds.hdfsPublishPath}/$infoDateCol=${cmd.reportDate}/$infoVersionCol=$reportVersion"
+ case (None, Some(folderPrefix)) =>
+ s"${ds.hdfsPublishPath}/$folderPrefix/$infoDateCol=${cmd.reportDate}/$infoVersionCol=$reportVersion"
+ case (Some(publishPathOverride), _) =>
+ publishPathOverride
+ }
+ }
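+
+  // A worked example (hypothetical values): with ds.hdfsPublishPath = "/publish/mydata",
+  // cmd.reportDate = "2020-01-01" and reportVersion = 2, no overrides resolve to
+  // "/publish/mydata/<infoDateCol>=2020-01-01/<infoVersionCol>=2"; a folderPrefix is inserted
+  // right after the base path, and publishPathOverride takes precedence over both.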
+}
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConformancePropertiesProvider.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConformancePropertiesProvider.scala
new file mode 100644
index 000000000..ed6be86a5
--- /dev/null
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConformancePropertiesProvider.scala
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.conformance
+
+import com.typesafe.config.{Config, ConfigFactory}
+import org.slf4j.{Logger, LoggerFactory}
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
+import za.co.absa.enceladus.utils.config.ConfigUtils.ConfigImplicits
+import za.co.absa.enceladus.conformance.interpreter.{FeatureSwitches, ThreeStateSwitch}
+import ConformancePropertiesProvider._
+
+/**
+ * Resolves conformance job properties from the command-line arguments and the configuration file (application.conf)
+ */
+class ConformancePropertiesProvider {
+ private val enableCF: Boolean = true
+ private val log: Logger = LoggerFactory.getLogger(this.getClass)
+ private implicit val conf: Config = ConfigFactory.load()
+
+ def isAutocleanStdFolderEnabled()(implicit cmd: ConformanceConfig): Boolean = {
+ val enabled = getCmdOrConfigBoolean(cmd.autocleanStandardizedFolder, standardizedHdfsFolderKey, defaultValue = false)
+ log.info(s"Autoclean standardized HDFS folder = $enabled")
+ enabled
+ }
+
+ def readFeatureSwitches()(implicit cmdConfig: ConformanceConfig): FeatureSwitches = FeatureSwitches()
+ .setExperimentalMappingRuleEnabled(isExperimentalRuleEnabled())
+ .setCatalystWorkaroundEnabled(isCatalystWorkaroundEnabled())
+ .setControlFrameworkEnabled(enableCF)
+ .setBroadcastStrategyMode(broadcastingStrategyMode)
+ .setBroadcastMaxSizeMb(broadcastingMaxSizeMb)
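+
+  // A minimal usage sketch (assuming the command-line arguments were already parsed into a ConformanceConfig):
+  //   implicit val cmd: ConformanceConfig = ConformanceConfig.getFromArguments(args)
+  //   val switches: FeatureSwitches = new ConformancePropertiesProvider().readFeatureSwitches()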
+
+ private def isExperimentalRuleEnabled()(implicit cmd: ConformanceConfig): Boolean = {
+ val enabled = getCmdOrConfigBoolean(cmd.experimentalMappingRule, experimentalRuleKey, defaultValue = false)
+ log.info(s"Experimental mapping rule enabled = $enabled")
+ enabled
+ }
+
+ private def isCatalystWorkaroundEnabled()(implicit cmd: ConformanceConfig): Boolean = {
+ val enabled = getCmdOrConfigBoolean(cmd.isCatalystWorkaroundEnabled, catalystWorkaroundKey, defaultValue = true)
+ log.info(s"Catalyst workaround enabled = $enabled")
+ enabled
+ }
+
+ private def broadcastingStrategyMode: ThreeStateSwitch = {
+ ThreeStateSwitch(conf.getString(broadcastStrategyKey))
+ }
+
+ private def broadcastingMaxSizeMb: Int = {
+ conf.getInt(maxBroadcastSizeKey)
+ }
+
+ /**
+ * Returns an effective value of a parameter according to the following priorities:
+ * - Command line arguments [highest]
+ * - Configuration file (application.conf)
+ * - Global default [lowest]
+ *
+ * @param cmdParameterOpt An optional value retrieved from command line arguments
+ * @param configKey A key in a configuration file
+ * @param defaultValue Global default value
+ * @return The effective value of the parameter
+ */
+ private def getCmdOrConfigBoolean(cmdParameterOpt: Option[Boolean], configKey: String, defaultValue: Boolean)
+ (implicit conf: Config): Boolean = {
+ val enabled = cmdParameterOpt match {
+ case Some(b) => b
+ case None => conf.getOptionBoolean(configKey).getOrElse(defaultValue)
+ }
+ enabled
+ }
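+
+  // Example of the resolution order above (hypothetical values): if the flag is absent from the command line
+  // (cmdParameterOpt = None) and application.conf sets the key to true, the result is true; an explicit
+  // Some(false) from the command line wins over both the configuration file and the default.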
+}
+
+object ConformancePropertiesProvider {
+ private val standardizedHdfsFolderKey = "conformance.autoclean.standardized.hdfs.folder"
+ private val maxBroadcastSizeKey = "conformance.mapping.rule.max.broadcast.size.mb"
+ private val experimentalRuleKey = "conformance.mapping.rule.experimental.implementation"
+ private val catalystWorkaroundKey = "conformance.catalyst.workaround"
+ private val broadcastStrategyKey = "conformance.mapping.rule.broadcast"
+}
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/DynamicConformanceJob.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/DynamicConformanceJob.scala
index a590c28c9..631d3224e 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/DynamicConformanceJob.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/DynamicConformanceJob.scala
@@ -15,374 +15,38 @@
package za.co.absa.enceladus.conformance
-import java.io.{PrintWriter, StringWriter}
-import java.text.MessageFormat
-import java.time.Instant
-
-import com.typesafe.config.{Config, ConfigFactory}
-import org.apache.spark.sql.functions.{lit, to_date}
-import org.apache.spark.sql.{DataFrame, Row, SparkSession}
-import org.apache.spark.{SPARK_VERSION, sql}
-import org.slf4j.{Logger, LoggerFactory}
-import za.co.absa.atum.AtumImplicits
-import za.co.absa.atum.AtumImplicits.{DataSetWrapper, StringToPath}
-import za.co.absa.atum.core.Atum
-import za.co.absa.enceladus.common.Constants._
-import za.co.absa.enceladus.common.RecordIdGeneration._
-import za.co.absa.enceladus.common.plugin.PostProcessingService
-import za.co.absa.enceladus.common.plugin.menas.{MenasPlugin, MenasRunUrl}
-import za.co.absa.enceladus.common.version.SparkVersionGuard
-import za.co.absa.enceladus.common.{Constants, ControlInfoValidation, RecordIdGeneration}
-import za.co.absa.enceladus.conformance.interpreter.rules.ValidationException
-import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches, ThreeStateSwitch}
+import org.apache.spark.sql.SparkSession
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.dao.MenasDAO
-import za.co.absa.enceladus.dao.auth.MenasCredentials
-import za.co.absa.enceladus.dao.rest.{MenasConnectionStringParser, RestDaoFactory}
-import za.co.absa.enceladus.model.Dataset
-import za.co.absa.enceladus.utils.config.ConfigUtils.ConfigImplicits
-import za.co.absa.enceladus.utils.config.{ConfigReader, SecureConfig}
+import za.co.absa.enceladus.dao.rest.RestDaoFactory
import za.co.absa.enceladus.utils.fs.FileSystemVersionUtils
-import za.co.absa.enceladus.utils.general.ProjectMetadataTools
-import za.co.absa.enceladus.utils.implicits.DataFrameImplicits.DataFrameEnhancements
-import za.co.absa.enceladus.utils.performance.{PerformanceMeasurer, PerformanceMetricTools}
-import za.co.absa.enceladus.utils.schema.SchemaUtils
-import za.co.absa.enceladus.utils.time.TimeZoneNormalizer
-
-import scala.util.control.NonFatal
-import scala.util.{Failure, Success, Try}
-
-object DynamicConformanceJob {
- TimeZoneNormalizer.normalizeJVMTimeZone()
+import za.co.absa.enceladus.utils.modules.SourcePhase
- private val log: Logger = LoggerFactory.getLogger(this.getClass)
- private val conf: Config = ConfigFactory.load()
- private val confReader: ConfigReader = new ConfigReader(conf)
- private val menasBaseUrls = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri"))
+object DynamicConformanceJob extends ConformanceExecution {
def main(args: Array[String]) {
// This should be the first thing the app does to make secure Kafka work with our CA.
// After Spring activates JavaX, it will be too late.
- SecureConfig.setSecureKafkaProperties(conf)
-
- SparkVersionGuard.fromDefaultSparkCompatibilitySettings.ensureSparkVersionCompatibility(SPARK_VERSION)
-
- confReader.logEffectiveConfigProps(Constants.ConfigKeysToRedact)
+ initialValidation()
- implicit val cmd: ConfCmdConfig = ConfCmdConfig.getCmdLineArguments(args)
+ implicit val cmd: ConformanceConfig = ConformanceConfig.getFromArguments(args)
implicit val spark: SparkSession = obtainSparkSession() // initialize spark
implicit val fsUtils: FileSystemVersionUtils = new FileSystemVersionUtils(spark.sparkContext.hadoopConfiguration)
val menasCredentials = cmd.menasCredentialsFactory.getInstance()
implicit val dao: MenasDAO = RestDaoFactory.getInstance(menasCredentials, menasBaseUrls)
- val enableCF: Boolean = true
-
- dao.authenticate()
-
- // get the dataset definition
- val conformance = dao.getDataset(cmd.datasetName, cmd.datasetVersion)
- val dateTokens = cmd.reportDate.split("-")
-
- val reportVersion = cmd.reportVersion match {
- case Some(version) => version
- case None => inferVersion(conformance.hdfsPublishPath, cmd.reportDate)
- }
-
- val pathCfg = PathCfg(
- publishPath = buildPublishPath(InfoDateColumn, InfoVersionColumn, cmd, conformance, reportVersion),
- stdPath = MessageFormat.format(conf.getString("standardized.hdfs.path"), cmd.datasetName,
- cmd.datasetVersion.toString, cmd.reportDate, reportVersion.toString)
- )
- val recordIdGenerationStrategy = getRecordIdGenerationStrategyFromConfig(conf)
-
- log.info(s"stdpath = ${pathCfg.stdPath}, publishPath = ${pathCfg.publishPath}")
- // die before performing any computation if the output path already exists
- if (fsUtils.hdfsExists(pathCfg.publishPath)) {
- throw new IllegalStateException(
- s"Path ${pathCfg.publishPath} already exists. Increment the run version, or delete ${pathCfg.publishPath}")
- }
-
- initFunctionalExtensions(reportVersion, pathCfg)
- val performance = initPerformanceMeasurer(pathCfg.stdPath)
-
- // load data for input and mapping tables
- val inputData = spark.read.parquet(pathCfg.stdPath)
+ val preparationResult = prepareJob()
+ prepareConformance(preparationResult)
+ val inputData = readConformanceInputData(preparationResult.pathCfg)
try {
- val result = conform(conformance, inputData, enableCF, recordIdGenerationStrategy)
-
- PerformanceMetricTools.addJobInfoToAtumMetadata("conform",
- pathCfg.stdPath, pathCfg.publishPath, menasCredentials.username, args.mkString(" "))
+ val result = conform(inputData, preparationResult)
- processResult(result, performance, pathCfg, reportVersion, args.mkString(" "), menasCredentials)
- log.info("Conformance finished successfully")
-
- // read written data from parquet directly
- val conformedDf = spark.read.parquet(pathCfg.publishPath)
- val postProcessingService = getPostProcessingService(cmd, pathCfg, reportVersion, MenasPlugin.runNumber, Atum.getControlMeasure.runUniqueId)
- postProcessingService.onSaveOutput(conformedDf) // all enabled postProcessors will be run with the std df
+ processConformanceResult(args, result, preparationResult, menasCredentials)
+ runPostProcessing(SourcePhase.Conformance, preparationResult, cmd)
} finally {
-
- MenasPlugin.runNumber.foreach { runNumber =>
- val name = cmd.datasetName
- val version = cmd.datasetVersion
- menasBaseUrls.foreach { menasBaseUrl =>
- log.info(s"Menas API Run URL: $menasBaseUrl/api/runs/$name/$version/$runNumber")
- log.info(s"Menas UI Run URL: $menasBaseUrl/#/runs/$name/$version/$runNumber")
- }
- }
- }
- }
-
- private def getPostProcessingService(cmd: ConfCmdConfig, pathCfg: PathCfg, reportVersion: Int,
- runNumber: Option[Int], uniqueRunId: Option[String]
- )(implicit fsUtils: FileSystemVersionUtils): PostProcessingService = {
- val runId = MenasPlugin.runNumber
-
- if (runId.isEmpty) {
- log.warn("No run number found, the Run URL cannot be properly reported!")
- }
-
- // reporting the UI url(s) - if more than one, its comma-separated
- val runUrl: Option[String] = runId.map { runNumber =>
- menasBaseUrls.map { menasBaseUrl =>
- MenasRunUrl.getMenasUiRunUrl(menasBaseUrl, cmd.datasetName, cmd.datasetVersion, runNumber)
- }.mkString(",")
- }
-
- PostProcessingService.forConformance(conf, cmd.datasetName, cmd.datasetVersion, cmd.reportDate,
- reportVersion, pathCfg.publishPath, Atum.getControlMeasure.metadata.sourceApplication, runUrl,
- runId, uniqueRunId, Instant.now)
- }
-
- private def isExperimentalRuleEnabled()(implicit cmd: ConfCmdConfig): Boolean = {
- val enabled = getCmdOrConfigBoolean(cmd.experimentalMappingRule,
- "conformance.mapping.rule.experimental.implementation",
- defaultValue = false)
- log.info(s"Experimental mapping rule enabled = $enabled")
- enabled
- }
-
- private def isCatalystWorkaroundEnabled()(implicit cmd: ConfCmdConfig): Boolean = {
- val enabled = getCmdOrConfigBoolean(cmd.isCatalystWorkaroundEnabled,
- "conformance.catalyst.workaround",
- defaultValue = true)
- log.info(s"Catalyst workaround enabled = $enabled")
- enabled
- }
-
- private def isAutocleanStdFolderEnabled()(implicit cmd: ConfCmdConfig): Boolean = {
- val enabled = getCmdOrConfigBoolean(cmd.autocleanStandardizedFolder,
- "conformance.autoclean.standardized.hdfs.folder",
- defaultValue = false)
- log.info(s"Autoclean standardized HDFS folder = $enabled")
- enabled
- }
-
- private def broadcastingStrategyMode: ThreeStateSwitch = {
- ThreeStateSwitch(conf.getString("conformance.mapping.rule.broadcast"))
- }
-
- private def broadcastingMaxSizeMb: Int = {
- conf.getInt("conformance.mapping.rule.max.broadcast.size.mb")
- }
-
- /**
- * Returns an effective value of a parameter according to the following priorities:
- * - Command line arguments [highest]
- * - Configuration file (application.conf)
- * - Global default [lowest]
- *
- * @param cmdParameterOpt An optional value retrieved from command line arguments
- * @param configKey A key in a configuration file
- * @param defaultValue Global default value
- * @return The effective value of the parameter
- */
- private def getCmdOrConfigBoolean(cmdParameterOpt: Option[Boolean],
- configKey: String,
- defaultValue: Boolean): Boolean = {
- val enabled = cmdParameterOpt match {
- case Some(b) => b
- case None =>
- conf.getOptionBoolean(configKey).getOrElse(defaultValue)
- }
- enabled
- }
-
- private def obtainSparkSession()(implicit cmd: ConfCmdConfig): SparkSession = {
- val enceladusVersion = ProjectMetadataTools.getEnceladusVersion
- log.info(s"Enceladus version $enceladusVersion")
- val reportVersion = cmd.reportVersion.map(_.toString).getOrElse("")
- val spark: SparkSession = SparkSession.builder()
- .appName(s"Dynamic Conformance $enceladusVersion ${cmd.datasetName} ${cmd.datasetVersion} ${cmd.reportDate} $reportVersion")
- .getOrCreate()
-
- TimeZoneNormalizer.normalizeSessionTimeZone(spark)
- spark
- }
-
- private def inferVersion(hdfsPublishPath: String, reportDate: String)
- (implicit fsUtils: FileSystemVersionUtils):Int = {
- val newVersion = fsUtils.getLatestVersion(hdfsPublishPath, reportDate) + 1
- log.warn(s"Report version not provided, inferred report version: $newVersion")
- log.warn("This is an EXPERIMENTAL feature.")
- log.warn(" -> It can lead to issues when running multiple jobs on a dataset concurrently.")
- log.warn(" -> It may not work as desired when there are gaps in the versions of the data being landed.")
- newVersion
- }
-
- private def initFunctionalExtensions(reportVersion: Int, pathCfg: PathCfg)(implicit spark: SparkSession,
- dao: MenasDAO,
- cmd: ConfCmdConfig): Unit = {
- // Enable Spline
- import za.co.absa.spline.core.SparkLineageInitializer._
- spark.enableLineageTracking()
-
- // Enable Control Framework
- import za.co.absa.atum.AtumImplicits.SparkSessionWrapper
- spark.enableControlMeasuresTracking(s"${pathCfg.stdPath}/_INFO")
- .setControlMeasuresWorkflow("Conformance")
-
- // Enable control framework performance optimization for pipeline-like jobs
- Atum.setAllowUnpersistOldDatasets(true)
-
- // Enable non-default persistence storage level if provided in the command line
- cmd.persistStorageLevel.foreach(Atum.setCachingStorageLevel)
-
- // Enable Menas plugin for Control Framework
- MenasPlugin.enableMenas(conf, cmd.datasetName, cmd.datasetVersion, cmd.reportDate, reportVersion)
- }
-
- private def initPerformanceMeasurer(stdPath: String)
- (implicit spark: SparkSession, fsUtils: FileSystemVersionUtils): PerformanceMeasurer = {
- // init performance measurer
- val performance = new PerformanceMeasurer(spark.sparkContext.appName)
- val stdDirSize = fsUtils.getDirectorySize(stdPath)
- performance.startMeasurement(stdDirSize)
- performance
- }
-
- private def conform(conformance: Dataset, inputData: sql.Dataset[Row], enableCF: Boolean, recordIdGenerationStrategy: IdType)
- (implicit spark: SparkSession, cmd: ConfCmdConfig, fsUtils: FileSystemVersionUtils, dao: MenasDAO): DataFrame = {
- implicit val featureSwitcher: FeatureSwitches = FeatureSwitches()
- .setExperimentalMappingRuleEnabled(isExperimentalRuleEnabled())
- .setCatalystWorkaroundEnabled(isCatalystWorkaroundEnabled())
- .setControlFrameworkEnabled(enableCF)
- .setBroadcastStrategyMode(broadcastingStrategyMode)
- .setBroadcastMaxSizeMb(broadcastingMaxSizeMb)
-
- Try {
- handleControlInfoValidation()
- DynamicInterpreter.interpret(conformance, inputData)
- } match {
- case Failure(e: ValidationException) =>
- AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError("Conformance", e.getMessage, e.techDetails)
- throw e
- case Failure(NonFatal(e)) =>
- val sw = new StringWriter
- e.printStackTrace(new PrintWriter(sw))
- AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError("Conformance", e.getMessage, sw.toString)
- throw e
- case Success(conformedDF) =>
- if (SchemaUtils.fieldExists(Constants.EnceladusRecordId, conformedDF.schema)) {
- conformedDF // no new id regeneration
- } else {
- RecordIdGeneration.addRecordIdColumnByStrategy(conformedDF, Constants.EnceladusRecordId, recordIdGenerationStrategy)
- }
-
+ finishJob(cmd)
}
}
-
- private def processResult(result: DataFrame,
- performance: PerformanceMeasurer,
- pathCfg: PathCfg,
- reportVersion: Int,
- cmdLineArgs: String,
- menasCredentials: MenasCredentials)
- (implicit spark: SparkSession, cmd: ConfCmdConfig, fsUtils: FileSystemVersionUtils): Unit = {
- val withPartCols = result
- .withColumnIfDoesNotExist(InfoDateColumn, to_date(lit(cmd.reportDate), ReportDateFormat))
- .withColumnIfDoesNotExist(InfoDateColumnString, lit(cmd.reportDate))
- .withColumnIfDoesNotExist(InfoVersionColumn, lit(reportVersion))
-
- val recordCount = result.lastCheckpointRowCount match {
- case None => withPartCols.count
- case Some(p) => p
- }
- if (recordCount == 0) { handleEmptyOutputAfterConformance() }
-
- // ensure the whole path but version exists
- fsUtils.createAllButLastSubDir(pathCfg.publishPath)
-
- withPartCols.write.parquet(pathCfg.publishPath)
-
- val publishDirSize = fsUtils.getDirectorySize(pathCfg.publishPath)
- performance.finishMeasurement(publishDirSize, recordCount)
- PerformanceMetricTools.addPerformanceMetricsToAtumMetadata(spark, "conform",
- pathCfg.stdPath, pathCfg.publishPath, menasCredentials.username, cmdLineArgs)
-
- withPartCols.writeInfoFile(pathCfg.publishPath)
- cmd.performanceMetricsFile.foreach(fileName => {
- try {
- performance.writeMetricsToFile(fileName)
- } catch {
- case NonFatal(e) => log.error(s"Unable to write performance metrics to file '$fileName': ${e.getMessage}")
- }
- })
-
- if (isAutocleanStdFolderEnabled()) {
- fsUtils.deleteDirectoryRecursively(pathCfg.stdPath)
- }
- }
-
- private def handleEmptyOutputAfterConformance()(implicit spark: SparkSession): Unit = {
- import za.co.absa.atum.core.Constants._
-
- val areCountMeasurementsAllZero = Atum.getControlMeasure.checkpoints
- .flatMap(checkpoint =>
- checkpoint.controls.filter(control =>
- control.controlName.equalsIgnoreCase(controlTypeRecordCount)))
- .forall(m => Try(m.controlValue.toString.toDouble).toOption.contains(0D))
-
- if (areCountMeasurementsAllZero) {
- log.warn("Empty output after running Dynamic Conformance. Previous checkpoints show this is correct.")
- } else {
- val errMsg = "Empty output after running Dynamic Conformance, " +
- "while previous checkpoints show non zero record count"
- AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError("Standardization", errMsg, "")
- throw new IllegalStateException(errMsg)
- }
- }
-
- private def handleControlInfoValidation(): Unit = {
- ControlInfoValidation.addRawAndSourceRecordCountsToMetadata() match {
- case Failure(ex: za.co.absa.enceladus.utils.validation.ValidationException) => {
- val confEntry = "control.info.validation"
- conf.getString(confEntry) match {
- case "strict" => throw ex
- case "warning" => log.warn(ex.msg)
- case "none" =>
- case _ => throw new RuntimeException(s"Invalid $confEntry value")
- }
- }
- case Failure(ex) => throw ex
- case Success(_) =>
- }
- }
-
- def buildPublishPath(infoDateCol: String,
- infoVersionCol: String,
- cmd: ConfCmdConfig,
- ds: Dataset,
- reportVersion: Int): String = {
- (cmd.publishPathOverride, cmd.folderPrefix) match {
- case (None, None) =>
- s"${ds.hdfsPublishPath}/$infoDateCol=${cmd.reportDate}/$infoVersionCol=$reportVersion"
- case (None, Some(folderPrefix)) =>
- s"${ds.hdfsPublishPath}/$folderPrefix/$infoDateCol=${cmd.reportDate}/$infoVersionCol=$reportVersion"
- case (Some(publishPathOverride), _) =>
- publishPathOverride
- }
- }
-
- private final case class PathCfg(publishPath: String, stdPath: String)
}
+
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/HyperConformance.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/HyperConformance.scala
index 61a915830..255cf880a 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/HyperConformance.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/HyperConformance.scala
@@ -21,11 +21,11 @@ import java.util.Date
import org.apache.commons.configuration2.Configuration
import org.apache.spark.SPARK_VERSION
import org.apache.spark.sql.functions._
-import org.apache.spark.sql.types.DateType
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import za.co.absa.enceladus.common.Constants._
import za.co.absa.enceladus.common.version.SparkVersionGuard
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{Always, DynamicInterpreter, FeatureSwitches}
import za.co.absa.enceladus.conformance.streaming.InfoDateFactory
import za.co.absa.enceladus.dao.MenasDAO
@@ -34,7 +34,7 @@ import za.co.absa.enceladus.dao.rest.{MenasConnectionStringParser, RestDaoFactor
import za.co.absa.enceladus.model.Dataset
import za.co.absa.hyperdrive.ingestor.api.transformer.{StreamTransformer, StreamTransformerFactory}
-class HyperConformance (implicit cmd: ConfCmdConfig,
+class HyperConformance (implicit cmd: ConformanceConfig,
featureSwitches: FeatureSwitches,
menasBaseUrls: List[String],
infoDateFactory: InfoDateFactory) extends StreamTransformer {
@@ -77,7 +77,7 @@ class HyperConformance (implicit cmd: ConfCmdConfig,
}
@throws[IllegalArgumentException]
- private def getReportVersion(implicit cmd: ConfCmdConfig): Int = {
+ private def getReportVersion(implicit cmd: ConformanceConfig): Int = {
cmd.reportVersion match {
case Some(version) => version
case None => throw new IllegalArgumentException("Report version is not provided.")
@@ -121,19 +121,14 @@ object HyperConformance extends StreamTransformerFactory with HyperConformanceAt
val menasCredentialsFactory = getMenasCredentialsFactory(conf: Configuration)
- implicit val cmd: ConfCmdConfig = ConfCmdConfig(
- datasetName = conf.getString(datasetNameKey),
- datasetVersion = conf.getInt(datasetVersionKey),
- reportDate = new SimpleDateFormat(ReportDateFormat).format(new Date()), // Still need a report date for mapping table patterns
- reportVersion = Option(getReportVersion(conf)),
- menasCredentialsFactory = menasCredentialsFactory,
- performanceMetricsFile = None,
- publishPathOverride = None,
- folderPrefix = None,
+ implicit val confConfig: ConformanceConfig = ConformanceConfig(publishPathOverride = None,
experimentalMappingRule = Some(true),
isCatalystWorkaroundEnabled = Some(true),
autocleanStandardizedFolder = Some(false),
- persistStorageLevel = None
+ datasetName = conf.getString(datasetNameKey),
+ datasetVersion = conf.getInt(datasetVersionKey),
+ reportDate = new SimpleDateFormat(ReportDateFormat).format(new Date()),
+ menasCredentialsFactory = menasCredentialsFactory
)
implicit val featureSwitcher: FeatureSwitches = FeatureSwitches()
@@ -172,13 +167,4 @@ object HyperConformance extends StreamTransformerFactory with HyperConformanceAt
case (true, true) => throw new IllegalArgumentException("Either a credentials file or a keytab should be specified, but not both.")
}
}
-
- private def getReportVersion(conf: Configuration): Int = {
- if (conf.containsKey(reportVersionKey)) {
- conf.getInt(reportVersionKey)
- } else {
- defaultReportVersion
- }
- }
}
-
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/config/ConformanceConfig.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/config/ConformanceConfig.scala
new file mode 100644
index 000000000..4224f871f
--- /dev/null
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/config/ConformanceConfig.scala
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.conformance.config
+
+import org.apache.spark.storage.StorageLevel
+import scopt.OParser
+import za.co.absa.enceladus.common.config.{ConfigError, JobConfigParser}
+import za.co.absa.enceladus.dao.auth.{InvalidMenasCredentialsFactory, MenasCredentialsFactory}
+
+import scala.util.Try
+
+
+/**
+ * Configuration of the Conformance job provided via command-line parameters
+ *
+ * Note: scopt requires all fields to have default values.
+ * Even if a field is mandatory, it needs a default value.
+ */
+case class ConformanceConfig(datasetName: String = "",
+ datasetVersion: Int = 1,
+ reportDate: String = "",
+ reportVersion: Option[Int] = None,
+ menasCredentialsFactory: MenasCredentialsFactory = InvalidMenasCredentialsFactory,
+ performanceMetricsFile: Option[String] = None,
+ folderPrefix: Option[String] = None,
+ persistStorageLevel: Option[StorageLevel] = None,
+ publishPathOverride: Option[String] = None,
+ experimentalMappingRule: Option[Boolean] = None,
+ isCatalystWorkaroundEnabled: Option[Boolean] = None,
+ autocleanStandardizedFolder: Option[Boolean] = None,
+ credsFile: Option[String] = None,
+ keytabFile: Option[String] = None)
+ extends ConformanceParser[ConformanceConfig] {
+
+ override def withPublishPathOverride(value: Option[String]): ConformanceConfig = copy(publishPathOverride = value)
+ override def withExperimentalMappingRule(value: Option[Boolean]): ConformanceConfig = copy(experimentalMappingRule = value)
+ override def withIsCatalystWorkaroundEnabled(value: Option[Boolean]): ConformanceConfig =
+ copy(isCatalystWorkaroundEnabled = value)
+ override def withAutocleanStandardizedFolder(value: Option[Boolean]): ConformanceConfig =
+ copy(autocleanStandardizedFolder = value)
+ override def withDatasetName(value: String): ConformanceConfig = copy(datasetName = value)
+ override def withDatasetVersion(value: Int): ConformanceConfig = copy(datasetVersion = value)
+ override def withReportDate(value: String): ConformanceConfig = copy(reportDate = value)
+ override def withReportVersion(value: Option[Int]): ConformanceConfig = copy(reportVersion = value)
+ override def withCredsFile(value: Option[String], menasCredentialsFactory: MenasCredentialsFactory): ConformanceConfig =
+ copy(credsFile = value, menasCredentialsFactory = menasCredentialsFactory)
+
+ override def withAuthKeytab(value: Option[String], menasCredentialsFactory: MenasCredentialsFactory): ConformanceConfig =
+ copy(keytabFile = value, menasCredentialsFactory = menasCredentialsFactory)
+
+ override def withPerformanceMetricsFile(value: Option[String]): ConformanceConfig = copy(performanceMetricsFile = value)
+ override def withFolderPrefix(value: Option[String]): ConformanceConfig = copy(folderPrefix = value)
+ override def withPersistStorageLevel(value: Option[StorageLevel]): ConformanceConfig = copy(persistStorageLevel = value)
+}
+
+object ConformanceConfig {
+ def tryFromArguments(args: Array[String]): Try[ConformanceConfig] = {
+ import za.co.absa.enceladus.utils.implicits.OptionImplicits._
+ OParser.parse(conformanceJobParser, args, ConformanceConfig()).toTry(ConfigError("Command line parameters error"))
+ }
+
+ def getFromArguments(args: Array[String]): ConformanceConfig = tryFromArguments(args).get
+
+ val conformanceJobParser: OParser[_, ConformanceConfig] = {
+ val builder = OParser.builder[ConformanceConfig]
+ import builder._
+ OParser.sequence(
+ programName("Conformance Job"),
+ ConformanceParser.conformanceParser,
+ JobConfigParser.jobConfigParser
+ )
+ }
+
+}
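+
+// A minimal usage sketch (the argument values themselves are hypothetical; the mandatory dataset/report options
+// come from JobConfigParser and are not shown here):
+//   val cfg: ConformanceConfig = ConformanceConfig.getFromArguments(args)           // throws on invalid arguments
+//   val cfgTry: Try[ConformanceConfig] = ConformanceConfig.tryFromArguments(args)   // Try-based alternative
+//   val tweaked: ConformanceConfig = cfg.withExperimentalMappingRule(Some(true))    // copy-based update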
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/config/ConformanceParser.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/config/ConformanceParser.scala
new file mode 100644
index 000000000..4e3d05680
--- /dev/null
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/config/ConformanceParser.scala
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.conformance.config
+
+import scopt.OParser
+import za.co.absa.enceladus.common.config.JobConfigParser
+
+trait ConformanceParser[R] extends JobConfigParser[R] {
+ def publishPathOverride: Option[String]
+ def experimentalMappingRule: Option[Boolean]
+ def isCatalystWorkaroundEnabled: Option[Boolean]
+ def autocleanStandardizedFolder: Option[Boolean]
+
+  def withPublishPathOverride(value: Option[String]): R
+ def withExperimentalMappingRule(value: Option[Boolean]): R
+ def withIsCatalystWorkaroundEnabled(value: Option[Boolean]): R
+ def withAutocleanStandardizedFolder(value: Option[Boolean]): R
+}
+
+object ConformanceParser {
+
+ def conformanceParser[R <: ConformanceParser[R]]: OParser[_, R] = {
+ val builder = OParser.builder[R]
+ import builder._
+ OParser.sequence(
+ head("Dynamic Conformance", ""),
+
+ opt[String]("debug-set-publish-path").optional().hidden().action((value, config) =>
+ config.withPublishPathOverride(Some(value))).text("override the path of the published data (used internally for testing)"),
+
+ opt[Boolean]("experimental-mapping-rule").optional().action((value, config) =>
+ config.withExperimentalMappingRule(Option(value))).text("Use experimental optimized mapping conformance rule"),
+
+ opt[Boolean]("catalyst-workaround").optional().action((value, config) =>
+ config.withIsCatalystWorkaroundEnabled(Some(value))).text("Turn on or off Catalyst workaround feature. " +
+ "This overrides 'conformance.catalyst.workaround' configuration value provided in 'application.conf'."),
+
+      opt[Boolean]("autoclean-std-folder").optional().action((value, config) =>
+        config.withAutocleanStandardizedFolder(Option(value))).text("Deletes standardized data from HDFS once " +
+        "it is successfully conformed. This overrides 'conformance.autoclean.standardized.hdfs.folder' configuration " +
+        "value provided in 'application.conf'.")
+ )
+ }
+}
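+
+// Example (flags defined above): passing "--experimental-mapping-rule true --autoclean-std-folder false" on the
+// command line yields experimentalMappingRule = Some(true) and autocleanStandardizedFolder = Some(false) on the
+// parsed config, overriding the corresponding keys in application.conf.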
+
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/DynamicInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/DynamicInterpreter.scala
index 10cc11e5f..34f55bb72 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/DynamicInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/DynamicInterpreter.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.slf4j.LoggerFactory
import za.co.absa.atum.AtumImplicits._
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.datasource.PartitioningUtils
import za.co.absa.enceladus.conformance.interpreter.rules._
import za.co.absa.enceladus.conformance.interpreter.rules.custom.CustomConformanceRule
@@ -51,7 +51,7 @@ object DynamicInterpreter {
*
*/
def interpret(conformance: ConfDataset, inputDf: Dataset[Row], jobShortName: String = "Conformance")
- (implicit spark: SparkSession, dao: MenasDAO, progArgs: ConfCmdConfig, featureSwitches: FeatureSwitches): DataFrame = {
+ (implicit spark: SparkSession, dao: MenasDAO, progArgs: ConformanceConfig, featureSwitches: FeatureSwitches): DataFrame = {
implicit val interpreterContext: InterpreterContext = InterpreterContext(inputDf.schema, conformance,
featureSwitches, jobShortName, spark, dao, progArgs)
@@ -76,7 +76,7 @@ object DynamicInterpreter {
(implicit ictx: InterpreterContext): DataFrame = {
implicit val spark: SparkSession = ictx.spark
implicit val dao: MenasDAO = ictx.dao
- implicit val progArgs: ConfCmdConfig = ictx.progArgs
+ implicit val progArgs: ConformanceConfig = ictx.progArgs
implicit val udfLib: UDFLibrary = new UDFLibrary
implicit val explosionState: ExplosionState = new ExplosionState()
@@ -266,7 +266,8 @@ object DynamicInterpreter {
val fsUtils = new FileSystemVersionUtils(ictx.spark.sparkContext.hadoopConfiguration)
val mappingTableDef = ictx.dao.getMappingTable(rule.mappingTable, rule.mappingTableVersion)
- val mappingTablePath = PartitioningUtils.getPartitionedPathName(mappingTableDef.hdfsPath, ictx.progArgs.reportDate)
+ val mappingTablePath = PartitioningUtils.getPartitionedPathName(mappingTableDef.hdfsPath,
+ ictx.progArgs.reportDate)
val mappingTableSize = fsUtils.getDirectorySizeNoHidden(mappingTablePath)
(mappingTableSize / (1024 * 1024)).toInt
}
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterContext.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterContext.scala
index 99cbd648a..87c0319cf 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterContext.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterContext.scala
@@ -17,7 +17,7 @@ package za.co.absa.enceladus.conformance.interpreter
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.{Dataset => ConfDataset}
@@ -29,5 +29,5 @@ case class InterpreterContext (
jobShortName: String,
spark: SparkSession,
dao: MenasDAO,
- progArgs: ConfCmdConfig
+ progArgs: ConformanceConfig
)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayCollapseInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayCollapseInterpreter.scala
index ad7c4f066..7d13b05ec 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayCollapseInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayCollapseInterpreter.scala
@@ -16,7 +16,7 @@
package za.co.absa.enceladus.conformance.interpreter.rules
import org.apache.spark.sql.{Dataset, Row, SparkSession}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.ExplosionState
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.ConformanceRule
@@ -31,7 +31,7 @@ class ArrayCollapseInterpreter extends RuleInterpreter {
override def conformanceRule: Option[ConformanceRule] = None
override def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
val dfOut = ExplodeTools.revertAllExplosions(df, explosionState.explodeContext, Some(ErrorMessage.errorColumnName))
explosionState.explodeContext = ExplosionContext()
dfOut
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayExplodeInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayExplodeInterpreter.scala
index 09edc7453..92356ad19 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayExplodeInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayExplodeInterpreter.scala
@@ -16,7 +16,7 @@
package za.co.absa.enceladus.conformance.interpreter.rules
import org.apache.spark.sql.{Dataset, Row, SparkSession}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.ExplosionState
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.ConformanceRule
@@ -29,7 +29,7 @@ class ArrayExplodeInterpreter(columnName: String) extends RuleInterpreter {
override def conformanceRule: Option[ConformanceRule] = None
override def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
val (dfOut, ctx) = ExplodeTools.explodeAllArraysInPath(columnName, df, explosionState.explodeContext)
explosionState.explodeContext = ctx
dfOut
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleInterpreter.scala
index 79a20a58e..a505f92d5 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleInterpreter.scala
@@ -19,8 +19,8 @@ import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{Dataset, Row, SparkSession}
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.spark.hats.Extensions._
-import za.co.absa.enceladus.conformance.ConfCmdConfig
import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.{CastingConformanceRule, ConformanceRule}
@@ -35,7 +35,7 @@ case class CastingRuleInterpreter(rule: CastingConformanceRule) extends RuleInte
override def conformanceRule: Option[ConformanceRule] = Some(rule)
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
// Validate the rule parameters
RuleValidators.validateInputField(progArgs.datasetName, ruleName, df.schema, rule.inputColumn)
RuleValidators.validateOutputField(progArgs.datasetName, ruleName, df.schema, rule.outputColumn)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ConcatenationRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ConcatenationRuleInterpreter.scala
index 2cf9a0200..8d966f5f4 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ConcatenationRuleInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ConcatenationRuleInterpreter.scala
@@ -18,8 +18,8 @@ package za.co.absa.enceladus.conformance.interpreter.rules
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{Dataset, Row, SparkSession}
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.spark.hats.Extensions._
-import za.co.absa.enceladus.conformance.ConfCmdConfig
import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.{ConcatenationConformanceRule, ConformanceRule}
@@ -31,7 +31,7 @@ case class ConcatenationRuleInterpreter(rule: ConcatenationConformanceRule) exte
override def conformanceRule: Option[ConformanceRule] = Some(rule)
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
// Validate the rule parameters
RuleValidators.validateSameParent(progArgs.datasetName, ruleName, rule.inputColumns :+ rule.outputColumn: _*)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/DropRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/DropRuleInterpreter.scala
index e19354359..8600b7af9 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/DropRuleInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/DropRuleInterpreter.scala
@@ -16,8 +16,8 @@
package za.co.absa.enceladus.conformance.interpreter.rules
import org.apache.spark.sql.{Dataset, Row, SparkSession}
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.spark.hats.Extensions._
-import za.co.absa.enceladus.conformance.ConfCmdConfig
import za.co.absa.enceladus.conformance.interpreter.ExplosionState
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, DropConformanceRule}
@@ -28,7 +28,7 @@ case class DropRuleInterpreter(rule: DropConformanceRule) extends RuleInterprete
override def conformanceRule: Option[ConformanceRule] = Some(rule)
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
if (SchemaUtils.fieldExists(rule.outputColumn, df.schema)) {
if (rule.outputColumn.contains('.')) {
conformNestedField(df)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/LiteralRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/LiteralRuleInterpreter.scala
index a48482818..436b4ec58 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/LiteralRuleInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/LiteralRuleInterpreter.scala
@@ -16,8 +16,8 @@
package za.co.absa.enceladus.conformance.interpreter.rules
import org.apache.spark.sql.{Dataset, Row, SparkSession}
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.spark.hats.Extensions._
-import za.co.absa.enceladus.conformance.ConfCmdConfig
import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, LiteralConformanceRule}
@@ -29,7 +29,7 @@ case class LiteralRuleInterpreter(rule: LiteralConformanceRule) extends RuleInte
override def conformanceRule: Option[ConformanceRule] = Some(rule)
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
// Validate the rule parameters
RuleValidators.validateOutputField(progArgs.datasetName, ruleName, df.schema, rule.outputColumn)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreter.scala
index 688d4d880..dabdb4d89 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreter.scala
@@ -19,7 +19,7 @@ import org.apache.spark.sql.api.java.UDF1
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, Dataset, Row, SparkSession}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.datasource.DataSource
import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators}
import za.co.absa.enceladus.dao.MenasDAO
@@ -40,7 +40,7 @@ case class MappingRuleInterpreter(rule: MappingConformanceRule, conformance: Con
override def conformanceRule: Option[ConformanceRule] = Some(rule)
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
log.info(s"Processing mapping rule to conform ${rule.outputColumn}...")
import spark.implicits._
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterBroadcast.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterBroadcast.scala
index d42ee8698..3b3712d84 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterBroadcast.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterBroadcast.scala
@@ -16,7 +16,7 @@
package za.co.absa.enceladus.conformance.interpreter.rules
import org.apache.spark.sql.{Dataset, Row, SparkSession}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.datasource.DataSource
import za.co.absa.enceladus.conformance.interpreter.ExplosionState
import za.co.absa.enceladus.dao.MenasDAO
@@ -31,7 +31,7 @@ case class MappingRuleInterpreterBroadcast(rule: MappingConformanceRule, conform
override def conformanceRule: Option[ConformanceRule] = Some(rule)
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
log.info(s"Processing mapping rule to conform ${rule.outputColumn} (broadcast strategy)...")
val mappingTableDef = dao.getMappingTable(rule.mappingTable, rule.mappingTableVersion)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterGroupExplode.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterGroupExplode.scala
index 7cf4389c4..8d66c20b2 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterGroupExplode.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterGroupExplode.scala
@@ -18,7 +18,7 @@ package za.co.absa.enceladus.conformance.interpreter.rules
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.datasource.DataSource
import za.co.absa.enceladus.conformance.interpreter.rules.MappingRuleInterpreterGroupExplode._
import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators}
@@ -42,7 +42,7 @@ case class MappingRuleInterpreterGroupExplode(rule: MappingConformanceRule,
override def conformanceRule: Option[ConformanceRule] = Some(rule)
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
log.info(s"Processing mapping rule (explode-optimized) to conform ${rule.outputColumn}...")
val mappingTableDef = dao.getMappingTable(rule.mappingTable, rule.mappingTableVersion)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleInterpreter.scala
index 37adc6b25..95ac3288d 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleInterpreter.scala
@@ -18,8 +18,8 @@ package za.co.absa.enceladus.conformance.interpreter.rules
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, Dataset, Row, SparkSession}
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.spark.hats.Extensions._
-import za.co.absa.enceladus.conformance.ConfCmdConfig
import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, NegationConformanceRule}
@@ -34,7 +34,7 @@ case class NegationRuleInterpreter(rule: NegationConformanceRule) extends RuleIn
override def conformanceRule: Option[ConformanceRule] = Some(rule)
override def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
NegationRuleInterpreter.validateInputField(progArgs.datasetName, df.schema, rule.inputColumn)
val field = SchemaUtils.getField(rule.inputColumn, df.schema).get
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/RuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/RuleInterpreter.scala
index 02eb840ab..f87744144 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/RuleInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/RuleInterpreter.scala
@@ -18,7 +18,7 @@ package za.co.absa.enceladus.conformance.interpreter.rules
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Column, Dataset, Row, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.ExplosionState
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.ConformanceRule
@@ -45,7 +45,7 @@ trait RuleInterpreter {
* @return A conformed DataFrame
*/
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row]
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row]
protected val log: Logger = LoggerFactory.getLogger(this.getClass)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SingleColumnRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SingleColumnRuleInterpreter.scala
index f44b392dd..4de101522 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SingleColumnRuleInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SingleColumnRuleInterpreter.scala
@@ -17,8 +17,8 @@ package za.co.absa.enceladus.conformance.interpreter.rules
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, Row, SparkSession}
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.spark.hats.Extensions._
-import za.co.absa.enceladus.conformance.ConfCmdConfig
import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, SingleColumnConformanceRule}
@@ -30,7 +30,7 @@ case class SingleColumnRuleInterpreter(rule: SingleColumnConformanceRule) extend
override def conformanceRule: Option[ConformanceRule] = Some(rule)
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
// Validate the rule parameters
RuleValidators.validateFieldExistence(progArgs.datasetName,ruleName, df.schema, rule.inputColumn)
RuleValidators.validateOutputField(progArgs.datasetName, ruleName, df.schema, rule.outputColumn)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SparkSessionConfRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SparkSessionConfRuleInterpreter.scala
index 29a0ed8fb..9f8384c48 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SparkSessionConfRuleInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SparkSessionConfRuleInterpreter.scala
@@ -16,8 +16,8 @@
package za.co.absa.enceladus.conformance.interpreter.rules
import org.apache.spark.sql.{Dataset, Row, SparkSession}
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.spark.hats.Extensions._
-import za.co.absa.enceladus.conformance.ConfCmdConfig
import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, SparkSessionConfConformanceRule}
@@ -29,7 +29,7 @@ case class SparkSessionConfRuleInterpreter(rule: SparkSessionConfConformanceRule
override def conformanceRule: Option[ConformanceRule] = Some(rule)
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
// Validate the rule parameters
RuleValidators.validateOutputField(ruleName, progArgs.datasetName, df.schema, rule.outputColumn)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/UppercaseRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/UppercaseRuleInterpreter.scala
index 8e611aecc..ba39dd598 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/UppercaseRuleInterpreter.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/UppercaseRuleInterpreter.scala
@@ -17,8 +17,8 @@ package za.co.absa.enceladus.conformance.interpreter.rules
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.spark.hats.Extensions._
-import za.co.absa.enceladus.conformance.ConfCmdConfig
import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, UppercaseConformanceRule}
@@ -30,7 +30,7 @@ case class UppercaseRuleInterpreter(rule: UppercaseConformanceRule) extends Rule
override def conformanceRule: Option[ConformanceRule] = Some(rule)
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
// Validate the rule parameters
RuleValidators.validateInputField(progArgs.datasetName, ruleName, df.schema, rule.inputColumn)
RuleValidators.validateOutputField(progArgs.datasetName, ruleName, df.schema, rule.outputColumn)
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationExecution.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationExecution.scala
new file mode 100644
index 000000000..a39085688
--- /dev/null
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationExecution.scala
@@ -0,0 +1,260 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.standardization
+
+import java.io.{PrintWriter, StringWriter}
+import java.util.UUID
+
+import org.apache.spark.sql.types.{StructField, StructType}
+import org.apache.spark.sql.{Column, DataFrame, SparkSession}
+import za.co.absa.atum.AtumImplicits
+import za.co.absa.atum.core.Atum
+import za.co.absa.enceladus.common.RecordIdGeneration.getRecordIdGenerationStrategyFromConfig
+import za.co.absa.enceladus.common.config.{JobConfigParser, PathConfig}
+import za.co.absa.enceladus.common.plugin.menas.MenasPlugin
+import za.co.absa.enceladus.common.{CommonJobExecution, Constants}
+import za.co.absa.enceladus.dao.MenasDAO
+import za.co.absa.enceladus.dao.auth.MenasCredentials
+import za.co.absa.enceladus.model.Dataset
+import za.co.absa.enceladus.standardization.config.{StandardizationConfig, StandardizationParser}
+import za.co.absa.enceladus.standardization.interpreter.StandardizationInterpreter
+import za.co.absa.enceladus.standardization.interpreter.stages.PlainSchemaGenerator
+import za.co.absa.enceladus.utils.fs.FileSystemVersionUtils
+import za.co.absa.enceladus.utils.modules.SourcePhase
+import za.co.absa.enceladus.utils.performance.PerformanceMetricTools
+import za.co.absa.enceladus.utils.schema.{MetadataKeys, SchemaUtils, SparkUtils}
+import za.co.absa.enceladus.utils.udf.UDFLibrary
+import za.co.absa.enceladus.utils.validation.ValidationException
+
+import scala.util.control.NonFatal
+
+trait StandardizationExecution extends CommonJobExecution {
+ private val sourceId = SourcePhase.Standardization
+
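+ /**
+ * Sets up a Standardization run: enables Atum control-measure tracking on the input _INFO file,
+ * registers the Menas plugin, records job metadata (report date/version, raw format, job info)
+ * and returns the dataset schema fetched from Menas.
+ */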
+ protected def prepareStandardization[T](args: Array[String],
+ menasCredentials: MenasCredentials,
+ preparationResult: PreparationResult
+ )
+ (implicit dao: MenasDAO,
+ cmd: StandardizationParser[T],
+ fsUtils: FileSystemVersionUtils,
+ spark: SparkSession): StructType = {
+
+ // Enable Control Framework
+ import za.co.absa.atum.AtumImplicits.SparkSessionWrapper
+ spark.enableControlMeasuresTracking(s"${preparationResult.pathCfg.inputPath}/_INFO")
+ .setControlMeasuresWorkflow(sourceId.toString)
+
+ // Enable control framework performance optimization for pipeline-like jobs
+ Atum.setAllowUnpersistOldDatasets(true)
+
+ // Enable Menas plugin for Control Framework
+ MenasPlugin.enableMenas(
+ conf,
+ cmd.datasetName,
+ cmd.datasetVersion,
+ cmd.reportDate,
+ preparationResult.reportVersion)
+
+ // Add report date and version (aka Enceladus info date and version) to Atum's metadata
+ Atum.setAdditionalInfo(Constants.InfoDateColumn -> cmd.reportDate)
+ Atum.setAdditionalInfo(Constants.InfoVersionColumn -> preparationResult.reportVersion.toString)
+
+ // Add the raw format of the input file(s) to Atum's metadata
+ Atum.setAdditionalInfo("raw_format" -> cmd.rawFormat)
+
+ PerformanceMetricTools.addJobInfoToAtumMetadata("std", preparationResult.pathCfg.inputPath, preparationResult.pathCfg.outputPath,
+ menasCredentials.username, args.mkString(" "))
+
+ dao.getSchema(preparationResult.dataset.schemaName, preparationResult.dataset.schemaVersion)
+ }
+
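+ /**
+ * Reads the raw input into a DataFrame using a format-specific reader.
+ * Self-describing formats (Parquet, COBOL) are read as-is; for other formats an input schema
+ * derived from the dataset schema is applied, optionally with a corrupt-record column.
+ * The result is converted to a splittable format if necessary.
+ */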
+ protected def readStandardizationInputData(schema: StructType,
+ cmd: StandardizationConfig,
+ path: String,
+ dataset: Dataset)
+ (implicit spark: SparkSession,
+ fsUtils: FileSystemVersionUtils,
+ dao: MenasDAO): DataFrame = {
+ val numberOfColumns = schema.fields.length
+ val standardizationReader = new StandardizationPropertiesProvider()
+ val dfReaderConfigured = standardizationReader.getFormatSpecificReader(cmd, dataset, numberOfColumns)
+ val readerWithOptSchema = cmd.rawFormat.toLowerCase() match {
+ case "parquet" | "cobol" => dfReaderConfigured
+ case _ =>
+ val optColumnNameOfCorruptRecord = getColumnNameOfCorruptRecord(schema, cmd)
+ val inputSchema = PlainSchemaGenerator.generateInputSchema(schema, optColumnNameOfCorruptRecord)
+ dfReaderConfigured.schema(inputSchema)
+ }
+ val dfWithSchema = readerWithOptSchema.load(s"$path/*")
+
+ ensureSplittable(dfWithSchema, path, schema)
+ }
+
+ private def getColumnNameOfCorruptRecord[R](schema: StructType, cmd: StandardizationParser[R])
+ (implicit spark: SparkSession): Option[String] = {
+ // SparkUtils.setUniqueColumnNameOfCorruptRecord is called even if the result is not used, to avoid a column-name conflict
+ val columnNameOfCorruptRecord = SparkUtils.setUniqueColumnNameOfCorruptRecord(spark, schema)
+ if (cmd.rawFormat.equalsIgnoreCase("fixed-width") || cmd.failOnInputNotPerSchema) {
+ None
+ } else {
+ Option(columnNameOfCorruptRecord)
+ }
+ }
+
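+ /**
+ * Runs the StandardizationInterpreter over the input data.
+ * Failures are reported to Atum as control-measurement errors before being rethrown.
+ */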
+ protected def standardize(inputData: DataFrame, schema: StructType, cmd: StandardizationConfig)
+ (implicit spark: SparkSession, udfLib: UDFLibrary): DataFrame = {
+ val recordIdGenerationStrategy = getRecordIdGenerationStrategyFromConfig(conf)
+
+ try {
+ handleControlInfoValidation()
+ StandardizationInterpreter.standardize(inputData, schema, cmd.rawFormat,
+ cmd.failOnInputNotPerSchema, recordIdGenerationStrategy)
+ } catch {
+ case e@ValidationException(msg, errors) =>
+ val errorDescription = s"$msg\nDetails: ${errors.mkString("\n")}"
+ AtumImplicits.SparkSessionWrapper(spark)
+ .setControlMeasurementError("Schema Validation", errorDescription, "")
+ throw e
+ case NonFatal(e) if !e.isInstanceOf[ValidationException] =>
+ val sw = new StringWriter
+ e.printStackTrace(new PrintWriter(sw))
+ AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError(sourceId.toString, e.getMessage, sw.toString)
+ throw e
+ }
+ }
+
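+ /**
+ * Registers column renames with Atum, checks the record count, writes the standardized output
+ * as Parquet and stores the performance metrics and the _INFO file.
+ */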
+ //scalastyle:off parameter.number
+ protected def processStandardizationResult(args: Array[String],
+ standardizedDF: DataFrame,
+ preparationResult: PreparationResult,
+ schema: StructType,
+ cmd: StandardizationConfig,
+ menasCredentials: MenasCredentials)
+ (implicit spark: SparkSession,
+ fsUtils: FileSystemVersionUtils): Unit = {
+ import za.co.absa.atum.AtumImplicits._
+ val fieldRenames = SchemaUtils.getRenamesInSchema(schema)
+ fieldRenames.foreach {
+ case (destinationName, sourceName) => standardizedDF.registerColumnRename(sourceName, destinationName)
+ }
+
+ standardizedDF.setCheckpoint(s"$sourceId - End", persistInDatabase = false)
+
+ val recordCount = standardizedDF.lastCheckpointRowCount match {
+ case None => standardizedDF.count
+ case Some(p) => p
+ }
+ if (recordCount == 0) {
+ handleEmptyOutput(sourceId)
+ }
+
+ standardizedDF.write.parquet(preparationResult.pathCfg.outputPath)
+ // Store performance metrics
+ // (record count, directory sizes, elapsed time, etc. to _INFO file metadata and performance file)
+ val stdDirSize = fsUtils.getDirectorySize(preparationResult.pathCfg.outputPath)
+ preparationResult.performance.finishMeasurement(stdDirSize, recordCount)
+ PerformanceMetricTools.addPerformanceMetricsToAtumMetadata(
+ spark,
+ "std",
+ preparationResult.pathCfg.inputPath,
+ preparationResult.pathCfg.outputPath,
+ menasCredentials.username,
+ args.mkString(" ")
+ )
+
+ cmd.rowTag.foreach(rowTag => Atum.setAdditionalInfo("xml_row_tag" -> rowTag))
+ cmd.csvDelimiter.foreach(delimiter => Atum.setAdditionalInfo("csv_delimiter" -> delimiter))
+
+ standardizedDF.writeInfoFile(preparationResult.pathCfg.outputPath)
+ writePerformanceMetrics(preparationResult.performance, cmd)
+ log.info(s"$sourceId finished successfully")
+ }
+ //scalastyle:on parameter.number
+
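+ /**
+ * Resolves the raw (input) and standardized (output) paths for this run.
+ */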
+ override protected def getPathCfg[T](cmd: JobConfigParser[T], dataset: Dataset, reportVersion: Int): PathConfig = {
+ val stdCmd = cmd.asInstanceOf[StandardizationParser[T]]
+ PathConfig(
+ inputPath = buildRawPath(stdCmd, dataset, reportVersion),
+ outputPath = getStandardizationPath(cmd, reportVersion)
+ )
+ }
+
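+ /**
+ * Builds the raw data path from the dataset's HDFS path, the optional folder prefix, the report date
+ * and the report version, unless an explicit raw path override is provided.
+ * For example, reportDate "2020-05-29" and reportVersion 2 yield "{hdfsPath}/{folderPrefix}/2020/05/29/v2".
+ */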
+ def buildRawPath[T](cmd: StandardizationParser[T], dataset: Dataset, reportVersion: Int): String = {
+ val dateTokens = cmd.reportDate.split("-")
+ cmd.rawPathOverride match {
+ case None =>
+ val folderSuffix = s"/${dateTokens(0)}/${dateTokens(1)}/${dateTokens(2)}/v$reportVersion"
+ cmd.folderPrefix match {
+ case None => s"${dataset.hdfsPath}$folderSuffix"
+ case Some(folderPrefix) => s"${dataset.hdfsPath}/$folderPrefix$folderSuffix"
+ }
+ case Some(rawPathOverride) => rawPathOverride
+ }
+ }
+
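+ // If the input is stored in a non-splittable format, convert it to a temporary Parquet copy
+ // so that Spark can parallelise the subsequent processing.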
+ private def ensureSplittable(df: DataFrame, path: String, schema: StructType)
+ (implicit spark: SparkSession, fsUtils: FileSystemVersionUtils) = {
+ if (fsUtils.isNonSplittable(path)) {
+ convertToSplittable(df, schema)
+ } else {
+ df
+ }
+ }
+
+ private def convertToSplittable(df: DataFrame, schema: StructType)
+ (implicit spark: SparkSession, fsUtils: FileSystemVersionUtils) = {
+ log.warn("Dataset is stored in a non-splittable format. This can have a severe performance impact.")
+
+ val tempParquetDir = s"/tmp/nonsplittable-to-parquet-${UUID.randomUUID()}"
+ log.warn(s"Converting to Parquet in temporary dir: $tempParquetDir")
+
+ // Rename source columns to their standardized names in case the original
+ // column names (e.g. containing spaces) would break the Parquet write
+ df.select(schema.fields.map { field: StructField =>
+ renameSourceColumn(df, field)
+ }: _*).write.parquet(tempParquetDir)
+
+ fsUtils.deleteOnExit(tempParquetDir)
+ // Reload from temp parquet and reverse column renaming above
+ val dfTmp = spark.read.parquet(tempParquetDir)
+ dfTmp.select(schema.fields.map { field: StructField =>
+ reverseRenameSourceColumn(dfTmp, field)
+ }: _*)
+ }
+
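+ // Maps a schema field to its original source column (taken from the MetadataKeys.SourceColumn
+ // metadata entry) so the data can be selected under the standardized field name.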
+ private def renameSourceColumn(df: DataFrame, field: StructField): Column = {
+ if (field.metadata.contains(MetadataKeys.SourceColumn)) {
+ val sourceColumnName = field.metadata.getString(MetadataKeys.SourceColumn)
+ log.info(s"schema field : ${field.name} : rename : $sourceColumnName")
+ df.col(sourceColumnName).as(field.name, field.metadata)
+ } else {
+ df.col(field.name)
+ }
+ }
+
+ private def reverseRenameSourceColumn(df: DataFrame, field: StructField): Column = {
+ if (field.metadata.contains(MetadataKeys.SourceColumn)) {
+ val sourceColumnName = field.metadata.getString(MetadataKeys.SourceColumn)
+ log.info(s"schema field : $sourceColumnName : reverse rename : ${field.name}")
+ df.col(field.name).as(sourceColumnName)
+ } else {
+ df.col(field.name)
+ }
+ }
+
+}
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationJob.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationJob.scala
index 02ce0de5b..0fbec4d61 100644
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationJob.scala
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationJob.scala
@@ -15,527 +15,38 @@
package za.co.absa.enceladus.standardization
-import java.io.{PrintWriter, StringWriter}
-import java.text.MessageFormat
-import java.time.Instant
-import java.util.UUID
-
-import com.typesafe.config.ConfigFactory
-import org.apache.spark.SPARK_VERSION
-import org.apache.spark.sql.types.{StructField, StructType}
-import org.apache.spark.sql.{Column, DataFrame, DataFrameReader, SparkSession}
-import org.slf4j.LoggerFactory
-import za.co.absa.atum.AtumImplicits
-import za.co.absa.atum.core.Atum
-import za.co.absa.enceladus.common.RecordIdGeneration.{IdType, _}
-import za.co.absa.enceladus.common._
-import za.co.absa.enceladus.common.plugin.PostProcessingService
-import za.co.absa.enceladus.common.plugin.menas.{MenasPlugin, MenasRunUrl}
-import za.co.absa.enceladus.common.version.SparkVersionGuard
+import org.apache.spark.sql.SparkSession
import za.co.absa.enceladus.dao.MenasDAO
-import za.co.absa.enceladus.dao.auth.MenasCredentials
-import za.co.absa.enceladus.dao.rest.{MenasConnectionStringParser, RestDaoFactory}
-import za.co.absa.enceladus.model.Dataset
-import za.co.absa.enceladus.standardization.interpreter.StandardizationInterpreter
-import za.co.absa.enceladus.standardization.interpreter.stages.PlainSchemaGenerator
-import za.co.absa.enceladus.utils.config.{ConfigReader, SecureConfig}
+import za.co.absa.enceladus.dao.rest.RestDaoFactory
+import za.co.absa.enceladus.standardization.config.StandardizationConfig
import za.co.absa.enceladus.utils.fs.FileSystemVersionUtils
-import za.co.absa.enceladus.utils.general.ProjectMetadataTools
-import za.co.absa.enceladus.utils.performance.{PerformanceMeasurer, PerformanceMetricTools}
-import za.co.absa.enceladus.utils.schema.{MetadataKeys, SchemaUtils, SparkUtils}
-import za.co.absa.enceladus.utils.time.TimeZoneNormalizer
+import za.co.absa.enceladus.utils.modules.SourcePhase
import za.co.absa.enceladus.utils.udf.UDFLibrary
-import za.co.absa.enceladus.utils.unicode.ParameterConversion._
-import za.co.absa.enceladus.utils.validation.ValidationException
-
-import scala.collection.immutable.HashMap
-import scala.util.control.NonFatal
-import scala.util.{Failure, Success, Try}
-object StandardizationJob {
- TimeZoneNormalizer.normalizeJVMTimeZone()
-
- private val log = LoggerFactory.getLogger(this.getClass)
- private val conf = ConfigFactory.load()
- private val confReader: ConfigReader = new ConfigReader(conf)
- private val menasBaseUrls = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri"))
- private final val SparkCSVReaderMaxColumnsDefault: Int = 20480
+object StandardizationJob extends StandardizationExecution {
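+ // Shared preparation, post-processing and teardown logic is inherited from StandardizationExecution
+ // and CommonJobExecution; this object only wires the Standardization-specific steps together.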
def main(args: Array[String]) {
- // This should be the first thing the app does to make secure Kafka work with our CA.
- // After Spring activates JavaX, it will be too late.
- SecureConfig.setSecureKafkaProperties(conf)
-
- SparkVersionGuard.fromDefaultSparkCompatibilitySettings.ensureSparkVersionCompatibility(SPARK_VERSION)
+ initialValidation()
- confReader.logEffectiveConfigProps(Constants.ConfigKeysToRedact)
-
- implicit val cmd: StdCmdConfig = StdCmdConfig.getCmdLineArguments(args)
+ implicit val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(args)
implicit val spark: SparkSession = obtainSparkSession()
implicit val fsUtils: FileSystemVersionUtils = new FileSystemVersionUtils(spark.sparkContext.hadoopConfiguration)
implicit val udfLib: UDFLibrary = new UDFLibrary
val menasCredentials = cmd.menasCredentialsFactory.getInstance()
implicit val dao: MenasDAO = RestDaoFactory.getInstance(menasCredentials, menasBaseUrls)
- dao.authenticate()
-
- val dataset = dao.getDataset(cmd.datasetName, cmd.datasetVersion)
- val schema: StructType = dao.getSchema(dataset.schemaName, dataset.schemaVersion)
- val reportVersion = getReportVersion(cmd, dataset)
- val pathCfg = getPathCfg(cmd, dataset, reportVersion)
- val recordIdGenerationStrategy = getRecordIdGenerationStrategyFromConfig(conf)
-
- log.info(s"input path: ${pathCfg.inputPath}")
- log.info(s"output path: ${pathCfg.outputPath}")
- // die if the output path exists
- if (fsUtils.hdfsExists(pathCfg.outputPath)) {
- throw new IllegalStateException(
- s"Path ${pathCfg.outputPath} already exists. Increment the run version, or delete ${pathCfg.outputPath}"
- )
- }
-
- // Enable Spline
- import za.co.absa.spline.core.SparkLineageInitializer._
- spark.enableLineageTracking()
-
- // Enable Control Framework
- enableControlFramework(pathCfg, cmd, reportVersion)
-
- // init performance measurer
- val performance = new PerformanceMeasurer(spark.sparkContext.appName)
- val dfAll: DataFrame = prepareDataFrame(schema, cmd, pathCfg.inputPath, dataset)
-
- try {
- executeStandardization(performance, dfAll, schema, cmd, menasCredentials, pathCfg, recordIdGenerationStrategy)
- cmd.performanceMetricsFile.foreach(this.writePerformanceMetrics(performance, _))
- log.info("Standardization finished successfully")
-
- // read written data from parquet directly
- val standardizedDf = spark.read.parquet(pathCfg.outputPath)
- val postProcessingService = getPostProcessingService(cmd, pathCfg, dataset, MenasPlugin.runNumber, Atum.getControlMeasure.runUniqueId)
- postProcessingService.onSaveOutput(standardizedDf) // all enabled postProcessors will be run with the std df
- } finally {
- postStandardizationSteps(cmd)
- }
- }
-
- private def getPostProcessingService(cmd: StdCmdConfig, pathCfg: PathCfg, dataset: Dataset,
- runNumber: Option[Int], uniqueRunId: Option[String]
- )(implicit fsUtils: FileSystemVersionUtils): PostProcessingService = {
- val runId = MenasPlugin.runNumber
-
- if (runId.isEmpty) {
- log.warn("No run number found, the Run URL cannot be properly reported!")
- }
-
- // reporting the UI url(s) - if more than one, its comma-separated
- val runUrl: Option[String] = runId.map { runNumber =>
- menasBaseUrls.map { menasBaseUrl =>
- MenasRunUrl.getMenasUiRunUrl(menasBaseUrl, dataset.name, dataset.version, runNumber)
- }.mkString(",")
- }
-
- PostProcessingService.forStandardization(conf, dataset.name, dataset.version, cmd.reportDate,
- getReportVersion(cmd, dataset), pathCfg.outputPath, Atum.getControlMeasure.metadata.sourceApplication, runUrl,
- runId, uniqueRunId, Instant.now)
- }
-
- private def getReportVersion(cmd: StdCmdConfig, dataset: Dataset)(implicit fsUtils: FileSystemVersionUtils): Int = {
- cmd.reportVersion match {
- case Some(version) => version
- case None =>
- val newVersion = fsUtils.getLatestVersion(dataset.hdfsPublishPath, cmd.reportDate) + 1
- log.warn(s"Report version not provided, inferred report version: $newVersion")
- log.warn("This is an EXPERIMENTAL feature.")
- log.warn(" -> It can lead to issues when running multiple jobs on a dataset concurrently.")
- log.warn(" -> It may not work as desired when there are gaps in the versions of the data being landed.")
- newVersion
- }
- }
-
- private def getPathCfg(cmd: StdCmdConfig, dataset: Dataset, reportVersion: Int): PathCfg = {
- val dateTokens = cmd.reportDate.split("-")
- PathCfg(
- inputPath = buildRawPath(cmd, dataset, dateTokens, reportVersion),
- outputPath = MessageFormat.format(conf.getString("standardized.hdfs.path"),
- cmd.datasetName,
- cmd.datasetVersion.toString,
- cmd.reportDate,
- reportVersion.toString)
- )
- }
-
- private def obtainSparkSession()(implicit cmd: StdCmdConfig): SparkSession = {
- val enceladusVersion = ProjectMetadataTools.getEnceladusVersion
- log.info(s"Enceladus version $enceladusVersion")
- val reportVersion = cmd.reportVersion.map(_.toString).getOrElse("")
- val spark = SparkSession.builder()
- .appName(s"Standardisation $enceladusVersion ${cmd.datasetName} ${cmd.datasetVersion} ${cmd.reportDate} $reportVersion")
- .getOrCreate()
- TimeZoneNormalizer.normalizeSessionTimeZone(spark)
- spark
- }
-
- /**
- * Returns a Spark reader with all format-specific options applied.
- * Options are provided by command line parameters.
- *
- * @param cmd Command line parameters containing format-specific options
- * @param dataset A dataset definition
- * @param numberOfColumns (Optional) number of columns, enables reading CSV files with the number of columns
- * larger than Spark default
- * @return The updated dataframe reader
- */
- def getFormatSpecificReader(cmd: StdCmdConfig, dataset: Dataset, numberOfColumns: Int = 0)
- (implicit spark: SparkSession, dao: MenasDAO): DataFrameReader = {
- val dfReader = spark.read.format(cmd.rawFormat)
- // applying format specific options
- val options = getCobolOptions(cmd, dataset) ++
- getGenericOptions(cmd) ++
- getXmlOptions(cmd) ++
- getCsvOptions(cmd, numberOfColumns) ++
- getFixedWidthOptions(cmd)
-
- // Applying all the options
- options.foldLeft(dfReader) { (df, optionPair) =>
- optionPair match {
- case (key, Some(value)) =>
- value match {
- // Handle all .option() overloads
- case StringParameter(s) => df.option(key, s)
- case BooleanParameter(b) => df.option(key, b)
- case LongParameter(l) => df.option(key, l)
- case DoubleParameter(d) => df.option(key, d)
- }
- case (_, None) => df
- }
- }
- }
-
- private def getGenericOptions(cmd: StdCmdConfig): HashMap[String, Option[RawFormatParameter]] = {
- val mode = if (cmd.failOnInputNotPerSchema) {
- "FAILFAST"
- } else {
- "PERMISSIVE"
- }
- HashMap(
- "charset" -> cmd.charset.map(StringParameter),
- "mode" -> Option(StringParameter(mode))
- )
- }
-
- private def getXmlOptions(cmd: StdCmdConfig): HashMap[String, Option[RawFormatParameter]] = {
- if (cmd.rawFormat.equalsIgnoreCase("xml")) {
- HashMap("rowtag" -> cmd.rowTag.map(StringParameter))
- } else {
- HashMap()
- }
- }
-
- private def getCsvOptions(cmd: StdCmdConfig, numberOfColumns: Int = 0): HashMap[String, Option[RawFormatParameter]] = {
- if (cmd.rawFormat.equalsIgnoreCase("csv")) {
- HashMap(
- "delimiter" -> cmd.csvDelimiter.map(s => StringParameter(s.includingUnicode.includingNone)),
- "header" -> cmd.csvHeader.map(BooleanParameter),
- "quote" -> cmd.csvQuote.map(s => StringParameter(s.includingUnicode.includingNone)),
- "escape" -> cmd.csvEscape.map(s => StringParameter(s.includingUnicode.includingNone)),
- // increase the default limit on the number of columns if needed
- // default is set at org.apache.spark.sql.execution.datasources.csv.CSVOptions maxColumns
- "maxColumns" -> {if (numberOfColumns > SparkCSVReaderMaxColumnsDefault) Some(LongParameter(numberOfColumns)) else None}
- )
- } else {
- HashMap()
- }
- }
-
- private def getFixedWidthOptions(cmd: StdCmdConfig): HashMap[String, Option[RawFormatParameter]] = {
- if (cmd.rawFormat.equalsIgnoreCase("fixed-width")) {
- HashMap("trimValues" -> cmd.fixedWidthTrimValues.map(BooleanParameter))
- } else {
- HashMap()
- }
- }
-
- private def getCobolOptions(cmd: StdCmdConfig, dataset: Dataset)(implicit dao: MenasDAO): HashMap[String, Option[RawFormatParameter]] = {
- if (cmd.rawFormat.equalsIgnoreCase("cobol")) {
- val cobolOptions = cmd.cobolOptions.getOrElse(CobolOptions())
- val isXcomOpt = if (cobolOptions.isXcom) Some(true) else None
- val isTextOpt = if (cobolOptions.isText) Some(true) else None
- val isAscii = cobolOptions.encoding.exists(_.equalsIgnoreCase("ascii"))
- // For ASCII files --charset is converted into Cobrix "ascii_charset" option
- // For EBCDIC files --charset is converted into Cobrix "ebcdic_code_page" option
- HashMap(
- getCopybookOption(cobolOptions, dataset),
- "is_xcom" -> isXcomOpt.map(BooleanParameter),
- "is_text" -> isTextOpt.map(BooleanParameter),
- "string_trimming_policy" -> cobolOptions.trimmingPolicy.map(StringParameter),
- "encoding" -> cobolOptions.encoding.map(StringParameter),
- "ascii_charset" -> cmd.charset.flatMap(charset => if (isAscii) Option(StringParameter(charset)) else None),
- "ebcdic_code_page" -> cmd.charset.flatMap(charset => if (!isAscii) Option(StringParameter(charset)) else None),
- "schema_retention_policy" -> Some(StringParameter("collapse_root"))
- )
- } else {
- HashMap()
- }
- }
-
- private def getCopybookOption(opts: CobolOptions, dataset: Dataset)(implicit dao: MenasDAO): (String, Option[RawFormatParameter]) = {
- val copybook = opts.copybook
- if (copybook.isEmpty) {
- log.info("Copybook location is not provided via command line - fetching the copybook attached to the schema...")
- val copybookContents = dao.getSchemaAttachment(dataset.schemaName, dataset.schemaVersion)
- log.info(s"Applying the following copybook:\n$copybookContents")
- ("copybook_contents", Option(StringParameter(copybookContents)))
- } else {
- log.info(s"Use copybook at $copybook")
- ("copybook", Option(StringParameter(copybook)))
- }
- }
-
- private def prepareDataFrame(schema: StructType,
- cmd: StdCmdConfig,
- path: String,
- dataset: Dataset)
- (implicit spark: SparkSession,
- fsUtils: FileSystemVersionUtils,
- dao: MenasDAO): DataFrame = {
- val numberOfColumns = schema.fields.length
- val dfReaderConfigured = getFormatSpecificReader(cmd, dataset, numberOfColumns)
-
- val readerWithOptSchema = cmd.rawFormat.toLowerCase() match {
- case "parquet" | "cobol" =>
- dfReaderConfigured
- case _ =>
- val optColumnNameOfCorruptRecord = getColumnNameOfCorruptRecord(schema, cmd)
- val inputSchema = PlainSchemaGenerator.generateInputSchema(schema, optColumnNameOfCorruptRecord)
- dfReaderConfigured.schema(inputSchema)
- }
-
- val dfWithSchema = readerWithOptSchema.load(s"$path/*")
- ensureSplittable(dfWithSchema, path, schema)
- }
-
- private def getColumnNameOfCorruptRecord(schema: StructType, cmd: StdCmdConfig)
- (implicit spark: SparkSession): Option[String] = {
- // SparkUtils.setUniqueColumnNameOfCorruptRecord is called even if result is not used to avoid conflict
- val columnNameOfCorruptRecord = SparkUtils.setUniqueColumnNameOfCorruptRecord(spark, schema)
- if (cmd.rawFormat.equalsIgnoreCase("fixed-width") || cmd.failOnInputNotPerSchema) {
- None
- } else {
- Option(columnNameOfCorruptRecord)
- }
- }
-
- //scalastyle:off parameter.number
- private def executeStandardization(performance: PerformanceMeasurer,
- dfAll: DataFrame,
- schema: StructType,
- cmd: StdCmdConfig,
- menasCredentials: MenasCredentials,
- pathCfg: PathCfg,
- recordIdGenerationStrategy: IdType)
- (implicit spark: SparkSession, udfLib: UDFLibrary, fsUtils: FileSystemVersionUtils): Unit = {
- //scalastyle:on parameter.number
- val rawDirSize: Long = fsUtils.getDirectorySize(pathCfg.inputPath)
- performance.startMeasurement(rawDirSize)
-
- handleControlInfoValidation()
-
- PerformanceMetricTools.addJobInfoToAtumMetadata("std", pathCfg.inputPath, pathCfg.outputPath,
- menasCredentials.username, cmd.cmdLineArgs.mkString(" "))
- val standardizedDF = try {
- StandardizationInterpreter.standardize(dfAll, schema, cmd.rawFormat, cmd.failOnInputNotPerSchema, recordIdGenerationStrategy)
- } catch {
- case e@ValidationException(msg, errors) =>
- AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError("Schema Validation", s"$msg\nDetails: ${
- errors.mkString("\n")
- }", "")
- throw e
- case NonFatal(e) if !e.isInstanceOf[ValidationException] =>
- val sw = new StringWriter
- e.printStackTrace(new PrintWriter(sw))
- AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError("Standardization", e.getMessage, sw.toString)
- throw e
- }
-
- //register renames with ATUM
- import za.co.absa.atum.AtumImplicits._
- val fieldRenames = SchemaUtils.getRenamesInSchema(schema)
- fieldRenames.foreach {
- case (destinationName, sourceName) => standardizedDF.registerColumnRename(sourceName, destinationName)
- }
-
- standardizedDF.setCheckpoint("Standardization - End", persistInDatabase = false)
-
- val recordCount = standardizedDF.lastCheckpointRowCount match {
- case None => standardizedDF.count
- case Some(p) => p
- }
- if (recordCount == 0) { handleEmptyOutputAfterStandardization() }
-
- standardizedDF.write.parquet(pathCfg.outputPath)
- // Store performance metrics
- // (record count, directory sizes, elapsed time, etc. to _INFO file metadata and performance file)
- val stdDirSize = fsUtils.getDirectorySize(pathCfg.outputPath)
- performance.finishMeasurement(stdDirSize, recordCount)
- cmd.rowTag.foreach(rowTag => Atum.setAdditionalInfo("xml_row_tag" -> rowTag))
- if (cmd.csvDelimiter.isDefined) {
- cmd.csvDelimiter.foreach(delimiter => Atum.setAdditionalInfo("csv_delimiter" -> delimiter))
- }
- PerformanceMetricTools.addPerformanceMetricsToAtumMetadata(spark, "std", pathCfg.inputPath, pathCfg.outputPath,
- menasCredentials.username, cmd.cmdLineArgs.mkString(" "))
- standardizedDF.writeInfoFile(pathCfg.outputPath)
- }
-
- private def handleControlInfoValidation(): Unit = {
- ControlInfoValidation.addRawAndSourceRecordCountsToMetadata() match {
- case Failure(ex: za.co.absa.enceladus.utils.validation.ValidationException) => {
- val confEntry = "control.info.validation"
- conf.getString(confEntry) match {
- case "strict" => throw ex
- case "warning" => log.warn(ex.msg)
- case "none" =>
- case _ => throw new RuntimeException(s"Invalid $confEntry value")
- }
- }
- case Failure(ex) => throw ex
- case Success(_) =>
- }
- }
+ val preparationResult = prepareJob()
+ val schema = prepareStandardization(args, menasCredentials, preparationResult)
+ val inputData = readStandardizationInputData(schema, cmd, preparationResult.pathCfg.inputPath, preparationResult.dataset)
- private def handleEmptyOutputAfterStandardization()(implicit spark: SparkSession): Unit = {
- import za.co.absa.atum.core.Constants._
-
- val areCountMeasurementsAllZero = Atum.getControlMeasure.checkpoints
- .flatMap(checkpoint =>
- checkpoint.controls.filter(control =>
- control.controlName.equalsIgnoreCase(controlTypeRecordCount)))
- .forall(m => Try(m.controlValue.toString.toDouble).toOption.contains(0D))
-
- if (areCountMeasurementsAllZero) {
- log.warn("Empty output after running Standardization. Previous checkpoints show this is correct.")
- } else {
- val errMsg = "Empty output after running Standardization, while previous checkpoints show non zero record count"
- AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError("Standardization", errMsg, "")
- throw new IllegalStateException(errMsg)
- }
- }
-
- private def ensureSplittable(df: DataFrame, path: String, schema: StructType)
- (implicit spark: SparkSession, fsUtils: FileSystemVersionUtils) = {
- if (fsUtils.isNonSplittable(path)) {
- convertToSplittable(df, path, schema)
- } else {
- df
- }
- }
-
- private def convertToSplittable(df: DataFrame, path: String, schema: StructType)
- (implicit spark: SparkSession, fsUtils: FileSystemVersionUtils) = {
- log.warn("Dataset is stored in a non-splittable format. This can have a severe performance impact.")
-
- val tempParquetDir = s"/tmp/nonsplittable-to-parquet-${UUID.randomUUID()}"
- log.warn(s"Converting to Parquet in temporary dir: $tempParquetDir")
-
- // Handle renaming of source columns in case there are columns
- // that will break because of issues in column names like spaces
- df.select(schema.fields.map { field: StructField =>
- renameSourceColumn(df, field)
- }: _*).write.parquet(tempParquetDir)
-
- fsUtils.deleteOnExit(tempParquetDir)
- // Reload from temp parquet and reverse column renaming above
- val dfTmp = spark.read.parquet(tempParquetDir)
- dfTmp.select(schema.fields.map { field: StructField =>
- reverseRenameSourceColumn(dfTmp, field)
- }: _*)
- }
-
- private def renameSourceColumn(df: DataFrame, field: StructField): Column = {
- if (field.metadata.contains(MetadataKeys.SourceColumn)) {
- val sourceColumnName = field.metadata.getString(MetadataKeys.SourceColumn)
- log.info(s"schema field : ${field.name} : rename : $sourceColumnName")
- df.col(sourceColumnName).as(field.name, field.metadata)
- } else {
- df.col(field.name)
- }
- }
-
- private def reverseRenameSourceColumn(df: DataFrame, field: StructField): Column = {
- if (field.metadata.contains(MetadataKeys.SourceColumn)) {
- val sourceColumnName = field.metadata.getString(MetadataKeys.SourceColumn)
- log.info(s"schema field : $sourceColumnName : reverse rename : ${field.name}")
- df.col(field.name).as(sourceColumnName)
- } else {
- df.col(field.name)
- }
- }
-
- private def enableControlFramework(pathCfg: PathCfg, cmd: StdCmdConfig, reportVersion: Int)
- (implicit spark: SparkSession, dao: MenasDAO): Unit = {
- // Enable Control Framework
- import za.co.absa.atum.AtumImplicits.SparkSessionWrapper
- spark.enableControlMeasuresTracking(s"${pathCfg.inputPath}/_INFO").setControlMeasuresWorkflow("Standardization")
-
- // Enable control framework performance optimization for pipeline-like jobs
- Atum.setAllowUnpersistOldDatasets(true)
-
- // Enable non-default persistence storage level if provided in the command line
- cmd.persistStorageLevel.foreach(Atum.setCachingStorageLevel)
-
- // Enable Menas plugin for Control Framework
- MenasPlugin.enableMenas(
- conf,
- cmd.datasetName,
- cmd.datasetVersion,
- cmd.reportDate,
- reportVersion,
- isJobStageOnly = true,
- generateNewRun = true)
-
- // Add report date and version (aka Enceladus info date and version) to Atum's metadata
- Atum.setAdditionalInfo(Constants.InfoDateColumn -> cmd.reportDate)
- Atum.setAdditionalInfo(Constants.InfoVersionColumn -> reportVersion.toString)
-
- // Add the raw format of the input file(s) to Atum's metadta as well
- Atum.setAdditionalInfo("raw_format" -> cmd.rawFormat)
- }
-
- private def writePerformanceMetrics(performance: PerformanceMeasurer, fileName: String): Unit = {
try {
- performance.writeMetricsToFile(fileName)
- } catch {
- case NonFatal(e) => log.error(s"Unable to write performance metrics to file '$fileName': ${e.getMessage}")
- }
- }
+ val result = standardize(inputData, schema, cmd)
- private def postStandardizationSteps(cmd: StdCmdConfig): Unit = {
- Atum.getControlMeasure.runUniqueId
+ processStandardizationResult(args, result, preparationResult, schema, cmd, menasCredentials)
- val name = cmd.datasetName
- val version = cmd.datasetVersion
- MenasPlugin.runNumber.foreach { runNumber =>
- menasBaseUrls.foreach { menasBaseUrl =>
- val apiUrl = MenasRunUrl.getMenasApiRunUrl(menasBaseUrl, name, version, runNumber)
- val uiUrl = MenasRunUrl.getMenasUiRunUrl(menasBaseUrl, name, version, runNumber)
-
- log.info(s"Menas API Run URL: $apiUrl")
- log.info(s"Menas UI Run URL: $uiUrl")
- }
- }
- }
-
- def buildRawPath(cmd: StdCmdConfig, dataset: Dataset, dateTokens: Array[String], reportVersion: Int): String = {
- cmd.rawPathOverride match {
- case None =>
- val folderSuffix = s"/${dateTokens(0)}/${dateTokens(1)}/${dateTokens(2)}/v$reportVersion"
- cmd.folderPrefix match {
- case None => s"${dataset.hdfsPath}$folderSuffix"
- case Some(folderPrefix) => s"${dataset.hdfsPath}/$folderPrefix$folderSuffix"
- }
- case Some(rawPathOverride) => rawPathOverride
+ runPostProcessing(SourcePhase.Standardization, preparationResult, cmd)
+ } finally {
+ finishJob(cmd)
}
}
-
- private final case class PathCfg(inputPath: String, outputPath: String)
}
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationPropertiesProvider.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationPropertiesProvider.scala
new file mode 100644
index 000000000..832cfda1d
--- /dev/null
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationPropertiesProvider.scala
@@ -0,0 +1,153 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.standardization
+
+import org.apache.spark.sql.{DataFrameReader, SparkSession}
+import org.slf4j.{Logger, LoggerFactory}
+import za.co.absa.enceladus.common._
+import za.co.absa.enceladus.dao.MenasDAO
+import za.co.absa.enceladus.model.Dataset
+import za.co.absa.enceladus.standardization.config.StandardizationConfig
+import za.co.absa.enceladus.utils.unicode.ParameterConversion._
+
+import scala.collection.immutable.HashMap
+
+/**
+ * Provides a Spark DataFrameReader configured with format-specific options derived from the command-line parameters
+ */
+class StandardizationPropertiesProvider {
+ private val log: Logger = LoggerFactory.getLogger(this.getClass)
+ private final val SparkCSVReaderMaxColumnsDefault: Int = 20480
+
+ /**
+ * Returns a Spark reader with all format-specific options applied.
+ * Options are provided by command line parameters.
+ *
+ * @param cmd Command line parameters containing format-specific options
+ * @param dataset A dataset definition
+ * @param numberOfColumns (Optional) number of columns, enables reading CSV files with the number of columns
+ * larger than Spark default
+ * @return The updated dataframe reader
+ */
+ def getFormatSpecificReader(cmd: StandardizationConfig, dataset: Dataset, numberOfColumns: Int = 0)
+ (implicit spark: SparkSession, dao: MenasDAO): DataFrameReader = {
+ val dfReader = spark.read.format(cmd.rawFormat)
+ // applying format specific options
+ val options = getCobolOptions(cmd, dataset) ++
+ getGenericOptions(cmd) ++
+ getXmlOptions(cmd) ++
+ getCsvOptions(cmd, numberOfColumns) ++
+ getFixedWidthOptions(cmd)
+
+ // Applying all the options
+ options.foldLeft(dfReader) { (df, optionPair) =>
+ optionPair match {
+ case (key, Some(value)) =>
+ value match {
+ // Handle all .option() overloads
+ case StringParameter(s) => df.option(key, s)
+ case BooleanParameter(b) => df.option(key, b)
+ case LongParameter(l) => df.option(key, l)
+ case DoubleParameter(d) => df.option(key, d)
+ }
+ case (_, None) => df
+ }
+ }
+ }
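+ // Usage sketch (assuming a CSV dataset): the reader returned here would carry the generic "mode"
+ // (and, if set, "charset") options plus the CSV-specific "delimiter", "header", "quote", "escape"
+ // and, when the column count is large enough, "maxColumns" options:
+ //   val reader = new StandardizationPropertiesProvider().getFormatSpecificReader(cmd, dataset, schema.fields.length)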
+
+ private def getGenericOptions(cmd: StandardizationConfig): HashMap[String, Option[RawFormatParameter]] = {
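+ // FAILFAST makes Spark abort the read on records that do not fit the schema,
+ // while PERMISSIVE keeps them and relies on the corrupt-record column to capture the raw value.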
+ val mode = if (cmd.failOnInputNotPerSchema) {
+ "FAILFAST"
+ } else {
+ "PERMISSIVE"
+ }
+ HashMap(
+ "charset" -> cmd.charset.map(StringParameter),
+ "mode" -> Option(StringParameter(mode))
+ )
+ }
+
+ private def getXmlOptions(cmd: StandardizationConfig): HashMap[String, Option[RawFormatParameter]] = {
+ if (cmd.rawFormat.equalsIgnoreCase("xml")) {
+ HashMap("rowtag" -> cmd.rowTag.map(StringParameter))
+ } else {
+ HashMap()
+ }
+ }
+
+ private def getCsvOptions(cmd: StandardizationConfig, numberOfColumns: Int = 0): HashMap[String, Option[RawFormatParameter]] = {
+ if (cmd.rawFormat.equalsIgnoreCase("csv")) {
+ HashMap(
+ "delimiter" -> cmd.csvDelimiter.map(s => StringParameter(s.includingUnicode.includingNone)),
+ "header" -> cmd.csvHeader.map(BooleanParameter),
+ "quote" -> cmd.csvQuote.map(s => StringParameter(s.includingUnicode.includingNone)),
+ "escape" -> cmd.csvEscape.map(s => StringParameter(s.includingUnicode.includingNone)),
+ // increase the default limit on the number of columns if needed
+ // default is set at org.apache.spark.sql.execution.datasources.csv.CSVOptions maxColumns
+ "maxColumns" -> {
+ if (numberOfColumns > SparkCSVReaderMaxColumnsDefault) Some(LongParameter(numberOfColumns)) else None
+ }
+ )
+ } else {
+ HashMap()
+ }
+ }
+
+ private def getFixedWidthOptions(cmd: StandardizationConfig): HashMap[String, Option[RawFormatParameter]] = {
+ if (cmd.rawFormat.equalsIgnoreCase("fixed-width")) {
+ HashMap("trimValues" -> cmd.fixedWidthTrimValues.map(BooleanParameter))
+ } else {
+ HashMap()
+ }
+ }
+
+ private def getCobolOptions(cmd: StandardizationConfig, dataset: Dataset)(implicit dao: MenasDAO): HashMap[String, Option[RawFormatParameter]] = {
+ if (cmd.rawFormat.equalsIgnoreCase("cobol")) {
+ val cobolOptions = cmd.cobolOptions.getOrElse(CobolOptions())
+ val isXcomOpt = if (cobolOptions.isXcom) Some(true) else None
+ val isTextOpt = if (cobolOptions.isText) Some(true) else None
+ val isAscii = cobolOptions.encoding.exists(_.equalsIgnoreCase("ascii"))
+ // For ASCII files --charset is converted into Cobrix "ascii_charset" option
+ // For EBCDIC files --charset is converted into Cobrix "ebcdic_code_page" option
+ HashMap(
+ getCopybookOption(cobolOptions, dataset),
+ "is_xcom" -> isXcomOpt.map(BooleanParameter),
+ "is_text" -> isTextOpt.map(BooleanParameter),
+ "string_trimming_policy" -> cobolOptions.trimmingPolicy.map(StringParameter),
+ "encoding" -> cobolOptions.encoding.map(StringParameter),
+ "ascii_charset" -> cmd.charset.flatMap(charset => if (isAscii) Option(StringParameter(charset)) else None),
+ "ebcdic_code_page" -> cmd.charset.flatMap(charset => if (!isAscii) Option(StringParameter(charset)) else None),
+ "schema_retention_policy" -> Some(StringParameter("collapse_root"))
+ )
+ } else {
+ HashMap()
+ }
+ }
+
+ private def getCopybookOption(opts: CobolOptions, dataset: Dataset)(implicit dao: MenasDAO): (String, Option[RawFormatParameter]) = {
+ val copybook = opts.copybook
+ if (copybook.isEmpty) {
+ log.info("Copybook location is not provided via command line - fetching the copybook attached to the schema...")
+ val copybookContents = dao.getSchemaAttachment(dataset.schemaName, dataset.schemaVersion)
+ log.info(s"Applying the following copybook:\n$copybookContents")
+ ("copybook_contents", Option(StringParameter(copybookContents)))
+ } else {
+ log.info(s"Use copybook at $copybook")
+ ("copybook", Option(StringParameter(copybook)))
+ }
+ }
+
+}
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StdCmdConfig.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StdCmdConfig.scala
deleted file mode 100644
index ce7dabd34..000000000
--- a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StdCmdConfig.scala
+++ /dev/null
@@ -1,316 +0,0 @@
-/*
- * Copyright 2018 ABSA Group Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package za.co.absa.enceladus.standardization
-
-import org.apache.spark.storage.StorageLevel
-import scopt.OptionParser
-import za.co.absa.enceladus.dao.auth._
-
-import scala.util.matching.Regex
-
-/**
- * This is a class for configuration provided by the command line parameters
- *
- * Note: scopt requires all fields to have default values.
- * Even if a field is mandatory it needs a default value.
- */
-case class StdCmdConfig(
- cmdLineArgs: Array[String],
- datasetName: String = "",
- datasetVersion: Int = 1,
- reportDate: String = "",
- reportVersion: Option[Int] = None,
- rawFormat: String = "xml",
- menasCredentialsFactory: MenasCredentialsFactory = InvalidMenasCredentialsFactory,
- charset: Option[String] = None,
- rowTag: Option[String] = None,
- csvDelimiter: Option[String] = None,
- csvHeader: Option[Boolean] = Some(false),
- csvQuote: Option[String] = None,
- csvEscape: Option[String] = None,
- cobolOptions: Option[CobolOptions] = None,
- fixedWidthTrimValues: Option[Boolean] = Some(false),
- performanceMetricsFile: Option[String] = None,
- rawPathOverride: Option[String] = None,
- folderPrefix: Option[String] = None,
- persistStorageLevel: Option[StorageLevel] = None,
- failOnInputNotPerSchema: Boolean = false
- )
-
-object StdCmdConfig {
-
- def getCmdLineArguments(args: Array[String]): StdCmdConfig = {
- val parser = new CmdParser("spark-submit [spark options] StandardizationBundle.jar")
-
- val optionCmd = parser.parse(args, StdCmdConfig(args))
- if (optionCmd.isEmpty) {
- // Wrong arguments provided, the message is already displayed
- System.exit(1)
- }
- optionCmd.get
- }
-
- private class CmdParser(programName: String) extends OptionParser[StdCmdConfig](programName) {
- head("\nStandardization", "")
- var rawFormat: Option[String] = None
-
- opt[String]('D', "dataset-name").required().action((value, config) =>
- config.copy(datasetName = value)).text("Dataset name")
-
- opt[Int]('d', "dataset-version").required().action((value, config) =>
- config.copy(datasetVersion = value)).text("Dataset version")
- .validate(value =>
- if (value > 0) {
- success
- } else {
- failure("Option --dataset-version must be >0")
- })
-
- val reportDateMatcher: Regex = "^\\d{4}-\\d{2}-\\d{2}$".r
- opt[String]('R', "report-date").required().action((value, config) =>
- config.copy(reportDate = value)).text("Report date in 'yyyy-MM-dd' format")
- .validate(value =>
- reportDateMatcher.findFirstIn(value) match {
- case None => failure(s"Match error in '$value'. Option --report-date expects a date in 'yyyy-MM-dd' format")
- case _ => success
- })
-
- private var credsFile: Option[String] = None
- private var keytabFile: Option[String] = None
- opt[String]("menas-credentials-file").hidden.optional().action({ (file, config) =>
- credsFile = Some(file)
- config.copy(menasCredentialsFactory = new MenasPlainCredentialsFactory(file))
- }).text("Path to Menas credentials config file.").validate(path =>
- if (keytabFile.isDefined) {
- failure("Only one authentication method is allow at a time")
- } else {
- success
- })
-
- opt[String]("menas-auth-keytab").optional().action({ (file, config) =>
- keytabFile = Some(file)
- config.copy(menasCredentialsFactory = new MenasKerberosCredentialsFactory(file))
- }).text("Path to keytab file used for authenticating to menas").validate({ file =>
- if (credsFile.isDefined) {
- failure("Only one authentication method is allowed at a time")
- } else {
- success
- }
- })
-
- opt[Int]('r', "report-version").optional().action((value, config) =>
- config.copy(reportVersion = Some(value)))
- .text("Report version. If not provided, it is inferred based on the publish path (it's an EXPERIMENTAL feature)")
- .validate(value =>
- if (value > 0) {
- success
- } else {
- failure("Option --report-version must be >0")
- })
-
- opt[String]('f', "raw-format").required().action((value, config) => {
- rawFormat = Some(value)
- config.copy(rawFormat = value)
- }).text("format of the raw data (csv, xml, parquet,fixed-width, etc.)")
-
- opt[String]("charset").optional().action((value, config) =>
- config.copy(charset = Some(value))).text("use the specific charset (default is UTF-8)")
- .validate(value =>
- if (rawFormat.isDefined &&
- (rawFormat.get.equalsIgnoreCase("xml") ||
- rawFormat.get.equalsIgnoreCase("csv") ||
- rawFormat.get.equalsIgnoreCase("json") ||
- rawFormat.get.equalsIgnoreCase("cobol"))) {
- success
- } else {
- failure("The --charset option is supported only for CSV, JSON, XML and COBOL")
- })
-
- opt[String]("row-tag").optional().action((value, config) =>
- config.copy(rowTag = Some(value))).text("use the specific row tag instead of 'ROW' for XML format")
- .validate(value =>
- if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("xml")) {
- success
- } else {
- failure("The --row-tag option is supported only for XML raw data format")
- })
-
- opt[String]("delimiter").optional().action((value, config) =>
- config.copy(csvDelimiter = Some(value))).text("use the specific delimiter instead of ',' for CSV format")
- .validate(value =>
- if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("csv")) {
- success
- } else {
- failure("The --delimiter option is supported only for CSV raw data format")
- })
-
- opt[String]("csv-quote").optional().action((value, config) =>
- config.copy(csvQuote = Some(value)))
- .text("use the specific quote character for creating CSV fields that may contain delimiter character(s) (default is '\"')")
- .validate(value =>
- if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("csv")) {
- success
- } else {
- failure("The --csv-quote option is supported only for CSV raw data format")
- })
-
- opt[String]("csv-escape").optional().action((value, config) =>
- config.copy(csvEscape = Some(value)))
- .text("use the specific escape character for CSV fields (default is '\\')")
- .validate(value =>
- if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("csv")) {
- success
- } else {
- failure("The --csv-escape option is supported only for CSV raw data format")
- })
-
- // no need for validation for boolean since scopt itself will do
- opt[Boolean]("header").optional().action((value, config) =>
- config.copy(csvHeader = Some(value))).text("use the header option to consider CSV header")
- .validate(value =>
- if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("csv")) {
- success
- } else {
- failure("The --header option is supported only for CSV ")
- })
-
- opt[Boolean]("trimValues").optional().action((value, config) =>
- config.copy(fixedWidthTrimValues = Some(value))).text("use --trimValues option to trim values in fixed width file")
- .validate(value =>
- if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("fixed-width")) {
- success
- } else {
- failure("The --trimValues option is supported only for fixed-width files ")
- })
-
- opt[Boolean]("strict-schema-check").optional().action((value, config) =>
- config.copy(failOnInputNotPerSchema = value))
- .text("use --strict-schema-check option to fail or proceed over rows not adhering to the schema (with error in errCol)")
-
- processCobolCmdOptions()
-
- opt[String]("performance-file").optional().action((value, config) =>
- config.copy(performanceMetricsFile = Some(value))).text("produce a performance metrics file at the given location (local filesystem)")
-
- opt[String]("debug-set-raw-path").optional().hidden().action((value, config) =>
- config.copy(rawPathOverride = Some(value))).text("override the path of the raw data (used internally for performance tests)")
-
- opt[String]("folder-prefix").optional().action((value, config) =>
- config.copy(folderPrefix = Some(value))).text("Adds a folder prefix before the date tokens")
-
- opt[String]("persist-storage-level").optional().action((value, config) =>
- config.copy(persistStorageLevel = Some(StorageLevel.fromString(value))))
- .text("Specifies persistence storage level to use when processing data. Spark's default is MEMORY_AND_DISK.")
-
- help("help").text("prints this usage text")
-
- checkConfig { config =>
- config.menasCredentialsFactory match {
- case InvalidMenasCredentialsFactory => failure("No authentication method specified (e.g. --menas-auth-keytab)")
- case _ => success
- }
- }
-
- private def processCobolCmdOptions(): Unit = {
- opt[String]("copybook").optional().action((value, config) => {
- config.copy(cobolOptions = cobolSetCopybook(config.cobolOptions, value))
- }).text("Path to a copybook for COBOL data format")
- .validate(value =>
- if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("cobol")) {
- success
- } else {
- failure("The --copybook option is supported only for COBOL data format")
- }
- )
-
- opt[Boolean]("is-xcom").optional().action((value, config) => {
- config.copy(cobolOptions = cobolSetIsXcom(config.cobolOptions, value))
- }).text("Does a mainframe file in COBOL format contain XCOM record headers")
- .validate(value =>
- if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("cobol")) {
- success
- } else {
- failure("The --is-xcom option is supported only for COBOL data format")
- })
-
- opt[Boolean]("cobol-is-text").optional().action((value, config) => {
- config.copy(cobolOptions = cobolSetIsText(config.cobolOptions, value))
- }).text("Specifies if the mainframe file is ASCII text file")
- .validate(value =>
- if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("cobol")) {
- success
- } else {
- failure("The --cobol-is-text option is supported only for COBOL data format")
- })
-
- opt[String]("cobol-encoding").optional().action((value, config) => {
- config.copy(cobolOptions = cobolSetEncoding(config.cobolOptions, value))
- }).text("Specify encoding of mainframe files (ascii or ebcdic)")
- .validate(value =>
- if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("cobol")) {
- success
- } else {
- failure("The --cobol-encoding option is supported only for COBOL data format")
- })
-
- opt[String]("cobol-trimming-policy").optional().action((value, config) => {
- config.copy(cobolOptions = cobolSetTrimmingPolicy(config.cobolOptions, value))
- }).text("Specify string trimming policy for mainframe files (none, left, right, both)")
- .validate(value =>
- if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("cobol")) {
- success
- } else {
- failure("The --cobol-trimming-policy option is supported only for COBOL data format")
- })
- }
-
- private def cobolSetCopybook(cobolOptions: Option[CobolOptions], newCopybook: String): Option[CobolOptions] = {
- cobolOptions match {
- case Some(a) => Some(a.copy(copybook = newCopybook))
- case None => Some(CobolOptions(newCopybook))
- }
- }
-
- private def cobolSetIsText(cobolOptions: Option[CobolOptions], newIsText: Boolean): Option[CobolOptions] = {
- cobolOptions match {
- case Some(a) => Some(a.copy(isText = newIsText))
- case None => Some(CobolOptions(isText = newIsText))
- }
- }
-
- private def cobolSetIsXcom(cobolOptions: Option[CobolOptions], newIsXCom: Boolean): Option[CobolOptions] = {
- cobolOptions match {
- case Some(a) => Some(a.copy(isXcom = newIsXCom))
- case None => Some(CobolOptions(isXcom = newIsXCom))
- }
- }
-
- private def cobolSetEncoding(cobolOptions: Option[CobolOptions], newEncoding: String): Option[CobolOptions] = {
- cobolOptions match {
- case Some(a) => Some(a.copy(encoding = Option(newEncoding)))
- case None => Some(CobolOptions(encoding = Option(newEncoding)))
- }
- }
-
- private def cobolSetTrimmingPolicy(cobolOptions: Option[CobolOptions], newTrimmingPolicy: String): Option[CobolOptions] = {
- cobolOptions match {
- case Some(a) => Some(a.copy(trimmingPolicy = Option(newTrimmingPolicy)))
- case None => Some(CobolOptions(trimmingPolicy = Option(newTrimmingPolicy)))
- }
- }
- }
-
-}
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/config/StandardizationConfig.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/config/StandardizationConfig.scala
new file mode 100644
index 000000000..9bb4bc0a3
--- /dev/null
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/config/StandardizationConfig.scala
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.standardization.config
+
+import org.apache.spark.storage.StorageLevel
+import scopt.OParser
+import za.co.absa.enceladus.common.config.{ConfigError, JobConfigParser}
+import za.co.absa.enceladus.dao.auth.{InvalidMenasCredentialsFactory, MenasCredentialsFactory}
+import za.co.absa.enceladus.standardization.CobolOptions
+
+import scala.util.Try
+
+/**
+ * Configuration of the Standardization job, provided via command line parameters.
+ *
+ * Note: scopt requires all fields to have default values.
+ * Even if a field is mandatory, it still needs a default value here.
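+ *
+ * For example, `rawFormat` is declared `required()` in [[StandardizationParser]] yet still
+ * carries the default value "xml" here to satisfy scopt.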
+ */
+case class StandardizationConfig(rawFormat: String = "xml",
+ charset: Option[String] = None,
+ rowTag: Option[String] = None,
+ csvDelimiter: Option[String] = None,
+ csvHeader: Option[Boolean] = Some(false),
+ csvQuote: Option[String] = None,
+ csvEscape: Option[String] = None,
+ cobolOptions: Option[CobolOptions] = None,
+ fixedWidthTrimValues: Option[Boolean] = Some(false),
+ rawPathOverride: Option[String] = None,
+ failOnInputNotPerSchema: Boolean = false,
+ datasetName: String = "",
+ datasetVersion: Int = 1,
+ reportDate: String = "",
+ reportVersion: Option[Int] = None,
+ menasCredentialsFactory: MenasCredentialsFactory = InvalidMenasCredentialsFactory,
+ performanceMetricsFile: Option[String] = None,
+ folderPrefix: Option[String] = None,
+ persistStorageLevel: Option[StorageLevel] = None,
+ credsFile: Option[String] = None,
+ keytabFile: Option[String] = None
+ )
+  extends StandardizationParser[StandardizationConfig] {
+ override def withRawFormat(value: String): StandardizationConfig = copy(rawFormat = value)
+ override def withCharset(value: Option[String]): StandardizationConfig = copy(charset = value)
+ override def withRowTag(value: Option[String]): StandardizationConfig = copy(rowTag = value)
+ override def withCsvDelimiter(value: Option[String]): StandardizationConfig = copy(csvDelimiter = value)
+ override def withCsvHeader(value: Option[Boolean]): StandardizationConfig = copy(csvHeader = value)
+ override def withCsvQuote(value: Option[String]): StandardizationConfig = copy(csvQuote = value)
+ override def withCsvEscape(value: Option[String]): StandardizationConfig = copy(csvEscape = value)
+ override def withCobolOptions(value: Option[CobolOptions]): StandardizationConfig = copy(cobolOptions = value)
+ override def withFixedWidthTrimValues(value: Option[Boolean]): StandardizationConfig = copy(fixedWidthTrimValues = value)
+ override def withRawPathOverride(value: Option[String]): StandardizationConfig = copy(rawPathOverride = value)
+ override def withFailOnInputNotPerSchema(value: Boolean): StandardizationConfig = copy(failOnInputNotPerSchema = value)
+
+ override def withDatasetName(value: String): StandardizationConfig = copy(datasetName = value)
+ override def withDatasetVersion(value: Int): StandardizationConfig = copy(datasetVersion = value)
+ override def withReportDate(value: String): StandardizationConfig = copy(reportDate = value)
+ override def withReportVersion(value: Option[Int]): StandardizationConfig = copy(reportVersion = value)
+ override def withCredsFile(value: Option[String], menasCredentialsFactory: MenasCredentialsFactory): StandardizationConfig = {
+ copy(credsFile = value, menasCredentialsFactory = menasCredentialsFactory)
+ }
+ override def withAuthKeytab(value: Option[String], menasCredentialsFactory: MenasCredentialsFactory): StandardizationConfig = {
+ copy(keytabFile = value, menasCredentialsFactory = menasCredentialsFactory)
+ }
+ override def withPerformanceMetricsFile(value: Option[String]): StandardizationConfig = copy(performanceMetricsFile = value)
+ override def withFolderPrefix(value: Option[String]): StandardizationConfig = copy(folderPrefix = value)
+ override def withPersistStorageLevel(value: Option[StorageLevel]): StandardizationConfig = copy(persistStorageLevel = value)
+}
+
+object StandardizationConfig {
+
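+  // Typical usage, mirroring the arguments exercised by the standardization test suites:
+  //   StandardizationConfig.getFromArguments(Array(
+  //     "--dataset-name", "Foo", "--dataset-version", "1",
+  //     "--report-date", "2020-06-22", "--report-version", "1",
+  //     "--menas-auth-keytab", "src/test/resources/user.keytab.example",
+  //     "--raw-format", "csv"))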
+ def tryFromArguments(args: Array[String]): Try[StandardizationConfig] = {
+ import za.co.absa.enceladus.utils.implicits.OptionImplicits._
+ OParser.parse(standardizationJobParser, args, StandardizationConfig()).toTry(ConfigError("Command line parameters error"))
+ }
+
+ def getFromArguments(args: Array[String]): StandardizationConfig = tryFromArguments(args).get
+
+ private val standardizationJobParser: OParser[_, StandardizationConfig] = {
+ val builder = OParser.builder[StandardizationConfig]
+ import builder._
+ OParser.sequence(
+ programName("Standardization Job"),
+ head("Standardization", ""),
+ StandardizationParser.standardizationParser,
+ JobConfigParser.jobConfigParser
+ )
+ }
+}
diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/config/StandardizationParser.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/config/StandardizationParser.scala
new file mode 100644
index 000000000..8b1da3bd8
--- /dev/null
+++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/config/StandardizationParser.scala
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.standardization.config
+
+import scopt.{OParser, OParserBuilder}
+import za.co.absa.enceladus.common.config.JobConfigParser
+import za.co.absa.enceladus.standardization.CobolOptions
+
+trait StandardizationParser[R] extends JobConfigParser[R] {
+ def withRawFormat(value: String): R
+ def withCharset(value: Option[String] = None): R
+ def withRowTag(value: Option[String] = None): R
+ def withCsvDelimiter(value: Option[String] = None): R
+ def withCsvHeader(value: Option[Boolean] = Some(false)): R
+ def withCsvQuote(value: Option[String] = None): R
+ def withCsvEscape(value: Option[String] = None): R
+ def withCobolOptions(value: Option[CobolOptions] = None): R
+ def withFixedWidthTrimValues(value: Option[Boolean] = Some(false)): R
+ def withRawPathOverride(value: Option[String]): R
+ def withFailOnInputNotPerSchema(value: Boolean): R
+
+ def rawFormat: String
+ def charset: Option[String]
+ def rowTag: Option[String]
+ def csvDelimiter: Option[String]
+ def csvHeader: Option[Boolean]
+ def csvQuote: Option[String]
+ def csvEscape: Option[String]
+ def cobolOptions: Option[CobolOptions]
+ def fixedWidthTrimValues: Option[Boolean]
+ def rawPathOverride: Option[String]
+ def failOnInputNotPerSchema: Boolean
+}
+
+object StandardizationParser {
+
+  //scalastyle:off method.length the length is legit for parsing input parameters
+ def standardizationParser[R <: StandardizationParser[R]]: OParser[_, R] = {
+ val builder = OParser.builder[R]
+ import builder._
+ OParser.sequence(
+ head("\nStandardization", ""),
+
+ opt[String]('f', "raw-format").required().action((value, config) => {
+ config.withRawFormat(value.toLowerCase())
+ }).text("format of the raw data (csv, xml, parquet, fixed-width, etc.)"),
+
+ opt[String]("charset").optional().action((value, config) =>
+ config.withCharset(Some(value))).text("use the specific charset (default is UTF-8)"),
+
+ opt[String]("row-tag").optional().action((value, config) =>
+ config.withRowTag(Some(value))).text("use the specific row tag instead of 'ROW' for XML format"),
+
+ opt[String]("delimiter").optional().action((value, config) =>
+ config.withCsvDelimiter(Some(value))).text("use the specific delimiter instead of ',' for CSV format"),
+
+ opt[String]("csv-quote").optional().action((value, config) =>
+ config.withCsvQuote(Some(value)))
+ .text("use the specific quote character for creating CSV fields that may contain delimiter character(s) (default is '\"')"),
+
+ opt[String]("csv-escape").optional().action((value, config) =>
+ config.withCsvEscape(Some(value)))
+ .text("use the specific escape character for CSV fields (default is '\\')"),
+
+      // no validation needed for booleans since scopt itself handles it
+ opt[Boolean]("header").optional().action((value, config) =>
+ config.withCsvHeader(Some(value))).text("use the header option to consider CSV header"),
+
+ opt[Boolean]("trimValues").optional().action((value, config) =>
+ config.withFixedWidthTrimValues(Some(value))).text("use --trimValues option to trim values in fixed width file"),
+
+ opt[Boolean]("strict-schema-check").optional().action((value, config) =>
+ config.withFailOnInputNotPerSchema(value))
+ .text("use --strict-schema-check option to fail or proceed over rows not adhering to the schema (with error in errCol)"),
+
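+      // The COBOL-specific options below fold into a single CobolOptions instance:
+      // whichever option is parsed first creates it, the rest update it via copy().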
+ opt[String]("copybook").optional().action((value, config) => {
+ val newOptions = config.cobolOptions match {
+ case Some(a) => Some(a.copy(copybook = value))
+ case None => Some(CobolOptions(value))
+ }
+ config.withCobolOptions(newOptions)
+
+ }).text("Path to a copybook for COBOL data format"),
+
+ opt[Boolean]("is-xcom").optional().action((value, config) => {
+ val newOptions = config.cobolOptions match {
+ case Some(a) => Some(a.copy(isXcom = value))
+ case None => Some(CobolOptions(isXcom = value))
+ }
+ config.withCobolOptions(newOptions)
+ }).text("Does a mainframe file in COBOL format contain XCOM record headers"),
+
+ opt[Boolean]("cobol-is-text").optional().action((value, config) => {
+ val newOptions = config.cobolOptions match {
+ case Some(a) => Some(a.copy(isText = value))
+ case None => Some(CobolOptions(isText = value))
+ }
+ config.withCobolOptions(newOptions)
+ }).text("Specifies if the mainframe file is ASCII text file"),
+
+ opt[String]("cobol-encoding").optional().action((value, config) => {
+ val newOptions = config.cobolOptions match {
+ case Some(a) => Some(a.copy(encoding = Option(value)))
+ case None => Some(CobolOptions(encoding = Option(value)))
+ }
+ config.withCobolOptions(newOptions)
+ }).text("Specify encoding of mainframe files (ascii or ebcdic)"),
+
+ opt[String]("cobol-trimming-policy").optional().action((value, config) => {
+ val newOptions = config.cobolOptions match {
+ case Some(a) => Some(a.copy(trimmingPolicy = Option(value)))
+ case None => Some(CobolOptions(trimmingPolicy = Option(value)))
+ }
+ config.withCobolOptions(newOptions)
+ }).text("Specify string trimming policy for mainframe files (none, left, right, both)"),
+
+ opt[String]("debug-set-raw-path").optional().hidden().action((value, config) =>
+ config.withRawPathOverride(Some(value)))
+ .text("override the path of the raw data (used internally for performance tests)"),
+
+ checkConfig(checkConfigX(_, builder))
+ )
+ }
+ //scalastyle:on method.length
+
+ private val formatsSupportingCharset = List("xml", "csv", "json", "cobol")
+
+ private def typicalError(field: String, format: String): String = {
+ s"The $field option is supported only for $format format"
+ }
+
+ private def checkCharset[R <: StandardizationParser[R]](config: R): List[String] = {
+ if (!formatsSupportingCharset.contains(config.rawFormat) && config.charset.isDefined) {
+ List(typicalError("--charset", "CSV, JSON, XML and COBOL"))
+ } else {
+ List.empty
+ }
+ }
+
+ private def checkXMLFields[R <: StandardizationParser[R]](config: R): List[String] = {
+ if (config.rowTag.isDefined && config.rawFormat != "xml") {
+ List(typicalError("--row-tag", "XML raw data"))
+ } else {
+ List.empty
+ }
+ }
+
+ private def checkCSVFields[R <: StandardizationParser[R]](config: R): List[String] = {
+ def csvFieldsThatShouldNotBePresent(config: R): List[String] = {
+ val format = "CSV"
+ val definedFields = Map(
+ typicalError("--delimiter", format) -> config.csvDelimiter.isDefined,
+ typicalError("--escape", format) -> config.csvEscape.isDefined,
+ typicalError("--header", s"$format raw data") -> config.csvHeader.contains(true),
+ typicalError("--quote", format) -> config.csvQuote.isDefined
+ )
+ definedFields.filter { case (_, value) => value }.keys.toList
+ }
+
+ if (config.rawFormat == "csv") {
+ List.empty
+ } else {
+ csvFieldsThatShouldNotBePresent(config)
+ }
+ }
+
+ private def checkCobolFields[R <: StandardizationParser[R]](config: R): Seq[String] = {
+ def cobolFieldsThatShouldNotBePresent(cobolOptions: CobolOptions): List[String] = {
+ val format = "COBOL"
+ val definedFields = Map(
+ typicalError("--copybook", format) -> (cobolOptions.copybook != ""),
+ typicalError("--cobol-encoding", format) -> cobolOptions.encoding.isDefined,
+ typicalError("--is-xcom", format) -> cobolOptions.isXcom,
+ typicalError("--is-text", format) -> cobolOptions.isText
+ )
+ definedFields.filter { case (_, value) => value }.keys.toList
+ }
+
+ if (config.rawFormat == "cobol") {
+ List.empty
+ } else {
+ config.cobolOptions
+ .map(cobolFieldsThatShouldNotBePresent)
+ .getOrElse(List.empty)
+ }
+ }
+
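+  // Unlike the per-option .validate() checks of the previous command line parser, format/option
+  // mismatches are collected in one pass and reported together through scopt's checkConfig.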
+ private def checkConfigX[R <: StandardizationParser[R]](config: R, builder: OParserBuilder[R]): Either[String, Unit] = {
+    val allErrors: List[String] = checkCharset(config) ++
+ checkXMLFields(config) ++
+ checkCSVFields(config) ++
+ checkCobolFields(config)
+
+ if (allErrors.isEmpty) {
+ builder.success
+ } else {
+ builder.failure(allErrors.mkString("\n"))
+ }
+ }
+
+}
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/ConfConfigSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/config/ConformanceParserSuite.scala
similarity index 82%
rename from spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/ConfConfigSuite.scala
rename to spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/config/ConformanceParserSuite.scala
index 3f13b2b04..76d2eb664 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/ConfConfigSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/config/ConformanceParserSuite.scala
@@ -13,16 +13,17 @@
* limitations under the License.
*/
-package za.co.absa.enceladus.conformance
+package za.co.absa.enceladus.conformance.config
import java.time.ZonedDateTime
import org.scalatest.FunSuite
+import za.co.absa.enceladus.conformance.ConformanceExecution
import za.co.absa.enceladus.dao.auth.{MenasKerberosCredentials, MenasPlainCredentials}
import za.co.absa.enceladus.model.Dataset
import za.co.absa.enceladus.utils.testUtils.SparkTestBase
-class ConfConfigSuite extends FunSuite with SparkTestBase {
+class ConformanceParserSuite extends FunSuite with SparkTestBase {
private val year = "2018"
private val month = "12"
@@ -49,11 +50,12 @@ class ConfConfigSuite extends FunSuite with SparkTestBase {
private val disabled = false
private val dateDisabled = None
private val userDisabled = None
- private val rawFormat = "parquet"
private val folderPrefix = s"year=$year/month=$month/day=$day"
private val infoDateColumn = "enceladus_info_date"
private val infoVersionColumn = "enceladus_info_version"
+ private object TestDynamicConformance extends ConformanceExecution
+
test("Test credentials file parsing "){
val credentials = MenasPlainCredentials.fromFile(menasCredentialsFile)
@@ -68,7 +70,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase {
}
test("folder-prefix parameter") {
- val cmdConfigNoFolderPrefix = ConfCmdConfig.getCmdLineArguments(
+ val cmdConfigNoFolderPrefix = ConformanceConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -86,7 +88,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase {
assert(cmdConfigNoFolderPrefix.publishPathOverride.isEmpty)
assert(actualPlainMenasCredentials === menasCredentials)
- val cmdConfigFolderPrefix = ConfCmdConfig.getCmdLineArguments(
+ val cmdConfigFolderPrefix = ConformanceConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -106,7 +108,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase {
assert(cmdConfigFolderPrefix.publishPathOverride.isEmpty)
assert(actualMenasKerberosCredentials === menasKeytab)
- val cmdConfigPublishPathOverrideAndFolderPrefix = ConfCmdConfig.getCmdLineArguments(
+ val cmdConfigPublishPathOverrideAndFolderPrefix = ConformanceConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -144,7 +146,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase {
userDisabled,
List()
)
- val cmdConfigNoFolderPrefix = ConfCmdConfig.getCmdLineArguments(
+ val cmdConfigNoFolderPrefix = ConformanceConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -152,7 +154,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase {
"--report-version", reportVersion.toString,
"--menas-credentials-file", menasCredentialsFile
))
- val cmdConfigFolderPrefix = ConfCmdConfig.getCmdLineArguments(
+ val cmdConfigFolderPrefix = ConformanceConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -160,7 +162,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase {
"--report-version", reportVersion.toString,
"--menas-credentials-file", menasCredentialsFile,
"--folder-prefix", folderPrefix))
- val cmdConfigPublishPathOverride = ConfCmdConfig.getCmdLineArguments(
+ val cmdConfigPublishPathOverride = ConformanceConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -168,7 +170,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase {
"--report-version", reportVersion.toString,
"--menas-credentials-file", menasCredentialsFile,
"--debug-set-publish-path", hdfsPublishPathOverride))
- val cmdConfigPublishPathOverrideAndFolderPrefix = ConfCmdConfig.getCmdLineArguments(
+ val cmdConfigPublishPathOverrideAndFolderPrefix = ConformanceConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -177,17 +179,18 @@ class ConfConfigSuite extends FunSuite with SparkTestBase {
"--folder-prefix", folderPrefix,
"--menas-credentials-file", menasCredentialsFile,
"--debug-set-publish-path", hdfsPublishPathOverride))
- val publishPathNoFolderPrefix = DynamicConformanceJob.buildPublishPath(infoDateColumn, infoVersionColumn,
- cmdConfigNoFolderPrefix, conformanceDataset, cmdConfigNoFolderPrefix.reportVersion.get)
+ val publishPathNoFolderPrefix = TestDynamicConformance.buildPublishPath(cmdConfigNoFolderPrefix,
+ conformanceDataset, cmdConfigNoFolderPrefix.reportVersion.get)
assert(publishPathNoFolderPrefix === s"$hdfsPublishPath/$infoDateColumn=$reportDate/$infoVersionColumn=$reportVersion")
- val publishPathFolderPrefix = DynamicConformanceJob.buildPublishPath(infoDateColumn, infoVersionColumn,
- cmdConfigFolderPrefix, conformanceDataset, cmdConfigFolderPrefix.reportVersion.get)
+ val publishPathFolderPrefix = TestDynamicConformance.buildPublishPath(cmdConfigFolderPrefix,
+ conformanceDataset, cmdConfigFolderPrefix.reportVersion.get)
assert(publishPathFolderPrefix === s"$hdfsPublishPath/$folderPrefix/$infoDateColumn=$reportDate/$infoVersionColumn=$reportVersion")
- val publishPathPublishPathOverride = DynamicConformanceJob.buildPublishPath(infoDateColumn, infoVersionColumn,
- cmdConfigPublishPathOverride, conformanceDataset, cmdConfigPublishPathOverride.reportVersion.get)
+ val publishPathPublishPathOverride = TestDynamicConformance.buildPublishPath(cmdConfigPublishPathOverride, conformanceDataset, cmdConfigPublishPathOverride.reportVersion.get)
assert(publishPathPublishPathOverride === hdfsPublishPathOverride)
- val publishPathPublishPathOverrideAndFolderPrefix = DynamicConformanceJob.buildPublishPath(infoDateColumn, infoVersionColumn,
- cmdConfigPublishPathOverrideAndFolderPrefix, conformanceDataset, cmdConfigPublishPathOverrideAndFolderPrefix.reportVersion.get)
+
+ val publishPathPublishPathOverrideAndFolderPrefix =
+ TestDynamicConformance.buildPublishPath(cmdConfigPublishPathOverrideAndFolderPrefix,
+ conformanceDataset, cmdConfigPublishPathOverrideAndFolderPrefix.reportVersion.get)
assert(publishPathPublishPathOverrideAndFolderPrefix === hdfsPublishPathOverride)
}
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ArrayConformanceSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ArrayConformanceSuite.scala
index cb3155a71..dcb6e46a6 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ArrayConformanceSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ArrayConformanceSuite.scala
@@ -18,7 +18,7 @@ package za.co.absa.enceladus.conformance.interpreter
import org.apache.spark.sql.functions._
import org.mockito.Mockito.{mock, when => mockWhen}
import org.scalatest.{BeforeAndAfterAll, FunSuite}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.datasource.DataSource
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.conformance.samples._
@@ -30,7 +30,7 @@ class ArrayConformanceSuite extends FunSuite with SparkTestBase with BeforeAndAf
// spark.enableControlFrameworkTracking()
implicit var dao: MenasDAO = _
- implicit var progArgs: ConfCmdConfig = _
+ implicit var progArgs: ConformanceConfig = _
private val enableCF = false
private val isCatalystWorkaroundEnabled = true
@@ -40,7 +40,7 @@ class ArrayConformanceSuite extends FunSuite with SparkTestBase with BeforeAndAf
val mapDF = spark.createDataFrame(MappingsSamples.mapping)
dao = mock(classOf[MenasDAO])
- progArgs = new ConfCmdConfig(reportDate = "2017-11-01")
+ progArgs = new ConformanceConfig(reportDate = "2017-11-01")
mockWhen(dao.getMappingTable("mapping", 0)) thenReturn MappingsSamples.mappingTable
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ChorusMockSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ChorusMockSuite.scala
index be75f9317..2d4873474 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ChorusMockSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ChorusMockSuite.scala
@@ -17,7 +17,7 @@ package za.co.absa.enceladus.conformance.interpreter
import org.mockito.Mockito.{mock, when => mockWhen}
import org.scalatest.FunSuite
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.datasource.DataSource
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.MappingConformanceRule
@@ -43,7 +43,7 @@ class ChorusMockSuite extends FunSuite with SparkTestBase with LoggerTestBase {
val inputDf = spark.createDataFrame(d)
val mappingDf = spark.createDataFrame(mapping)
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2018-03-23") // here we may need to specify some parameters (for certain rules)
+ implicit val progArgs: ConformanceConfig = ConformanceConfig(reportDate = "2018-03-23") // here we may need to specify some parameters (for certain rules)
implicit val dao: MenasDAO = mock(classOf[MenasDAO]) // you may have to hard-code your own implementation here (if not working with menas)
val enableCF = false
val isCatalystWorkaroundEnabled = true
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterSuite.scala
index f4422d44d..cd8e28e50 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterSuite.scala
@@ -20,7 +20,7 @@ import org.json4s.native.JsonParser._
import org.mockito.Mockito.{mock, when => mockWhen}
import org.scalatest.{BeforeAndAfterAll, FunSuite}
import za.co.absa.atum.model.ControlMeasure
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.datasource.DataSource
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.conformance.samples._
@@ -49,7 +49,8 @@ class InterpreterSuite extends FunSuite with SparkTestBase with BeforeAndAfterAl
spark.sessionState.conf.setConfString("co.za.absa.enceladus.confTest", "hello :)")
implicit val dao: MenasDAO = mock(classOf[MenasDAO])
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2017-11-01", experimentalMappingRule = Option(useExperimentalMappingRule))
+ implicit val progArgs: ConformanceConfig = ConformanceConfig(
+      experimentalMappingRule = Option(useExperimentalMappingRule), reportDate = "2017-11-01")
val enableCF = true
val isCatalystWorkaroundEnabled = true
@@ -104,7 +105,9 @@ class InterpreterSuite extends FunSuite with SparkTestBase with BeforeAndAfterAl
spark.enableControlMeasuresTracking("src/test/testData/_tradeData/2017/11/01/_INFO", "src/test/testData/_tradeOutput/_INFO")
implicit val dao: MenasDAO = mock(classOf[MenasDAO])
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2017-11-01", experimentalMappingRule = Option(useExperimentalMappingRule))
+ implicit val progArgs: ConformanceConfig = ConformanceConfig(
+ experimentalMappingRule = Option(useExperimentalMappingRule),
+ reportDate = "2017-11-01")
val enableCF = true
val isCatalystWorkaroundEnabled = true
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/LiteralJoinMappingRuleTest.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/LiteralJoinMappingRuleTest.scala
index 346613159..fa78f5a87 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/LiteralJoinMappingRuleTest.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/LiteralJoinMappingRuleTest.scala
@@ -17,7 +17,7 @@ package za.co.absa.enceladus.conformance.interpreter
import org.mockito.Mockito.{mock, when => mockWhen}
import org.scalatest.FunSuite
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.datasource.DataSource
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.{DropConformanceRule, LiteralConformanceRule, MappingConformanceRule}
@@ -33,7 +33,7 @@ class LiteralJoinMappingRuleTest extends FunSuite with SparkTestBase with Logger
val inputDf = spark.read.option("header", "true").csv("src/test/resources/interpreter/literalJoin/data")
val mappingDf = spark.read.option("header", "true").csv("src/test/resources/interpreter/literalJoin/mapping")
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2018-03-23")
+ implicit val progArgs: ConformanceConfig = ConformanceConfig(reportDate = "2018-03-23")
implicit val dao: MenasDAO = mock(classOf[MenasDAO])
val enableCF = false
val isCatalystWorkaroundEnabled = true
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/NestedStructsFixture.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/NestedStructsFixture.scala
index 28f73a669..731a1ffab 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/NestedStructsFixture.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/NestedStructsFixture.scala
@@ -22,7 +22,7 @@ import org.apache.spark.sql.{DataFrame, SaveMode}
import org.mockito.Mockito.{mock, when => mockWhen}
import org.scalatest.{BeforeAndAfterAll, Suite}
import org.slf4j.{Logger, LoggerFactory}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.datasource.DataSource
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.Dataset
@@ -38,7 +38,7 @@ trait NestedStructsFixture extends BeforeAndAfterAll with SparkTestBase {
protected var standardizedDf: DataFrame = _
implicit protected val dao: MenasDAO = mock(classOf[MenasDAO])
- implicit protected val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2017-11-01")
+ implicit protected val progArgs: ConformanceConfig = ConformanceConfig(reportDate = "2017-11-01")
protected val upperRule1 = UppercaseConformanceRule(order = 1, inputColumn = "strings.with_new_lines",
controlCheckpoint = false, outputColumn = "strings.with_new_lines_upper")
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/StreamingFixture.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/StreamingFixture.scala
index d979b2fce..91f241c88 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/StreamingFixture.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/StreamingFixture.scala
@@ -15,22 +15,22 @@
package za.co.absa.enceladus.conformance.interpreter.fixtures
-import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.execution.streaming.MemoryStream
-import org.apache.spark.sql.streaming.Trigger
+import org.apache.spark.sql.{DataFrame, Row}
import org.scalatest.FunSuite
import org.scalatest.mockito.MockitoSugar
-import za.co.absa.enceladus.conformance.{ConfCmdConfig, HyperConformance}
import za.co.absa.enceladus.conformance.interpreter.FeatureSwitches
-import za.co.absa.enceladus.conformance.streaming.{InfoDateFactory, InfoDateLiteralFactory}
+import za.co.absa.enceladus.conformance.streaming.InfoDateFactory
+import za.co.absa.enceladus.conformance.HyperConformance
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.Dataset
import za.co.absa.enceladus.utils.testUtils.SparkTestBase
trait StreamingFixture extends FunSuite with SparkTestBase with MockitoSugar {
implicit val menasBaseUrls: List[String] = List.empty
- implicit val cmd: ConfCmdConfig = ConfCmdConfig.apply(reportVersion = Some(1))
+ implicit val cmd: ConformanceConfig = ConformanceConfig(reportVersion = Some(1))
protected def testHyperConformance(input: DataFrame,
sinkTableName: String,
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleSuite.scala
index 9e24009ee..dbc56aba3 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleSuite.scala
@@ -19,7 +19,7 @@ import org.apache.spark.sql.types._
import org.mockito.Mockito.{mock, when => mockWhen}
import org.scalatest.FunSuite
import org.slf4j.event.Level.ERROR
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches, RuleValidators}
import za.co.absa.enceladus.conformance.samples.CastingRuleSamples
import za.co.absa.enceladus.dao.MenasDAO
@@ -36,7 +36,7 @@ class CastingRuleSuite extends FunSuite with SparkTestBase with LoggerTestBase {
val inputDf = spark.read.schema(CastingRuleSamples.ordersSchema).json(CastingRuleSamples.ordersData.toDS)
implicit val dao: MenasDAO = mock(classOf[MenasDAO])
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2017-11-01")
+ implicit val progArgs: ConformanceConfig = ConformanceConfig(reportDate = "2017-11-01")
val experimentalMR = true
val isCatalystWorkaroundEnabled = true
val enableCF: Boolean = false
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleSuite.scala
index 4662e2296..515738963 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleSuite.scala
@@ -19,7 +19,7 @@ import org.apache.spark.sql.Dataset
import org.mockito.Mockito.{mock, when => mockWhen}
import org.scalatest.FunSuite
import org.slf4j.event.Level.ERROR
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches}
import za.co.absa.enceladus.conformance.samples.NegationRuleSamples
import za.co.absa.enceladus.dao.MenasDAO
@@ -110,7 +110,7 @@ class NegationRuleSuite extends FunSuite with SparkTestBase with LoggerTestBase{
val inputDf = spark.read.schema(schema).json(inputDataset)
implicit val dao: MenasDAO = mock(classOf[MenasDAO])
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2017-11-01")
+ implicit val progArgs: ConformanceConfig = ConformanceConfig(reportDate = "2017-11-01")
val experimentalMR = true
val isCatalystWorkaroundEnabled = true
val enableCF: Boolean = false
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/RulesSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/RulesSuite.scala
index fdfb5a208..725cd6172 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/RulesSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/RulesSuite.scala
@@ -19,7 +19,7 @@ import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.scalatest.FunSuite
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.ExplosionState
import za.co.absa.enceladus.conformance.samples.EmployeeConformance
import za.co.absa.enceladus.dao.MenasDAO
@@ -32,7 +32,7 @@ class RulesSuite extends FunSuite with SparkTestBase {
private val dummyInterpreter = new RuleInterpreter {
override def conformanceRule: Option[ConformanceRule] = None
def conform(df: Dataset[Row])
- (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = df
+ (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = df
}
test("Test country code join condition") {
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/TestRuleBehaviors.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/TestRuleBehaviors.scala
index c85de793f..4fd1bdb07 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/TestRuleBehaviors.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/TestRuleBehaviors.scala
@@ -19,7 +19,7 @@ import org.apache.spark.sql.DataFrame
import org.mockito.Mockito.{mock, when => mockWhen}
import org.scalatest.FunSuite
import org.slf4j.event.Level._
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.Dataset
@@ -30,7 +30,7 @@ trait TestRuleBehaviors extends FunSuite with SparkTestBase with LoggerTestBase
def conformanceRuleShouldMatchExpected(inputDf: DataFrame, inputDataset: Dataset, expectedJSON: String) {
implicit val dao: MenasDAO = mock(classOf[MenasDAO])
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2017-11-01")
+ implicit val progArgs: ConformanceConfig = ConformanceConfig(reportDate = "2017-11-01")
val experimentalMR = true
val isCatalystWorkaroundEnabled = true
val enableCF: Boolean = false
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/custom/CustomRuleSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/custom/CustomRuleSuite.scala
index 3dbde2dca..1121245ef 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/custom/CustomRuleSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/custom/CustomRuleSuite.scala
@@ -19,7 +19,7 @@ import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.mockito.Mockito.mock
import org.scalatest.FunSuite
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.rules.RuleInterpreter
import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, ExplosionState, FeatureSwitches}
import za.co.absa.enceladus.dao.MenasDAO
@@ -41,7 +41,7 @@ case class MyCustomRule(
case class MyCustomRuleInterpreter(rule: MyCustomRule) extends RuleInterpreter {
override def conformanceRule: Option[ConformanceRule] = Some(rule)
- def conform(df: Dataset[Row])(implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = {
+ def conform(df: Dataset[Row])(implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = {
import spark.implicits._
// we have to do this if this rule is to support arrays
handleArrays(rule.outputColumn, df) { flattened =>
@@ -60,7 +60,7 @@ class CustomRuleSuite extends FunSuite with SparkTestBase {
// we may WANT to enable control framework & spline here
- implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules)
+ implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules)
implicit val dao: MenasDAO = mock(classOf[MenasDAO]) // you may have to hard-code your own implementation here (if not working with menas)
val experimentalMR = true
val isCatalystWorkaroundEnabled = true
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/NestedTestCaseFactory.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/NestedTestCaseFactory.scala
index ee264ed31..fbf547f10 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/NestedTestCaseFactory.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/NestedTestCaseFactory.scala
@@ -19,7 +19,7 @@ import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, types}
import org.mockito.Mockito.{mock, when => mockWhen}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{Always, FeatureSwitches, Never}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, MappingConformanceRule}
@@ -228,14 +228,14 @@ class NestedTestCaseFactory(implicit spark: SparkSession) {
*/
def getTestCase(experimentalMappingRule: Boolean,
enableMappingRuleBroadcasting: Boolean,
- conformanceRules: ConformanceRule*): (DataFrame, Dataset, MenasDAO, ConfCmdConfig, FeatureSwitches) = {
+ conformanceRules: ConformanceRule*): (DataFrame, Dataset, MenasDAO, ConformanceConfig, FeatureSwitches) = {
val inputDf = spark.read
.schema(testCaseSchema)
.json(getClass.getResource("/interpreter/mappingCases/nestedDf.json").getPath)
val dataset = getDataSetWithConformanceRules(testCaseDataset, conformanceRules: _*)
- val cmdConfig = ConfCmdConfig(reportDate = reportDate)
+ val cmdConfig = ConformanceConfig(reportDate = reportDate)
val dao = mock(classOf[MenasDAO])
mockWhen(dao.getDataset(testCaseName, 1)) thenReturn testCaseDataset
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/SimpleTestCaseFactory.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/SimpleTestCaseFactory.scala
index 1eae015de..84f508c1b 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/SimpleTestCaseFactory.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/SimpleTestCaseFactory.scala
@@ -19,7 +19,7 @@ import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.mockito.Mockito.{mock, when => mockWhen}
-import za.co.absa.enceladus.conformance.ConfCmdConfig
+import za.co.absa.enceladus.conformance.config.ConformanceConfig
import za.co.absa.enceladus.conformance.interpreter.{Always, FeatureSwitches, Never}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, MappingConformanceRule}
@@ -138,10 +138,10 @@ class SimpleTestCaseFactory(implicit spark: SparkSession) {
*/
def getTestCase(experimentalMappingRule: Boolean,
enableMappingRuleBroadcasting: Boolean,
- conformanceRules: ConformanceRule*): (DataFrame, Dataset, MenasDAO, ConfCmdConfig, FeatureSwitches) = {
+ conformanceRules: ConformanceRule*): (DataFrame, Dataset, MenasDAO, ConformanceConfig, FeatureSwitches) = {
val inputDf = spark.read.schema(testCaseSchema).json(testCaseDataJson.toDS)
val dataset = getDataSetWithConformanceRules(testCaseDataset, conformanceRules: _*)
- val cmdConfig = ConfCmdConfig(reportDate = reportDate)
+ val cmdConfig = ConformanceConfig(reportDate = reportDate)
val dao = mock(classOf[MenasDAO])
mockWhen(dao.getDataset(testCaseName, 1)) thenReturn testCaseDataset
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolAsciiSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolAsciiSuite.scala
index c114f2a49..2081ac3ef 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolAsciiSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolAsciiSuite.scala
@@ -23,6 +23,7 @@ import org.scalatest.mockito.MockitoSugar
import org.scalatest.{Outcome, fixture}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.Dataset
+import za.co.absa.enceladus.standardization.config.StandardizationConfig
import za.co.absa.enceladus.standardization.fixtures.TempFileFixture
import za.co.absa.enceladus.utils.testUtils.SparkTestBase
@@ -32,6 +33,8 @@ class StandardizationCobolAsciiSuite extends fixture.FunSuite with SparkTestBase
private implicit val dao: MenasDAO = mock[MenasDAO]
+ private val standardizationReader = new StandardizationPropertiesProvider()
+
private val tmpFilePrefix = "cobol-fix-ascii-"
private val tmpFileSuffix = ".dat"
@@ -69,8 +72,8 @@ class StandardizationCobolAsciiSuite extends fixture.FunSuite with SparkTestBase
private def getTestDataFrame(tmpFileName: String,
args: Array[String]
): DataFrame = {
- val cmd: StdCmdConfig = StdCmdConfig.getCmdLineArguments(argumentsBase ++ args)
- val cobolReader = StandardizationJob.getFormatSpecificReader(cmd, dataSet, schema.fields.length)
+ val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(argumentsBase ++ args)
+ val cobolReader = standardizationReader.getFormatSpecificReader(cmd, dataSet, schema.fields.length)
cobolReader
.option("copybook_contents", copybook)
.load(tmpFileName)
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolEbcdicSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolEbcdicSuite.scala
index 46e0b6fd2..59a3ec86c 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolEbcdicSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolEbcdicSuite.scala
@@ -21,6 +21,7 @@ import org.scalatest.mockito.MockitoSugar
import org.scalatest.{Outcome, fixture}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.Dataset
+import za.co.absa.enceladus.standardization.config.StandardizationConfig
import za.co.absa.enceladus.standardization.fixtures.TempFileFixture
import za.co.absa.enceladus.utils.testUtils.SparkTestBase
@@ -30,6 +31,8 @@ class StandardizationCobolEbcdicSuite extends fixture.FunSuite with SparkTestBas
private implicit val dao: MenasDAO = mock[MenasDAO]
+ private val standardizationReader = new StandardizationPropertiesProvider()
+
private val tmpFilePrefix = "cobol-fix-ebcdic-"
private val tmpFileSuffix = ".dat"
@@ -70,8 +73,8 @@ class StandardizationCobolEbcdicSuite extends fixture.FunSuite with SparkTestBas
private def getTestDataFrame(tmpFileName: String,
args: Array[String]
): DataFrame = {
- val cmd: StdCmdConfig = StdCmdConfig.getCmdLineArguments(argumentsBase ++ args)
- val cobolReader = StandardizationJob.getFormatSpecificReader(cmd, dataSet, schema.fields.length)
+ val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(argumentsBase ++ args)
+ val cobolReader = standardizationReader.getFormatSpecificReader(cmd, dataSet, schema.fields.length)
cobolReader
.option("copybook_contents", copybook)
.load(tmpFileName)
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationFixedWidthSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationFixedWidthSuite.scala
index e77298cb0..44e51f04a 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationFixedWidthSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationFixedWidthSuite.scala
@@ -1,10 +1,27 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package za.co.absa.enceladus.standardization
import org.apache.spark.sql.types.{DataType, StructType}
import org.scalatest.FunSuite
import org.scalatest.mockito.MockitoSugar
+import org.slf4j.{Logger, LoggerFactory}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.Dataset
+import za.co.absa.enceladus.standardization.config.StandardizationConfig
import za.co.absa.enceladus.standardization.interpreter.StandardizationInterpreter
import za.co.absa.enceladus.standardization.interpreter.stages.PlainSchemaGenerator
import za.co.absa.enceladus.utils.fs.FileReader
@@ -14,7 +31,7 @@ import za.co.absa.enceladus.utils.udf.UDFLibrary
class StandardizationFixedWidthSuite extends FunSuite with SparkTestBase with MockitoSugar{
private implicit val udfLibrary:UDFLibrary = new UDFLibrary()
-
+ private val log: Logger = LoggerFactory.getLogger(this.getClass)
private val argsBase = ("--dataset-name Foo --dataset-version 1 --report-date 2020-06-22 --report-version 1 " +
"--menas-auth-keytab src/test/resources/user.keytab.example " +
"--raw-format fixed-width").split(" ")
@@ -28,9 +45,9 @@ class StandardizationFixedWidthSuite extends FunSuite with SparkTestBase with Mo
).asInstanceOf[StructType]
test("Reading data from FixedWidth input") {
- val cmd = StdCmdConfig.getCmdLineArguments(argsBase)
+ val cmd = StandardizationConfig.getFromArguments(argsBase)
- val fixedWidthReader = StandardizationJob.getFormatSpecificReader(cmd, dataSet)
+ val fixedWidthReader = new StandardizationPropertiesProvider().getFormatSpecificReader(cmd, dataSet)
val inputSchema = PlainSchemaGenerator.generateInputSchema(baseSchema)
val reader = fixedWidthReader.schema(inputSchema)
@@ -47,9 +64,9 @@ class StandardizationFixedWidthSuite extends FunSuite with SparkTestBase with Mo
}
test("Reading data from FixedWidth input trimmed") {
- val cmd = StdCmdConfig.getCmdLineArguments(argsBase ++ Array("--trimValues", "true"))
+ val cmd = StandardizationConfig.getFromArguments(argsBase ++ Array("--trimValues", "true"))
- val fixedWidthReader = StandardizationJob.getFormatSpecificReader(cmd, dataSet)
+ val fixedWidthReader = new StandardizationPropertiesProvider().getFormatSpecificReader(cmd, dataSet)
val inputSchema = PlainSchemaGenerator.generateInputSchema(baseSchema)
val reader = fixedWidthReader.schema(inputSchema)
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationJsonSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationJsonSuite.scala
index f5cd49663..278076f22 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationJsonSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationJsonSuite.scala
@@ -18,8 +18,10 @@ package za.co.absa.enceladus.standardization
import org.apache.spark.sql.types.{DataType, StructType}
import org.scalatest.FunSuite
import org.scalatest.mockito.MockitoSugar
+import org.slf4j.Logger
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.Dataset
+import za.co.absa.enceladus.standardization.config.StandardizationConfig
import za.co.absa.enceladus.standardization.interpreter.StandardizationInterpreter
import za.co.absa.enceladus.standardization.interpreter.stages.PlainSchemaGenerator
import za.co.absa.enceladus.utils.fs.FileReader
@@ -30,6 +32,8 @@ import za.co.absa.enceladus.utils.udf.UDFLibrary
class StandardizationJsonSuite extends FunSuite with SparkTestBase with MockitoSugar{
private implicit val udfLibrary:UDFLibrary = new UDFLibrary()
+ private val standardizationReader = new StandardizationPropertiesProvider()
+
test("Reading data from JSON input, also such that don't adhere to desired schema") {
implicit val dao: MenasDAO = mock[MenasDAO]
@@ -39,9 +43,9 @@ class StandardizationJsonSuite extends FunSuite with SparkTestBase with MockitoS
"--raw-format json").split(" ")
val dataSet = Dataset("SpecialChars", 1, None, "", "", "SpecialChars", 1, conformance = Nil)
- val cmd = StdCmdConfig.getCmdLineArguments(args)
+ val cmd = StandardizationConfig.getFromArguments(args)
- val csvReader = StandardizationJob.getFormatSpecificReader(cmd, dataSet)
+ val csvReader = standardizationReader.getFormatSpecificReader(cmd, dataSet)
val baseSchema: StructType = DataType.fromJson(
FileReader.readFileAsString("src/test/resources/data/standardization_json_suite_schema.json")
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationParquetSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationParquetSuite.scala
index cf502913a..0e758922e 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationParquetSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationParquetSuite.scala
@@ -21,9 +21,11 @@ import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._
import org.scalatest.mockito.MockitoSugar
import org.scalatest.{Outcome, fixture}
+import org.slf4j.Logger
import za.co.absa.enceladus.common.RecordIdGeneration.IdType
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.Dataset
+import za.co.absa.enceladus.standardization.config.StandardizationConfig
import za.co.absa.enceladus.standardization.fixtures.TempFileFixture
import za.co.absa.enceladus.standardization.interpreter.StandardizationInterpreter
import za.co.absa.enceladus.standardization.interpreter.stages.TypeParserException
@@ -38,6 +40,7 @@ class StandardizationParquetSuite extends fixture.FunSuite with SparkTestBase wi
import spark.implicits._
import za.co.absa.enceladus.utils.implicits.DataFrameImplicits.DataFrameEnhancements
+ private val standardizationReader = new StandardizationPropertiesProvider()
private implicit val dao: MenasDAO = mock[MenasDAO]
private implicit val udfLibrary:UDFLibrary = new UDFLibrary()
@@ -62,9 +65,9 @@ class StandardizationParquetSuite extends fixture.FunSuite with SparkTestBase wi
/** Creates a dataframe from an input file name path and command line arguments to Standardization */
private def getTestDataFrame(tmpFileName: String,
- args: Array[String]): (StdCmdConfig, DataFrame) = {
- val cmd: StdCmdConfig = StdCmdConfig.getCmdLineArguments(args)
- val csvReader = StandardizationJob.getFormatSpecificReader(cmd, dataSet)
+ args: Array[String]): (StandardizationConfig, DataFrame) = {
+ val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(args)
+ val csvReader = standardizationReader.getFormatSpecificReader(cmd, dataSet)
(cmd, csvReader.load(tmpFileName).orderBy("id"))
}
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationRerunSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationRerunSuite.scala
index a261214e0..a57e50a70 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationRerunSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationRerunSuite.scala
@@ -22,8 +22,10 @@ import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.scalatest.mockito.MockitoSugar
import org.scalatest.{Outcome, fixture}
+import org.slf4j.Logger
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.Dataset
+import za.co.absa.enceladus.standardization.config.StandardizationConfig
import za.co.absa.enceladus.standardization.fixtures.TempFileFixture
import za.co.absa.enceladus.standardization.interpreter.StandardizationInterpreter
import za.co.absa.enceladus.utils.error.ErrorMessage
@@ -38,6 +40,8 @@ class StandardizationRerunSuite extends fixture.FunSuite with SparkTestBase with
private implicit val udfLib: UDFLibrary = new UDFLibrary
private implicit val dao: MenasDAO = mock[MenasDAO]
+ private val standardizationReader = new StandardizationPropertiesProvider()
+
private val tmpDirPrefix = "StdRerunTest"
private val tmpFilePrefix = "test-input-"
private val tmpFileSuffix = ".csv"
@@ -65,8 +69,8 @@ class StandardizationRerunSuite extends fixture.FunSuite with SparkTestBase with
"--menas-auth-keytab src/test/resources/user.keytab.example " +
"--raw-format csv --header false --delimiter |").split(" ")
- val cmd: StdCmdConfig = StdCmdConfig.getCmdLineArguments(args)
- StandardizationJob
+ val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(args)
+ standardizationReader
.getFormatSpecificReader(cmd, dataSet, schemaWithStringType.fields.length)
.schema(schemaWithStringType)
.load(tmpFileName)
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StdConfigSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/config/StandardizationParserSuite.scala
similarity index 82%
rename from spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StdConfigSuite.scala
rename to spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/config/StandardizationParserSuite.scala
index ffa432660..524e0180b 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StdConfigSuite.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/config/StandardizationParserSuite.scala
@@ -13,16 +13,17 @@
* limitations under the License.
*/
-package za.co.absa.enceladus.standardization
+package za.co.absa.enceladus.standardization.config
import java.time.ZonedDateTime
import org.scalatest.FunSuite
import za.co.absa.enceladus.dao.auth.{MenasKerberosCredentials, MenasPlainCredentials}
import za.co.absa.enceladus.model.Dataset
+import za.co.absa.enceladus.standardization.StandardizationExecution
import za.co.absa.enceladus.utils.testUtils.SparkTestBase
-class StdConfigSuite extends FunSuite with SparkTestBase {
+class StandardizationParserSuite extends FunSuite with SparkTestBase {
private val year = "2018"
private val month = "12"
@@ -52,6 +53,8 @@ class StdConfigSuite extends FunSuite with SparkTestBase {
private val rawFormat = "parquet"
private val folderPrefix = s"year=$year/month=$month/day=$day"
+ private object TestStandardization extends StandardizationExecution
+
test("Test credentials file parsing "){
val credentials = MenasPlainCredentials.fromFile(menasCredentialsFile)
@@ -66,7 +69,7 @@ class StdConfigSuite extends FunSuite with SparkTestBase {
}
test("folder-prefix parameter") {
- val cmdConfigNoFolderPrefix = StdCmdConfig.getCmdLineArguments(
+ val cmdConfigNoFolderPrefix = StandardizationConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -86,7 +89,7 @@ class StdConfigSuite extends FunSuite with SparkTestBase {
assert(cmdConfigNoFolderPrefix.rawPathOverride.isEmpty)
assert(actualPlainMenasCredentials === menasCredentials)
- val cmdConfigFolderPrefix = StdCmdConfig.getCmdLineArguments(
+ val cmdConfigFolderPrefix = StandardizationConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -127,7 +130,7 @@ class StdConfigSuite extends FunSuite with SparkTestBase {
userDisabled,
List()
)
- val cmdConfigNoFolderPrefix = StdCmdConfig.getCmdLineArguments(
+ val cmdConfigNoFolderPrefix = StandardizationConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -135,7 +138,7 @@ class StdConfigSuite extends FunSuite with SparkTestBase {
"--report-version", reportVersion.toString,
"--menas-credentials-file", menasCredentialsFile,
"--raw-format", rawFormat))
- val cmdConfigFolderPrefix = StdCmdConfig.getCmdLineArguments(
+ val cmdConfigFolderPrefix = StandardizationConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -144,7 +147,7 @@ class StdConfigSuite extends FunSuite with SparkTestBase {
"--menas-credentials-file", menasCredentialsFile,
"--folder-prefix", folderPrefix,
"--raw-format", rawFormat))
- val cmdConfigRawPathOverride = StdCmdConfig.getCmdLineArguments(
+ val cmdConfigRawPathOverride = StandardizationConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -153,7 +156,7 @@ class StdConfigSuite extends FunSuite with SparkTestBase {
"--menas-credentials-file", menasCredentialsFile,
"--debug-set-raw-path", hdfsRawPathOverride,
"--raw-format", rawFormat))
- val cmdConfigRawPathOverrideAndFolderPrefix = StdCmdConfig.getCmdLineArguments(
+ val cmdConfigRawPathOverrideAndFolderPrefix = StandardizationConfig.getFromArguments(
Array(
"--dataset-name", datasetName,
"--dataset-version", datasetVersion.toString,
@@ -164,18 +167,17 @@ class StdConfigSuite extends FunSuite with SparkTestBase {
"--debug-set-raw-path", hdfsRawPathOverride,
"--raw-format", rawFormat))
-
- val publishPathNoFolderPrefix = StandardizationJob.buildRawPath(cmdConfigNoFolderPrefix, standardiseDataset,
- dateTokens, cmdConfigNoFolderPrefix.reportVersion.get)
+ val publishPathNoFolderPrefix = TestStandardization.buildRawPath(cmdConfigNoFolderPrefix, standardiseDataset,
+ cmdConfigNoFolderPrefix.reportVersion.get)
assert(publishPathNoFolderPrefix === s"${standardiseDataset.hdfsPath}/${dateTokens(0)}/${dateTokens(1)}/${dateTokens(2)}/v${cmdConfigNoFolderPrefix.reportVersion.get}")
- val publishPathFolderPrefix = StandardizationJob.buildRawPath(cmdConfigFolderPrefix, standardiseDataset,
- dateTokens, cmdConfigFolderPrefix.reportVersion.get)
+ val publishPathFolderPrefix = TestStandardization.buildRawPath(cmdConfigFolderPrefix, standardiseDataset,
+ cmdConfigFolderPrefix.reportVersion.get)
assert(publishPathFolderPrefix === s"${standardiseDataset.hdfsPath}/$folderPrefix/${dateTokens(0)}/${dateTokens(1)}/${dateTokens(2)}/v${cmdConfigFolderPrefix.reportVersion.get}")
- val publishPathRawPathOverride = StandardizationJob.buildRawPath(cmdConfigRawPathOverride, standardiseDataset,
- dateTokens, cmdConfigRawPathOverride.reportVersion.get)
+ val publishPathRawPathOverride = TestStandardization.buildRawPath(cmdConfigRawPathOverride, standardiseDataset,
+ cmdConfigRawPathOverride.reportVersion.get)
assert(publishPathRawPathOverride === hdfsRawPathOverride)
- val publishPathRawPathOverrideAndFolderPrefix = StandardizationJob.buildRawPath(cmdConfigRawPathOverrideAndFolderPrefix,
- standardiseDataset, dateTokens, cmdConfigRawPathOverrideAndFolderPrefix.reportVersion.get)
+ val publishPathRawPathOverrideAndFolderPrefix = TestStandardization.buildRawPath(cmdConfigRawPathOverrideAndFolderPrefix,
+ standardiseDataset, cmdConfigRawPathOverrideAndFolderPrefix.reportVersion.get)
assert(publishPathRawPathOverrideAndFolderPrefix === hdfsRawPathOverride)
}
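
A minimal sketch of how the reworked `buildRawPath` is reached after this rename, under stated assumptions: the method now sits on `StandardizationExecution` (hence the test object extending it), the explicit `dateTokens` argument is gone and the date segments are presumably derived from the report date in the parsed configuration, and an implicit `SparkSession` is kept in the signature because the suite mixes in `SparkTestBase`. Object and method names below are illustrative.

```scala
import org.apache.spark.sql.SparkSession
import za.co.absa.enceladus.model.Dataset
import za.co.absa.enceladus.standardization.StandardizationExecution
import za.co.absa.enceladus.standardization.config.StandardizationConfig

// Illustrative wrapper (RawPathSketch is not part of the codebase).
object RawPathSketch {
  // buildRawPath moved from the StandardizationJob companion onto
  // StandardizationExecution, so it is accessed via an object extending the trait.
  private object TestStandardization extends StandardizationExecution

  def rawPathFor(args: Array[String], dataset: Dataset)(implicit spark: SparkSession): String = {
    val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(args)
    // No dateTokens parameter any more; only the config, the dataset and the report version are passed.
    TestStandardization.buildRawPath(cmd, dataset, cmd.reportVersion.get)
  }
}
```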
diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/fixtures/CsvFileFixture.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/fixtures/CsvFileFixture.scala
index e54cb9036..2d59a6dad 100644
--- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/fixtures/CsvFileFixture.scala
+++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/fixtures/CsvFileFixture.scala
@@ -17,16 +17,19 @@ package za.co.absa.enceladus.standardization.fixtures
import java.io.File
import java.nio.charset.{Charset, StandardCharsets}
+
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalatest.mockito.MockitoSugar
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.Dataset
-import za.co.absa.enceladus.standardization.{StandardizationJob, StdCmdConfig}
+import za.co.absa.enceladus.standardization.StandardizationPropertiesProvider
+import za.co.absa.enceladus.standardization.config.StandardizationConfig
import za.co.absa.enceladus.utils.testUtils.SparkTestBase
trait CsvFileFixture extends MockitoSugar with TempFileFixture with SparkTestBase {
private implicit val dao: MenasDAO = mock[MenasDAO]
+ private val standardizationReader = new StandardizationPropertiesProvider()
type FixtureParam = String
private val tmpFilePrefix = "special-characters"
@@ -57,11 +60,11 @@ trait CsvFileFixture extends MockitoSugar with TempFileFixture with SparkTestBas
dataSet: Dataset,
schema: StructType
): DataFrame = {
- val cmd: StdCmdConfig = StdCmdConfig.getCmdLineArguments(args)
+ val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(args)
val csvReader = if (checkMaxColumns) {
- StandardizationJob.getFormatSpecificReader(cmd, dataSet, schema.fields.length)
+ standardizationReader.getFormatSpecificReader(cmd, dataSet, schema.fields.length)
} else {
- StandardizationJob.getFormatSpecificReader(cmd, dataSet)
+ standardizationReader.getFormatSpecificReader(cmd, dataSet)
}
csvReader
.schema(schema)
diff --git a/utils/src/main/scala/za/co/absa/enceladus/utils/modules/SourcePhase.scala b/utils/src/main/scala/za/co/absa/enceladus/utils/modules/SourcePhase.scala
new file mode 100644
index 000000000..643b2a85e
--- /dev/null
+++ b/utils/src/main/scala/za/co/absa/enceladus/utils/modules/SourcePhase.scala
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2018 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.enceladus.utils.modules
+
+/**
+ * Represents the source phase (standardization or conformance) regardless of the Job class
+ */
+sealed trait SourcePhase {
+ val value: String
+
+ def asIdentifier: String = value.toLowerCase
+}
+
+object SourcePhase {
+ def withIdentifier(name: String): SourcePhase = {
+ name match {
+ case "conformance" => SourcePhase.Conformance
+ case "standardization" => SourcePhase.Standardization
+ case _ => throw new NoSuchElementException(s"No value found for '$name'")
+ }
+ }
+
+ case object Standardization extends SourcePhase {
+ val value = "Standardization"
+ }
+
+ case object Conformance extends SourcePhase {
+ val value = "Conformance"
+ }
+}
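
A short usage sketch of the new `SourcePhase` ADT, grounded in the file above; the wrapping object is illustrative.

```scala
import za.co.absa.enceladus.utils.modules.SourcePhase

object SourcePhaseUsage {
  // Round trip between a case object and its lower-cased identifier.
  val phase: SourcePhase = SourcePhase.withIdentifier("standardization") // SourcePhase.Standardization
  val key: String = phase.asIdentifier                                   // "standardization"

  // Unknown identifiers fail fast:
  // SourcePhase.withIdentifier("enrichment") throws NoSuchElementException.
}
```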