diff --git a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample1.scala b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample1.scala index b369932e4..58fe2d223 100644 --- a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample1.scala +++ b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample1.scala @@ -16,7 +16,7 @@ package za.co.absa.enceladus.examples import org.apache.spark.sql.{DataFrame, SparkSession} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.dao.auth.MenasKerberosCredentials @@ -41,7 +41,7 @@ object CustomRuleSample1 { // scalastyle:off magic.number val menasBaseUrls = List("http://localhost:8080/menas") val meansCredentials = MenasKerberosCredentials("user@EXAMPLE.COM", "src/main/resources/user.keytab.example") - implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules) + implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules) implicit val dao: MenasDAO = RestDaoFactory.getInstance(meansCredentials, menasBaseUrls) // you may have to hard-code your own implementation here (if not working with menas) val experimentalMR = true diff --git a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample2.scala b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample2.scala index 6eef8ed86..de79ace35 100644 --- a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample2.scala +++ b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample2.scala @@ -17,7 +17,7 @@ package za.co.absa.enceladus.examples import com.typesafe.config.ConfigFactory import org.apache.spark.sql.{DataFrame, SparkSession} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.dao.auth.MenasKerberosCredentials @@ -43,7 +43,7 @@ object CustomRuleSample2 { val conf = ConfigFactory.load() val menasBaseUrls = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri")) val meansCredentials = MenasKerberosCredentials("user@EXAMPLE.COM", "src/main/resources/user.keytab.example") - implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules) + implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules) implicit val dao: MenasDAO = RestDaoFactory.getInstance(meansCredentials, menasBaseUrls) // you may have to hard-code your own implementation here (if not working with menas) val experimentalMR = true diff --git a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample3.scala b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample3.scala index 45ef541bf..932fa9fac 100644 --- a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample3.scala +++ b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample3.scala @@ -17,7 +17,7 @@ package za.co.absa.enceladus.examples import com.typesafe.config.ConfigFactory import 
org.apache.spark.sql.{DataFrame, SparkSession} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.dao.auth.MenasKerberosCredentials @@ -38,7 +38,7 @@ object CustomRuleSample3 { val conf = ConfigFactory.load() val menasBaseUrls = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri")) val meansCredentials = MenasKerberosCredentials("user@EXAMPLE.COM", "src/main/resources/user.keytab.example") - implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules) + implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules) implicit val dao: MenasDAO = RestDaoFactory.getInstance(meansCredentials, menasBaseUrls) // you may have to hard-code your own implementation here (if not working with menas) val experimentalMR = true diff --git a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample4.scala b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample4.scala index d88607cb8..fcae9619e 100644 --- a/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample4.scala +++ b/examples/src/main/scala/za/co/absa/enceladus/examples/CustomRuleSample4.scala @@ -19,7 +19,7 @@ import com.typesafe.config.ConfigFactory import org.apache.spark.sql.functions.{col, concat, concat_ws, lit} import org.apache.spark.sql.{DataFrame, DataFrameReader, SparkSession} import scopt.OptionParser -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.dao.auth.MenasKerberosCredentials @@ -142,7 +142,7 @@ object CustomRuleSample4 { val conf = ConfigFactory.load() val menasBaseUrls = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri")) val meansCredentials = MenasKerberosCredentials("user@EXAMPLE.COM", "src/main/resources/user.keytab.example") - implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules) + implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules) implicit val dao: MenasDAO = RestDaoFactory.getInstance(meansCredentials, menasBaseUrls) // you may have to hard-code your own implementation here (if not working with menas) val dfReader: DataFrameReader = { diff --git a/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRule.scala b/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRule.scala index 934a1414f..9c4b41a8c 100644 --- a/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRule.scala +++ b/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRule.scala @@ -16,7 +16,7 @@ package za.co.absa.enceladus.examples.interpreter.rules.custom import org.apache.spark.sql.functions._ import org.apache.spark.sql.{Dataset, Row, SparkSession} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import 
za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.ExplosionState import za.co.absa.enceladus.conformance.interpreter.rules.RuleInterpreter import za.co.absa.enceladus.conformance.interpreter.rules.custom.CustomConformanceRule @@ -40,7 +40,7 @@ case class UppercaseCustomRuleInterpreter(rule: UppercaseCustomConformanceRule) override def conformanceRule: Option[ConformanceRule] = Some(rule) def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { handleArrays(rule.outputColumn, df) { flattened => // we have to do this if this rule is to support arrays diff --git a/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRule.scala b/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRule.scala index 7fe746606..c2b061076 100644 --- a/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRule.scala +++ b/examples/src/main/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRule.scala @@ -16,7 +16,7 @@ package za.co.absa.enceladus.examples.interpreter.rules.custom import org.apache.spark.sql.functions._ import org.apache.spark.sql.{Column, Dataset, Row, SparkSession} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.ExplosionState import za.co.absa.enceladus.conformance.interpreter.rules.RuleInterpreter import za.co.absa.enceladus.conformance.interpreter.rules.custom.CustomConformanceRule @@ -40,7 +40,7 @@ case class StringFuncInterpreter(rule: ColumnFunctionCustomConformanceRule) exte override def conformanceRule: Option[ConformanceRule] = Some(rule) def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { handleArrays(rule.outputColumn, df) { flattened => // we have to do this if this rule is to support arrays diff --git a/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRuleSuite.scala b/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRuleSuite.scala index de836828c..fb0202ad2 100644 --- a/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRuleSuite.scala +++ b/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/UppercaseCustomConformanceRuleSuite.scala @@ -19,7 +19,7 @@ import org.apache.spark.sql import org.apache.spark.sql.DataFrame import org.scalatest.FunSuite import org.scalatest.mockito.MockitoSugar -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.Dataset @@ -35,7 +35,7 @@ object TestOutputRow { class UppercaseCustomConformanceRuleSuite extends FunSuite with SparkTestBase with 
MockitoSugar { import spark.implicits._ - implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules) + implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules) implicit val dao: MenasDAO = mock[MenasDAO] // you may have to hard-code your own implementation here (if not working with menas) val experimentalMR = true diff --git a/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRuleSuite.scala b/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRuleSuite.scala index 16c71401e..0716dd4a4 100644 --- a/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRuleSuite.scala +++ b/examples/src/test/scala/za/co/absa/enceladus/examples/interpreter/rules/custom/XPadCustomConformanceRuleSuite.scala @@ -20,7 +20,7 @@ import org.apache.spark.sql import org.apache.spark.sql.DataFrame import org.scalatest.FunSuite import org.scalatest.mockito.MockitoSugar -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.dao.auth.MenasKerberosCredentials @@ -37,7 +37,7 @@ object XPadTestOutputRow { class LpadCustomConformanceRuleSuite extends FunSuite with SparkTestBase with MockitoSugar { import spark.implicits._ - implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules) + implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules) implicit val dao: MenasDAO = mock[MenasDAO] // you may have to hard-code your own implementation here (if not working with menas) val experimentalMR = true @@ -185,7 +185,7 @@ class RpadCustomConformanceRuleSuite extends FunSuite with SparkTestBase { private val conf = ConfigFactory.load() private val menasBaseUrls = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri")) private val meansCredentials = MenasKerberosCredentials("user@EXAMPLE.COM", "src/test/resources/user.keytab.example") - implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules) + implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules) implicit val dao: MenasDAO = RestDaoFactory.getInstance(meansCredentials, menasBaseUrls) // you may have to hard-code your own implementation here (if not working with menas) val experimentalMR = true diff --git a/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginImpl.scala b/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginImpl.scala index d79331e98..449bb5d42 100644 --- a/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginImpl.scala +++ b/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginImpl.scala @@ -18,7 +18,7 @@ package za.co.absa.enceladus.plugins.builtin.errorsender.mq import org.apache.log4j.LogManager import org.apache.spark.sql.functions.{col, explode, lit, size, struct} import 
org.apache.spark.sql.types.DataTypes -import org.apache.spark.sql.{DataFrame, Encoders} +import org.apache.spark.sql.{DataFrame, Encoder, Encoders} import za.co.absa.enceladus.plugins.api.postprocessor.PostProcessor import za.co.absa.enceladus.plugins.builtin.common.mq.kafka.KafkaConnectionParams import za.co.absa.enceladus.plugins.builtin.errorsender.DceError @@ -27,8 +27,8 @@ import za.co.absa.enceladus.utils.schema.SchemaUtils import KafkaErrorSenderPluginImpl._ import za.co.absa.enceladus.plugins.builtin.errorsender.mq.kafka.KafkaErrorSenderPlugin import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams -import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams.ErrorSourceId import za.co.absa.enceladus.utils.error.ErrorMessage.ErrorCodes +import za.co.absa.enceladus.utils.modules._ import scala.util.{Failure, Success, Try} @@ -87,8 +87,8 @@ case class KafkaErrorSenderPluginImpl(connectionParams: KafkaConnectionParams, * @return DF with exploded errors and corresponding to the given error source */ def getIndividualErrors(dataFrame: DataFrame, params: ErrorSenderPluginParams): DataFrame = { - implicit val singleErrorStardardizedEncoder = Encoders.product[SingleErrorStardardized] - implicit val dceErrorEncoder = Encoders.product[DceError] + implicit val singleErrorStardardizedEncoder: Encoder[SingleErrorStardardized] = Encoders.product[SingleErrorStardardized] + implicit val dceErrorEncoder: Encoder[DceError] = Encoders.product[DceError] val allowedErrorCodes = KafkaErrorSenderPluginImpl.errorCodesForSource(params.sourceId) @@ -168,7 +168,7 @@ object KafkaErrorSenderPluginImpl { informationDate = Some(reportDate.toLocalDate.toEpochDay.toInt), outputFileName = Some(additionalParams.outputPath), recordId = recordId, - errorSourceId = additionalParams.sourceId.toString, + errorSourceId = additionalParams.sourceId.value, errorType = singleError.errType, errorCode = singleError.errCode, errorDescription = singleError.errMsg, @@ -184,9 +184,9 @@ object KafkaErrorSenderPluginImpl { } } - def errorCodesForSource(sourceId: ErrorSourceId.Value): Seq[String] = sourceId match { - case ErrorSourceId.Standardization => ErrorCodes.standardizationErrorCodes - case ErrorSourceId.Conformance => ErrorCodes.conformanceErrorCodes + def errorCodesForSource(sourceId: SourcePhase): Seq[String] = sourceId match { + case SourcePhase.Standardization => ErrorCodes.standardizationErrorCodes + case SourcePhase.Conformance => ErrorCodes.conformanceErrorCodes } } diff --git a/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParams.scala b/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParams.scala index 6e9df9309..9c8deb84b 100644 --- a/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParams.scala +++ b/plugins-builtin/src/main/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParams.scala @@ -17,14 +17,14 @@ package za.co.absa.enceladus.plugins.builtin.errorsender.params import java.time.Instant -import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams.ErrorSourceId +import za.co.absa.enceladus.utils.modules.SourcePhase case class ErrorSenderPluginParams(datasetName: String, datasetVersion: Int, reportDate: String, reportVersion: Int, outputPath: String, - sourceId: ErrorSourceId.Value, + sourceId: SourcePhase, sourceSystem: String, 
runUrls: Option[String], runId: Option[Int], @@ -37,11 +37,6 @@ case class ErrorSenderPluginParams(datasetName: String, object ErrorSenderPluginParams { - object ErrorSourceId extends Enumeration { - val Standardization = Value("standardizaton") - val Conformance = Value("conformance") - } - object FieldNames { val datasetName = "datasetName" val datasetVersion = "datasetVersion" @@ -65,7 +60,7 @@ object ErrorSenderPluginParams { reportDate -> params.reportDate, reportVersion -> params.reportVersion.toString, outputPath -> params.outputPath, - sourceId -> params.sourceId.toString, + sourceId -> params.sourceId.asIdentifier, sourceSystem -> params.sourceSystem, processingTimestamp -> params.processingTimestamp.toString ) ++ @@ -80,7 +75,7 @@ object ErrorSenderPluginParams { reportDate = params(reportDate), reportVersion = params(reportVersion).toInt, outputPath = params(outputPath), - sourceId = ErrorSourceId.withName(params(sourceId)), + sourceId = SourcePhase.withIdentifier(params(sourceId)), sourceSystem = params(sourceSystem), runUrls = params.get(runUrls), runId = params.get(runId).map(_.toInt), diff --git a/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginSuite.scala b/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginSuite.scala index b1b01aa37..de95d10e4 100644 --- a/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginSuite.scala +++ b/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/mq/KafkaErrorSenderPluginSuite.scala @@ -29,7 +29,7 @@ import za.co.absa.enceladus.plugins.builtin.errorsender.DceError import za.co.absa.enceladus.plugins.builtin.errorsender.mq.KafkaErrorSenderPluginSuite.{TestingErrCol, TestingRecord} import za.co.absa.enceladus.plugins.builtin.errorsender.mq.kafka.KafkaErrorSenderPlugin import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams -import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams.ErrorSourceId +import za.co.absa.enceladus.utils.modules.SourcePhase import za.co.absa.enceladus.utils.testUtils.SparkTestBase @@ -65,17 +65,17 @@ class KafkaErrorSenderPluginSuite extends FlatSpec with SparkTestBase with Match import spark.implicits._ - val testDataDf = testData.toDF - val testNow = Instant.now() + private val testDataDf = testData.toDF + private val testNow = Instant.now() - val defaultPluginParams = ErrorSenderPluginParams( + private val defaultPluginParams = ErrorSenderPluginParams( "datasetName1", datasetVersion = 1, "2020-03-30", reportVersion = 1, "output/Path1", null, "sourceSystem1", Some("http://runUrls1"), runId = Some(1), Some("uniqueRunId"), testNow) "ErrorSenderPluginParams" should "getIndividualErrors (exploding, filtering by source for Standardization)" in { val plugin = KafkaErrorSenderPluginImpl(null, Map(), Map()) - plugin.getIndividualErrors(testDataDf, defaultPluginParams.copy(sourceId = ErrorSourceId.Standardization)) + plugin.getIndividualErrors(testDataDf, defaultPluginParams.copy(sourceId = SourcePhase.Standardization)) .as[DceError].collect.map(entry => (entry.errorType, entry.errorCode)) should contain theSameElementsAs Seq( ("stdCastError", "E00000"), ("stdNullError", "E00002"), @@ -87,7 +87,7 @@ class KafkaErrorSenderPluginSuite extends FlatSpec with SparkTestBase with Match it should "getIndividualErrors (exploding, filtering by source for Conformance)" in { val plugin = 
KafkaErrorSenderPluginImpl(null, Map(), Map()) - plugin.getIndividualErrors(testDataDf, defaultPluginParams.copy(sourceId = ErrorSourceId.Conformance)) + plugin.getIndividualErrors(testDataDf, defaultPluginParams.copy(sourceId = SourcePhase.Conformance)) .as[DceError].collect.map(entry => (entry.errorType, entry.errorCode)) should contain theSameElementsAs Seq( ("confMapError", "E00001"), ("confCastError", "E00003"), @@ -101,7 +101,7 @@ class KafkaErrorSenderPluginSuite extends FlatSpec with SparkTestBase with Match val testKafkaUrl = "http://example.com:9092" val testSchemaRegUrl = "http://example.com:8081" - val testConfig = ConfigFactory.empty() + private val testConfig = ConfigFactory.empty() .withValue("kafka.error.client.id", ConfigValueFactory.fromAnyRef(testClientId)) .withValue("kafka.error.topic.name", ConfigValueFactory.fromAnyRef(testTopicName)) .withValue("kafka.bootstrap.servers", ConfigValueFactory.fromAnyRef(testKafkaUrl)) @@ -143,7 +143,7 @@ class KafkaErrorSenderPluginSuite extends FlatSpec with SparkTestBase with Match // onlyConformanceErrorsDataDf should result in 0 std errors val onlyConformanceErrorsDataDf = Seq(testData(1)).toDF - errorKafkaPlugin.onDataReady(onlyConformanceErrorsDataDf, defaultPluginParams.copy(sourceId = ErrorSourceId.Standardization).toMap) + errorKafkaPlugin.onDataReady(onlyConformanceErrorsDataDf, defaultPluginParams.copy(sourceId = SourcePhase.Standardization).toMap) assert(sendErrorsToKafkaWasCalled == false, "KafkaErrorSenderPluginImpl.sentErrorToKafka should not be called for 0 errors") } @@ -160,11 +160,11 @@ class KafkaErrorSenderPluginSuite extends FlatSpec with SparkTestBase with Match } Seq( - ErrorSourceId.Standardization -> Seq( + SourcePhase.Standardization -> Seq( "standardizaton,stdCastError,E00000,Standardization Error - Type cast", "standardizaton,stdNullError,E00002,Standardization Error - Null detected in non-nullable attribute" ), - ErrorSourceId.Conformance -> Seq( + SourcePhase.Conformance -> Seq( "conformance,confNegErr,E00004,Conformance Negation Error", "conformance,confLitErr,E00005,Conformance Literal Error" ) diff --git a/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParamsSuite.scala b/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParamsSuite.scala index 0c4513bd7..07c03eb6f 100644 --- a/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParamsSuite.scala +++ b/plugins-builtin/src/test/scala/za/co/absa/enceladus/plugins/builtin/errorsender/params/ErrorSenderPluginParamsSuite.scala @@ -18,17 +18,17 @@ package za.co.absa.enceladus.plugins.builtin.errorsender.params import java.time.Instant import org.scalatest.{FlatSpec, Matchers} -import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams.ErrorSourceId +import za.co.absa.enceladus.utils.modules.SourcePhase class ErrorSenderPluginParamsSuite extends FlatSpec with Matchers { - val params = ErrorSenderPluginParams( + private val params = ErrorSenderPluginParams( datasetName = "datasetName1", datasetVersion = 1, reportDate = "2020-03-30", reportVersion = 1, outputPath = "output/Path1", - sourceId = ErrorSourceId.Conformance, + sourceId = SourcePhase.Conformance, sourceSystem = "sourceSystem1", runUrls = Some("http://runUrls1"), runId = Some(1), diff --git a/pom.xml b/pom.xml index 0b7997f48..50bd4e562 100644 --- a/pom.xml +++ b/pom.xml @@ -159,7 +159,7 @@ 3.6.4 2.10.0 0.5.0 - 
3.7.0 + 4.0.0-RC2 0-10 3.1.1 2.0.0.RELEASE diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/common/CommonJobExecution.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/CommonJobExecution.scala new file mode 100644 index 000000000..38d9eebac --- /dev/null +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/CommonJobExecution.scala @@ -0,0 +1,225 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.enceladus.common + +import java.text.MessageFormat +import java.time.Instant + +import com.typesafe.config.{Config, ConfigFactory} +import org.apache.spark.SPARK_VERSION +import org.apache.spark.sql.SparkSession +import org.slf4j.{Logger, LoggerFactory} +import za.co.absa.atum.AtumImplicits +import za.co.absa.atum.core.Atum +import za.co.absa.enceladus.common.config.{JobConfigParser, PathConfig} +import za.co.absa.enceladus.common.plugin.PostProcessingService +import za.co.absa.enceladus.common.plugin.menas.{MenasPlugin, MenasRunUrl} +import za.co.absa.enceladus.common.version.SparkVersionGuard +import za.co.absa.enceladus.dao.MenasDAO +import za.co.absa.enceladus.dao.rest.MenasConnectionStringParser +import za.co.absa.enceladus.model.Dataset +import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams +import za.co.absa.enceladus.utils.config.SecureConfig +import za.co.absa.enceladus.utils.fs.FileSystemVersionUtils +import za.co.absa.enceladus.utils.general.ProjectMetadataTools +import za.co.absa.enceladus.utils.modules.SourcePhase +import za.co.absa.enceladus.utils.performance.PerformanceMeasurer +import za.co.absa.enceladus.utils.time.TimeZoneNormalizer + +import scala.util.control.NonFatal +import scala.util.{Failure, Success, Try} + +trait CommonJobExecution { + + protected case class PreparationResult( + dataset: Dataset, + reportVersion: Int, + pathCfg: PathConfig, + performance: PerformanceMeasurer + ) + + TimeZoneNormalizer.normalizeJVMTimeZone() + SparkVersionGuard.fromDefaultSparkCompatibilitySettings.ensureSparkVersionCompatibility(SPARK_VERSION) + + protected val log: Logger = LoggerFactory.getLogger(this.getClass) + protected val conf: Config = ConfigFactory.load() + protected val menasBaseUrls: List[String] = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri")) + + protected def obtainSparkSession[T]()(implicit cmd: JobConfigParser[T]): SparkSession = { + val enceladusVersion = ProjectMetadataTools.getEnceladusVersion + log.info(s"Enceladus version $enceladusVersion") + val reportVersion = cmd.reportVersion.map(_.toString).getOrElse("") + val spark = SparkSession.builder() + .appName(s"Standardisation $enceladusVersion ${cmd.datasetName} ${cmd.datasetVersion} ${cmd.reportDate} $reportVersion") + .getOrCreate() + TimeZoneNormalizer.normalizeSessionTimeZone(spark) + spark + } + + protected def initialValidation(): Unit = { + // This should be the first thing the app does to make secure Kafka work with our CA. 
+ // After Spring activates JavaX, it will be too late. + SecureConfig.setSecureKafkaProperties(conf) + } + + protected def prepareJob[T]() + (implicit dao: MenasDAO, + cmd: JobConfigParser[T], + fsUtils: FileSystemVersionUtils, + spark: SparkSession): PreparationResult = { + dao.authenticate() + val dataset = dao.getDataset(cmd.datasetName, cmd.datasetVersion) + val reportVersion = getReportVersion(cmd, dataset) + val pathCfg = getPathCfg(cmd, dataset, reportVersion) + + log.info(s"input path: ${pathCfg.inputPath}") + log.info(s"output path: ${pathCfg.outputPath}") + // die if the output path exists + validateForExistingOutputPath(fsUtils, pathCfg) + + val performance = initPerformanceMeasurer(pathCfg.inputPath) + + // Enable Spline + import za.co.absa.spline.core.SparkLineageInitializer._ + spark.enableLineageTracking() + + // Enable non-default persistence storage level if provided in the command line + cmd.persistStorageLevel.foreach(Atum.setCachingStorageLevel) + + PreparationResult(dataset, reportVersion, pathCfg, performance) + } + + protected def runPostProcessing[T](sourceId: SourcePhase, preparationResult: PreparationResult, jobCmdConfig: JobConfigParser[T]) + (implicit spark: SparkSession, fileSystemVersionUtils: FileSystemVersionUtils): Unit = { + val df = spark.read.parquet(preparationResult.pathCfg.outputPath) + val runId = MenasPlugin.runNumber + + if (runId.isEmpty) { + log.warn("No run number found, the Run URL cannot be properly reported!") + } + + // reporting the UI url(s) - if more than one, it's comma-separated + val runUrl: Option[String] = runId.map { runNumber => + menasBaseUrls.map { menasBaseUrl => + MenasRunUrl.getMenasUiRunUrl(menasBaseUrl, jobCmdConfig.datasetName, jobCmdConfig.datasetVersion, runNumber) + }.mkString(",") + } + + val sourceSystem = Atum.getControlMeasure.metadata.sourceApplication + val uniqueRunId = Atum.getControlMeasure.runUniqueId + + val params = ErrorSenderPluginParams(jobCmdConfig.datasetName, + jobCmdConfig.datasetVersion, jobCmdConfig.reportDate, preparationResult.reportVersion, preparationResult.pathCfg.outputPath, + sourceId, sourceSystem, runUrl, runId, uniqueRunId, Instant.now) + val postProcessingService = PostProcessingService(conf, params) + postProcessingService.onSaveOutput(df) + } + + protected def finishJob[T](jobConfig: JobConfigParser[T]): Unit = { + val name = jobConfig.datasetName + val version = jobConfig.datasetVersion + MenasPlugin.runNumber.foreach { runNumber => + menasBaseUrls.foreach { menasBaseUrl => + val apiUrl = MenasRunUrl.getMenasApiRunUrl(menasBaseUrl, name, version, runNumber) + val uiUrl = MenasRunUrl.getMenasUiRunUrl(menasBaseUrl, name, version, runNumber) + + log.info(s"Menas API Run URL: $apiUrl") + log.info(s"Menas UI Run URL: $uiUrl") + } + } + } + + + protected def getPathCfg[T](cmd: JobConfigParser[T], dataset: Dataset, reportVersion: Int): PathConfig + + protected def getStandardizationPath[T](jobConfig: JobConfigParser[T], reportVersion: Int): String = { + MessageFormat.format(conf.getString("standardized.hdfs.path"), + jobConfig.datasetName, + jobConfig.datasetVersion.toString, + jobConfig.reportDate, + reportVersion.toString) + } + + protected def handleControlInfoValidation(): Unit = { + ControlInfoValidation.addRawAndSourceRecordCountsToMetadata() match { + case Failure(ex: za.co.absa.enceladus.utils.validation.ValidationException) => + val confEntry = "control.info.validation" + conf.getString(confEntry) match { + case "strict" => throw ex + case "warning" => log.warn(ex.msg) + case "none" 
=> + case _ => throw new RuntimeException(s"Invalid $confEntry value") + } + case Failure(ex) => throw ex + case Success(_) => + } + } + + protected def validateForExistingOutputPath(fsUtils: FileSystemVersionUtils, pathCfg: PathConfig): Unit = { + if (fsUtils.hdfsExists(pathCfg.outputPath)) { + throw new IllegalStateException( + s"Path ${pathCfg.outputPath} already exists. Increment the run version, or delete ${pathCfg.outputPath}" + ) + } + } + + protected def writePerformanceMetrics[T](performance: PerformanceMeasurer, jobCmdConfig: JobConfigParser[T]): Unit = { + jobCmdConfig.performanceMetricsFile.foreach(fileName => try { + performance.writeMetricsToFile(fileName) + } catch { + case NonFatal(e) => log.error(s"Unable to write performance metrics to file '$fileName': ${e.getMessage}") + }) + } + + protected def handleEmptyOutput(job: SourcePhase)(implicit spark: SparkSession): Unit = { + import za.co.absa.atum.core.Constants._ + + val areCountMeasurementsAllZero = Atum.getControlMeasure.checkpoints + .flatMap(checkpoint => + checkpoint.controls.filter(control => + control.controlName.equalsIgnoreCase(controlTypeRecordCount))) + .forall(m => Try(m.controlValue.toString.toDouble).toOption.contains(0D)) + + if (areCountMeasurementsAllZero) { + log.warn(s"Empty output after running $job. Previous checkpoints show this is correct.") + } else { + val errMsg = s"Empty output after running $job, while previous checkpoints show non zero record count" + AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError(job.toString, errMsg, "") + throw new IllegalStateException(errMsg) + } + } + + private def getReportVersion[T](jobConfig: JobConfigParser[T], dataset: Dataset)(implicit fsUtils: FileSystemVersionUtils): Int = { + jobConfig.reportVersion match { + case Some(version) => version + case None => + val newVersion = fsUtils.getLatestVersion(dataset.hdfsPublishPath, jobConfig.reportDate) + 1 + log.warn(s"Report version not provided, inferred report version: $newVersion") + log.warn("This is an EXPERIMENTAL feature.") + log.warn(" -> It can lead to issues when running multiple jobs on a dataset concurrently.") + log.warn(" -> It may not work as desired when there are gaps in the versions of the data being landed.") + newVersion + } + } + + private def initPerformanceMeasurer(path: String) + (implicit spark: SparkSession, fsUtils: FileSystemVersionUtils): PerformanceMeasurer = { + val performance = new PerformanceMeasurer(spark.sparkContext.appName) + val stdDirSize = fsUtils.getDirectorySize(path) + performance.startMeasurement(stdDirSize) + performance + } +} diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/ConfigError.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/ConfigError.scala new file mode 100644 index 000000000..3a8570d3e --- /dev/null +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/ConfigError.scala @@ -0,0 +1,18 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.enceladus.common.config + +case class ConfigError(message: String) extends Exception(message) diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/JobConfigParser.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/JobConfigParser.scala new file mode 100644 index 000000000..03b217588 --- /dev/null +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/JobConfigParser.scala @@ -0,0 +1,119 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.enceladus.common.config + +import org.apache.spark.storage.StorageLevel +import scopt.OParser +import za.co.absa.enceladus.dao.auth.{InvalidMenasCredentialsFactory, MenasCredentialsFactory, MenasKerberosCredentialsFactory, MenasPlainCredentialsFactory} + +import scala.util.matching.Regex + + +trait JobConfigParser[R] { + def withDatasetName(value: String): R + def withDatasetVersion(value: Int): R + def withReportDate(value: String): R + def withReportVersion(value: Option[Int]): R + def withPerformanceMetricsFile(value: Option[String]): R + def withFolderPrefix(value: Option[String]): R + def withCredsFile(value: Option[String], menasCredentialsFactory: MenasCredentialsFactory): R + def withAuthKeytab(value: Option[String], menasCredentialsFactory: MenasCredentialsFactory): R + def withPersistStorageLevel(value: Option[StorageLevel]): R + + def datasetName: String + def reportDate: String + def menasCredentialsFactory: MenasCredentialsFactory + def datasetVersion: Int + def reportVersion: Option[Int] + def performanceMetricsFile: Option[String] + def folderPrefix: Option[String] + def persistStorageLevel: Option[StorageLevel] + def credsFile: Option[String] + def keytabFile: Option[String] +} + +object JobConfigParser { + + //scalastyle:off method.length the length is legit for parsing input paramters + def jobConfigParser[R <: JobConfigParser[R]]: OParser[_, R] = { + val builder = OParser.builder[R] + import builder._ + OParser.sequence(head("Job Parameters"), + opt[String]('D', "dataset-name").required().action((value, config) => + config.withDatasetName(value)).text("Dataset name"), + + opt[Int]('d', "dataset-version").required().action((value, config) => + config.withDatasetVersion(value)).text("Dataset version") + .validate(value => + if (value > 0) { + success + } else { + failure("Option --dataset-version must be > 0") + }), + + opt[String]('R', "report-date").required().action((value, config) => + config.withReportDate(value)).text("Report date in 'yyyy-MM-dd' format") + .validate(value => { + val reportDateMatcher: Regex = "^\\d{4}-\\d{2}-\\d{2}$".r + reportDateMatcher.findFirstIn(value) match { + case None => failure(s"Match error in '$value'. Option --report-date expects a date in 'yyyy-MM-dd' format") + case _ => success + } + }), + + opt[Int]('r', "report-version").optional().action((value, config) => + config.withReportVersion(Some(value))) + .text("Report version. 
If not provided, it is inferred based on the publish path (it's an EXPERIMENTAL feature)") + .validate(value => + if (value > 0) { + success + } else { + failure("Option --report-version must be >0") + }), + + opt[String]("menas-credentials-file").hidden.optional().action({ (file, config) => + config.withCredsFile(Option(file), new MenasPlainCredentialsFactory(file)) + }).text("Path to Menas credentials config file."), + + opt[String]("menas-auth-keytab").optional().action({ (file, config) => { + config.withAuthKeytab(Option(file), new MenasKerberosCredentialsFactory(file)) + } + }).text("Path to keytab file used for authenticating to menas"), + + + opt[String]("performance-file").optional().action((value, config) => + config.withPerformanceMetricsFile(Option(value))) + .text("Produce a performance metrics file at the given location (local filesystem)"), + + opt[String]("folder-prefix").optional().action((value, config) => + config.withFolderPrefix(Option(value))).text("Adds a folder prefix before the infoDateColumn"), + + opt[String]("persist-storage-level").optional().action((value, config) => + config.withPersistStorageLevel(Some(StorageLevel.fromString(value)))) + .text("Specifies persistence storage level to use when processing data. Spark's default is MEMORY_AND_DISK."), + + checkConfig { config => + config.menasCredentialsFactory match { + case InvalidMenasCredentialsFactory => failure("No authentication method specified (e.g. --menas-auth-keytab)") + case _ if config.credsFile.isDefined && config.keytabFile.isDefined => + failure("Only one authentication method is allowed at a time") + case _ => success + } + } + ) + } + //scalastyle:on method.length +} diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/PathConfig.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/PathConfig.scala new file mode 100644 index 000000000..36f40c83e --- /dev/null +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/config/PathConfig.scala @@ -0,0 +1,18 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.enceladus.common.config + +case class PathConfig(inputPath: String, outputPath: String) diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/common/plugin/PostProcessingService.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/plugin/PostProcessingService.scala index 3fd3dd83a..983c20474 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/common/plugin/PostProcessingService.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/common/plugin/PostProcessingService.scala @@ -15,51 +15,12 @@ package za.co.absa.enceladus.common.plugin -import java.time.Instant - import com.typesafe.config.Config import org.apache.log4j.LogManager import org.apache.spark.sql.{DataFrame, SparkSession} -import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams.ErrorSourceId._ import za.co.absa.enceladus.plugins.api.postprocessor.PostProcessor import za.co.absa.enceladus.plugins.builtin.errorsender.params.ErrorSenderPluginParams - -object PostProcessingService { - //scalastyle:off parameter.number - def forStandardization(config: Config, - datasetName: String, - datasetVersion: Int, - reportDate: String, - reportVersion: Int, - outputPath: String, - sourceSystem: String, - runUrls: Option[String], - runId: Option[Int], - uniqueRunId: Option[String], - processingTimestamp: Instant): PostProcessingService = { - val params = ErrorSenderPluginParams(datasetName, datasetVersion, reportDate, reportVersion, outputPath, - Standardization, sourceSystem, runUrls, runId, uniqueRunId, processingTimestamp) - PostProcessingService(config, params) - } - - def forConformance(config: Config, - datasetName: String, - datasetVersion: Int, - reportDate: String, - reportVersion: Int, - outputPath: String, - sourceSystem: String, - runUrls: Option[String], - runId: Option[Int], - uniqueRunId: Option[String], - processingTimestamp: Instant): PostProcessingService = { - val params = ErrorSenderPluginParams(datasetName, datasetVersion, reportDate, reportVersion, outputPath, - Conformance, sourceSystem, runUrls, runId, uniqueRunId, processingTimestamp) - PostProcessingService(config, params) - //scalastyle:on parameter.number - } - -} +import za.co.absa.enceladus.utils.modules.SourcePhase._ case class PostProcessingService private(config: Config, additionalParams: ErrorSenderPluginParams) { diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConfCmdConfig.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConfCmdConfig.scala deleted file mode 100644 index 6e96039e7..000000000 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConfCmdConfig.scala +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright 2018 ABSA Group Limited - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package za.co.absa.enceladus.conformance - -import org.apache.spark.storage.StorageLevel -import scopt.OptionParser -import za.co.absa.enceladus.dao.auth._ - -import scala.util.matching.Regex - -/** - * This is a class for configuration provided by the command line parameters - * - * Note: scopt requires all fields to have default values. - * Even if a field is mandatory it needs a default value. - */ -case class ConfCmdConfig(datasetName: String = "", - datasetVersion: Int = 1, - reportDate: String = "", - reportVersion: Option[Int] = None, - menasCredentialsFactory: MenasCredentialsFactory = InvalidMenasCredentialsFactory, - performanceMetricsFile: Option[String] = None, - publishPathOverride: Option[String] = None, - folderPrefix: Option[String] = None, - experimentalMappingRule: Option[Boolean] = None, - isCatalystWorkaroundEnabled: Option[Boolean] = None, - autocleanStandardizedFolder: Option[Boolean] = None, - persistStorageLevel: Option[StorageLevel] = None) - -object ConfCmdConfig { - - def getCmdLineArguments(args: Array[String]): ConfCmdConfig = { - val parser = new CmdParser("spark-submit [spark options] ConformanceBundle.jar") - - val optionCmd = parser.parse(args, ConfCmdConfig()) - if (optionCmd.isEmpty) { - // Wrong arguments provided, the message is already displayed - System.exit(1) - } - optionCmd.get - } - - private class CmdParser(programName: String) extends OptionParser[ConfCmdConfig](programName) { - head("Dynamic Conformance", "") - - opt[String]('D', "dataset-name").required().action((value, config) => - config.copy(datasetName = value)).text("Dataset name") - - opt[Int]('d', "dataset-version").required().action((value, config) => - config.copy(datasetVersion = value)).text("Dataset version") - .validate(value => - if (value > 0) { - success - } else { - failure("Option --dataset-version must be >0") - }) - - val reportDateMatcher: Regex = "^\\d{4}-\\d{2}-\\d{2}$".r - opt[String]('R', "report-date").required().action((value, config) => - config.copy(reportDate = value)).text("Report date in 'yyyy-MM-dd' format") - .validate(value => - reportDateMatcher.findFirstIn(value) match { - case None => failure(s"Match error in '$value'. Option --report-date expects a date in 'yyyy-MM-dd' format") - case _ => success - }) - - opt[Int]('r', "report-version").optional().action((value, config) => - config.copy(reportVersion = Some(value))) - .text("Report version. 
If not provided, it is inferred based on the publish path (it's an EXPERIMENTAL feature)") - .validate(value => - if (value > 0) { - success - } else { - failure("Option --report-version must be >0") - }) - - private var credsFile: Option[String] = None - private var keytabFile: Option[String] = None - opt[String]("menas-credentials-file").hidden.optional().action({ (file, config) => - credsFile = Some(file) - config.copy(menasCredentialsFactory = new MenasPlainCredentialsFactory(file)) - }).text("Path to Menas credentials config file.").validate(path => - if (keytabFile.isDefined) { - failure("Only one authentication method is allow at a time") - } else { - success - }) - - opt[String]("menas-auth-keytab").optional().action({ (file, config) => - keytabFile = Some(file) - config.copy(menasCredentialsFactory = new MenasKerberosCredentialsFactory(file)) - }).text("Path to keytab file used for authenticating to menas").validate({ file => - if (credsFile.isDefined) { - failure("Only one authentication method is allowed at a time") - } else { - success - } - }) - - opt[String]("performance-file").optional().action((value, config) => - config.copy(performanceMetricsFile = Option(value))).text("Produce a performance metrics file at the given location (local filesystem)") - - opt[String]("debug-set-publish-path").optional().hidden().action((value, config) => - config.copy(publishPathOverride = Option(value))).text("override the path of the published data (used internally for testing)") - - opt[String]("folder-prefix").optional().action((value, config) => - config.copy(folderPrefix = Option(value))).text("Adds a folder prefix before the infoDateColumn") - - opt[Boolean]("experimental-mapping-rule").optional().action((value, config) => - config.copy(experimentalMappingRule = Option(value))).text("Use experimental optimized mapping conformance rule") - - opt[Boolean]("catalyst-workaround").optional().action((value, config) => - config.copy(isCatalystWorkaroundEnabled = Option(value))).text("Turn on or off Catalyst workaround feature. " + - "This overrides 'conformance.catalyst.workaround' configuration value provided in 'application.conf'.") - - opt[Boolean]("autoclean-std-folder").optional().action((value, config) => - config.copy(autocleanStandardizedFolder = Option(value))).text("Deletes standardized data from HDFS once " + - "it is successfully conformed. This overrides 'conformance.autoclean.standardized.hdfs.folder' configuration " + - " value provided in 'application.conf'.") - - opt[String]("persist-storage-level").optional().action((value, config) => - config.copy(persistStorageLevel = Some(StorageLevel.fromString(value)))) - .text("Specifies persistence storage level to use when processing data. Spark's default is MEMORY_AND_DISK.") - - help("help").text("prints this usage text") - - checkConfig { config => - config.menasCredentialsFactory match { - case InvalidMenasCredentialsFactory => failure("No authentication method specified (e.g. 
--menas-auth-keytab)") - case _ => success - } - } - } - -} diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConformanceExecution.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConformanceExecution.scala new file mode 100644 index 000000000..ae271768c --- /dev/null +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConformanceExecution.scala @@ -0,0 +1,180 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.enceladus.conformance + +import java.io.{PrintWriter, StringWriter} + +import org.apache.spark.sql.functions.{lit, to_date} +import org.apache.spark.sql.{DataFrame, SparkSession} +import za.co.absa.atum.AtumImplicits +import za.co.absa.atum.AtumImplicits._ +import za.co.absa.atum.core.Atum +import za.co.absa.enceladus.common.Constants.{InfoDateColumn, InfoDateColumnString, InfoVersionColumn, ReportDateFormat} +import za.co.absa.enceladus.common.RecordIdGeneration._ +import za.co.absa.enceladus.common.config.{JobConfigParser, PathConfig} +import za.co.absa.enceladus.common.plugin.menas.MenasPlugin +import za.co.absa.enceladus.common.{CommonJobExecution, Constants, RecordIdGeneration} +import za.co.absa.enceladus.conformance.config.{ConformanceConfig, ConformanceParser} +import za.co.absa.enceladus.conformance.interpreter.rules.ValidationException +import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches} +import za.co.absa.enceladus.dao.MenasDAO +import za.co.absa.enceladus.dao.auth.MenasCredentials +import za.co.absa.enceladus.model.Dataset +import za.co.absa.enceladus.utils.fs.FileSystemVersionUtils +import za.co.absa.enceladus.utils.implicits.DataFrameImplicits.DataFrameEnhancements +import za.co.absa.enceladus.utils.modules.SourcePhase +import za.co.absa.enceladus.utils.performance.PerformanceMetricTools +import za.co.absa.enceladus.utils.schema.SchemaUtils + +import scala.util.control.NonFatal +import scala.util.{Failure, Success, Try} + +trait ConformanceExecution extends CommonJobExecution { + private val conformanceReader = new ConformancePropertiesProvider + private val sourceId = SourcePhase.Conformance + + protected def prepareConformance[T](preparationResult: PreparationResult) + (implicit dao: MenasDAO, + cmd: ConformanceParser[T], + fsUtils: FileSystemVersionUtils, + spark: SparkSession + ): Unit = { + // Enable Control Framework + import za.co.absa.atum.AtumImplicits.SparkSessionWrapper + + spark.enableControlMeasuresTracking(s"${preparationResult.pathCfg.inputPath}/_INFO") + .setControlMeasuresWorkflow(sourceId.toString) + + // Enable control framework performance optimization for pipeline-like jobs + Atum.setAllowUnpersistOldDatasets(true) + + // Enable Menas plugin for Control Framework + MenasPlugin.enableMenas( + conf, + cmd.datasetName, + cmd.datasetVersion, + cmd.reportDate, + preparationResult.reportVersion) + } + + protected def readConformanceInputData(pathCfg: PathConfig)(implicit spark: SparkSession): 
DataFrame = { + spark.read.parquet(pathCfg.inputPath) + } + + protected def conform(inputData: DataFrame, preparationResult: PreparationResult) + (implicit spark: SparkSession, cmd: ConformanceConfig, dao: MenasDAO): DataFrame = { + val recordIdGenerationStrategy = getRecordIdGenerationStrategyFromConfig(conf) + + implicit val featureSwitcher: FeatureSwitches = conformanceReader.readFeatureSwitches() + + Try { + handleControlInfoValidation() + DynamicInterpreter.interpret(preparationResult.dataset, inputData) + } match { + case Failure(e: ValidationException) => + AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError(sourceId.toString, e.getMessage, e.techDetails) + throw e + case Failure(NonFatal(e)) => + val sw = new StringWriter + e.printStackTrace(new PrintWriter(sw)) + AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError(sourceId.toString, e.getMessage, sw.toString) + throw e + case Success(conformedDF) => + if (SchemaUtils.fieldExists(Constants.EnceladusRecordId, conformedDF.schema)) { + conformedDF // no new id regeneration + } else { + RecordIdGeneration.addRecordIdColumnByStrategy(conformedDF, Constants.EnceladusRecordId, recordIdGenerationStrategy) + } + } + } + + protected def processConformanceResult(args: Array[String], + result: DataFrame, + preparationResult: PreparationResult, + menasCredentials: MenasCredentials) + (implicit spark: SparkSession, + cmd: ConformanceConfig, + fsUtils: FileSystemVersionUtils): Unit = { + val cmdLineArgs: String = args.mkString(" ") + + PerformanceMetricTools.addJobInfoToAtumMetadata( + "conform", + preparationResult.pathCfg.inputPath, + preparationResult.pathCfg.outputPath, + menasCredentials.username, cmdLineArgs + ) + + val withPartCols = result + .withColumnIfDoesNotExist(InfoDateColumn, to_date(lit(cmd.reportDate), ReportDateFormat)) + .withColumnIfDoesNotExist(InfoDateColumnString, lit(cmd.reportDate)) + .withColumnIfDoesNotExist(InfoVersionColumn, lit(preparationResult.reportVersion)) + + val recordCount = result.lastCheckpointRowCount match { + case None => withPartCols.count + case Some(p) => p + } + if (recordCount == 0) { + handleEmptyOutput(SourcePhase.Conformance) + } + + // ensure the whole path but version exists + fsUtils.createAllButLastSubDir(preparationResult.pathCfg.outputPath) + + withPartCols.write.parquet(preparationResult.pathCfg.outputPath) + + val publishDirSize = fsUtils.getDirectorySize(preparationResult.pathCfg.outputPath) + preparationResult.performance.finishMeasurement(publishDirSize, recordCount) + PerformanceMetricTools.addPerformanceMetricsToAtumMetadata( + spark, + "conform", + preparationResult.pathCfg.inputPath, + preparationResult.pathCfg.outputPath, + menasCredentials.username, cmdLineArgs + ) + + withPartCols.writeInfoFile(preparationResult.pathCfg.outputPath) + writePerformanceMetrics(preparationResult.performance, cmd) + + if (conformanceReader.isAutocleanStdFolderEnabled()) { + fsUtils.deleteDirectoryRecursively(preparationResult.pathCfg.inputPath) + } + log.info(s"$sourceId finished successfully") + } + + override protected def getPathCfg[T](cmd: JobConfigParser[T], conformance: Dataset, reportVersion: Int): PathConfig = { + val confCmd = cmd.asInstanceOf[ConformanceParser[T]] + PathConfig( + outputPath = buildPublishPath(confCmd, conformance, reportVersion), + inputPath = getStandardizationPath(cmd, reportVersion) + ) + } + + def buildPublishPath[T](cmd: ConformanceParser[T], + ds: Dataset, + reportVersion: Int): String = { + val infoDateCol: String = InfoDateColumn + 
val infoVersionCol: String = InfoVersionColumn + + (cmd.publishPathOverride, cmd.folderPrefix) match { + case (None, None) => + s"${ds.hdfsPublishPath}/$infoDateCol=${cmd.reportDate}/$infoVersionCol=$reportVersion" + case (None, Some(folderPrefix)) => + s"${ds.hdfsPublishPath}/$folderPrefix/$infoDateCol=${cmd.reportDate}/$infoVersionCol=$reportVersion" + case (Some(publishPathOverride), _) => + publishPathOverride + } + } +} diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConformancePropertiesProvider.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConformancePropertiesProvider.scala new file mode 100644 index 000000000..ed6be86a5 --- /dev/null +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/ConformancePropertiesProvider.scala @@ -0,0 +1,93 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.enceladus.conformance + +import com.typesafe.config.{Config, ConfigFactory} +import org.slf4j.{Logger, LoggerFactory} +import za.co.absa.enceladus.conformance.config.ConformanceConfig +import za.co.absa.enceladus.utils.config.ConfigUtils.ConfigImplicits +import za.co.absa.enceladus.conformance.interpreter.{FeatureSwitches, ThreeStateSwitch} +import ConformancePropertiesProvider._ + +/** + * Reads conformance properties from the configuration file + */ +class ConformancePropertiesProvider { + private val enableCF: Boolean = true + private val log: Logger = LoggerFactory.getLogger(this.getClass) + private implicit val conf: Config = ConfigFactory.load() + + def isAutocleanStdFolderEnabled()(implicit cmd: ConformanceConfig): Boolean = { + val enabled = getCmdOrConfigBoolean(cmd.autocleanStandardizedFolder, standardizedHdfsFolderKey, defaultValue = false) + log.info(s"Autoclean standardized HDFS folder = $enabled") + enabled + } + + def readFeatureSwitches()(implicit cmdConfig: ConformanceConfig): FeatureSwitches = FeatureSwitches() + .setExperimentalMappingRuleEnabled(isExperimentalRuleEnabled()) + .setCatalystWorkaroundEnabled(isCatalystWorkaroundEnabled()) + .setControlFrameworkEnabled(enableCF) + .setBroadcastStrategyMode(broadcastingStrategyMode) + .setBroadcastMaxSizeMb(broadcastingMaxSizeMb) + + private def isExperimentalRuleEnabled()(implicit cmd: ConformanceConfig): Boolean = { + val enabled = getCmdOrConfigBoolean(cmd.experimentalMappingRule, experimentalRuleKey, defaultValue = false) + log.info(s"Experimental mapping rule enabled = $enabled") + enabled + } + + private def isCatalystWorkaroundEnabled()(implicit cmd: ConformanceConfig): Boolean = { + val enabled = getCmdOrConfigBoolean(cmd.isCatalystWorkaroundEnabled, catalystWorkaroundKey, defaultValue = true) + log.info(s"Catalyst workaround enabled = $enabled") + enabled + } + + private def broadcastingStrategyMode: ThreeStateSwitch = { + ThreeStateSwitch(conf.getString(broadcastStrategyKey)) + } + + private def broadcastingMaxSizeMb: Int = { + conf.getInt(maxBroadcastSizeKey) + } + + /** + * Returns an effective 
value of a parameter according to the following priorities: + * - Command line arguments [highest] + * - Configuration file (application.conf) + * - Global default [lowest] + * + * @param cmdParameterOpt An optional value retrieved from command line arguments + * @param configKey A key in a configuration file + * @param defaultValue Global default value + * @return The effective value of the parameter + */ + private def getCmdOrConfigBoolean(cmdParameterOpt: Option[Boolean], configKey: String, defaultValue: Boolean) + (implicit conf: Config): Boolean = { + val enabled = cmdParameterOpt match { + case Some(b) => b + case None => conf.getOptionBoolean(configKey).getOrElse(defaultValue) + } + enabled + } +} + +object ConformancePropertiesProvider { + private val standardizedHdfsFolderKey = "conformance.autoclean.standardized.hdfs.folder" + private val maxBroadcastSizeKey = "conformance.mapping.rule.max.broadcast.size.mb" + private val experimentalRuleKey = "conformance.mapping.rule.experimental.implementation" + private val catalystWorkaroundKey = "conformance.catalyst.workaround" + private val broadcastStrategyKey = "conformance.mapping.rule.broadcast" +} diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/DynamicConformanceJob.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/DynamicConformanceJob.scala index a590c28c9..631d3224e 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/DynamicConformanceJob.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/DynamicConformanceJob.scala @@ -15,374 +15,38 @@ package za.co.absa.enceladus.conformance -import java.io.{PrintWriter, StringWriter} -import java.text.MessageFormat -import java.time.Instant - -import com.typesafe.config.{Config, ConfigFactory} -import org.apache.spark.sql.functions.{lit, to_date} -import org.apache.spark.sql.{DataFrame, Row, SparkSession} -import org.apache.spark.{SPARK_VERSION, sql} -import org.slf4j.{Logger, LoggerFactory} -import za.co.absa.atum.AtumImplicits -import za.co.absa.atum.AtumImplicits.{DataSetWrapper, StringToPath} -import za.co.absa.atum.core.Atum -import za.co.absa.enceladus.common.Constants._ -import za.co.absa.enceladus.common.RecordIdGeneration._ -import za.co.absa.enceladus.common.plugin.PostProcessingService -import za.co.absa.enceladus.common.plugin.menas.{MenasPlugin, MenasRunUrl} -import za.co.absa.enceladus.common.version.SparkVersionGuard -import za.co.absa.enceladus.common.{Constants, ControlInfoValidation, RecordIdGeneration} -import za.co.absa.enceladus.conformance.interpreter.rules.ValidationException -import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches, ThreeStateSwitch} +import org.apache.spark.sql.SparkSession +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.dao.MenasDAO -import za.co.absa.enceladus.dao.auth.MenasCredentials -import za.co.absa.enceladus.dao.rest.{MenasConnectionStringParser, RestDaoFactory} -import za.co.absa.enceladus.model.Dataset -import za.co.absa.enceladus.utils.config.ConfigUtils.ConfigImplicits -import za.co.absa.enceladus.utils.config.{ConfigReader, SecureConfig} +import za.co.absa.enceladus.dao.rest.RestDaoFactory import za.co.absa.enceladus.utils.fs.FileSystemVersionUtils -import za.co.absa.enceladus.utils.general.ProjectMetadataTools -import za.co.absa.enceladus.utils.implicits.DataFrameImplicits.DataFrameEnhancements -import za.co.absa.enceladus.utils.performance.{PerformanceMeasurer, 
PerformanceMetricTools} -import za.co.absa.enceladus.utils.schema.SchemaUtils -import za.co.absa.enceladus.utils.time.TimeZoneNormalizer - -import scala.util.control.NonFatal -import scala.util.{Failure, Success, Try} - -object DynamicConformanceJob { - TimeZoneNormalizer.normalizeJVMTimeZone() +import za.co.absa.enceladus.utils.modules.SourcePhase - private val log: Logger = LoggerFactory.getLogger(this.getClass) - private val conf: Config = ConfigFactory.load() - private val confReader: ConfigReader = new ConfigReader(conf) - private val menasBaseUrls = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri")) +object DynamicConformanceJob extends ConformanceExecution { def main(args: Array[String]) { // This should be the first thing the app does to make secure Kafka work with our CA. // After Spring activates JavaX, it will be too late. - SecureConfig.setSecureKafkaProperties(conf) - - SparkVersionGuard.fromDefaultSparkCompatibilitySettings.ensureSparkVersionCompatibility(SPARK_VERSION) - - confReader.logEffectiveConfigProps(Constants.ConfigKeysToRedact) + initialValidation() - implicit val cmd: ConfCmdConfig = ConfCmdConfig.getCmdLineArguments(args) + implicit val cmd: ConformanceConfig = ConformanceConfig.getFromArguments(args) implicit val spark: SparkSession = obtainSparkSession() // initialize spark implicit val fsUtils: FileSystemVersionUtils = new FileSystemVersionUtils(spark.sparkContext.hadoopConfiguration) val menasCredentials = cmd.menasCredentialsFactory.getInstance() implicit val dao: MenasDAO = RestDaoFactory.getInstance(menasCredentials, menasBaseUrls) - val enableCF: Boolean = true - - dao.authenticate() - - // get the dataset definition - val conformance = dao.getDataset(cmd.datasetName, cmd.datasetVersion) - val dateTokens = cmd.reportDate.split("-") - - val reportVersion = cmd.reportVersion match { - case Some(version) => version - case None => inferVersion(conformance.hdfsPublishPath, cmd.reportDate) - } - - val pathCfg = PathCfg( - publishPath = buildPublishPath(InfoDateColumn, InfoVersionColumn, cmd, conformance, reportVersion), - stdPath = MessageFormat.format(conf.getString("standardized.hdfs.path"), cmd.datasetName, - cmd.datasetVersion.toString, cmd.reportDate, reportVersion.toString) - ) - val recordIdGenerationStrategy = getRecordIdGenerationStrategyFromConfig(conf) - - log.info(s"stdpath = ${pathCfg.stdPath}, publishPath = ${pathCfg.publishPath}") - // die before performing any computation if the output path already exists - if (fsUtils.hdfsExists(pathCfg.publishPath)) { - throw new IllegalStateException( - s"Path ${pathCfg.publishPath} already exists. 
Increment the run version, or delete ${pathCfg.publishPath}") - } - - initFunctionalExtensions(reportVersion, pathCfg) - val performance = initPerformanceMeasurer(pathCfg.stdPath) - - // load data for input and mapping tables - val inputData = spark.read.parquet(pathCfg.stdPath) + val preparationResult = prepareJob() + prepareConformance(preparationResult) + val inputData = readConformanceInputData(preparationResult.pathCfg) try { - val result = conform(conformance, inputData, enableCF, recordIdGenerationStrategy) - - PerformanceMetricTools.addJobInfoToAtumMetadata("conform", - pathCfg.stdPath, pathCfg.publishPath, menasCredentials.username, args.mkString(" ")) + val result = conform(inputData, preparationResult) - processResult(result, performance, pathCfg, reportVersion, args.mkString(" "), menasCredentials) - log.info("Conformance finished successfully") - - // read written data from parquet directly - val conformedDf = spark.read.parquet(pathCfg.publishPath) - val postProcessingService = getPostProcessingService(cmd, pathCfg, reportVersion, MenasPlugin.runNumber, Atum.getControlMeasure.runUniqueId) - postProcessingService.onSaveOutput(conformedDf) // all enabled postProcessors will be run with the std df + processConformanceResult(args, result, preparationResult, menasCredentials) + runPostProcessing(SourcePhase.Conformance, preparationResult, cmd) } finally { - - MenasPlugin.runNumber.foreach { runNumber => - val name = cmd.datasetName - val version = cmd.datasetVersion - menasBaseUrls.foreach { menasBaseUrl => - log.info(s"Menas API Run URL: $menasBaseUrl/api/runs/$name/$version/$runNumber") - log.info(s"Menas UI Run URL: $menasBaseUrl/#/runs/$name/$version/$runNumber") - } - } - } - } - - private def getPostProcessingService(cmd: ConfCmdConfig, pathCfg: PathCfg, reportVersion: Int, - runNumber: Option[Int], uniqueRunId: Option[String] - )(implicit fsUtils: FileSystemVersionUtils): PostProcessingService = { - val runId = MenasPlugin.runNumber - - if (runId.isEmpty) { - log.warn("No run number found, the Run URL cannot be properly reported!") - } - - // reporting the UI url(s) - if more than one, its comma-separated - val runUrl: Option[String] = runId.map { runNumber => - menasBaseUrls.map { menasBaseUrl => - MenasRunUrl.getMenasUiRunUrl(menasBaseUrl, cmd.datasetName, cmd.datasetVersion, runNumber) - }.mkString(",") - } - - PostProcessingService.forConformance(conf, cmd.datasetName, cmd.datasetVersion, cmd.reportDate, - reportVersion, pathCfg.publishPath, Atum.getControlMeasure.metadata.sourceApplication, runUrl, - runId, uniqueRunId, Instant.now) - } - - private def isExperimentalRuleEnabled()(implicit cmd: ConfCmdConfig): Boolean = { - val enabled = getCmdOrConfigBoolean(cmd.experimentalMappingRule, - "conformance.mapping.rule.experimental.implementation", - defaultValue = false) - log.info(s"Experimental mapping rule enabled = $enabled") - enabled - } - - private def isCatalystWorkaroundEnabled()(implicit cmd: ConfCmdConfig): Boolean = { - val enabled = getCmdOrConfigBoolean(cmd.isCatalystWorkaroundEnabled, - "conformance.catalyst.workaround", - defaultValue = true) - log.info(s"Catalyst workaround enabled = $enabled") - enabled - } - - private def isAutocleanStdFolderEnabled()(implicit cmd: ConfCmdConfig): Boolean = { - val enabled = getCmdOrConfigBoolean(cmd.autocleanStandardizedFolder, - "conformance.autoclean.standardized.hdfs.folder", - defaultValue = false) - log.info(s"Autoclean standardized HDFS folder = $enabled") - enabled - } - - private def broadcastingStrategyMode: 
ThreeStateSwitch = { - ThreeStateSwitch(conf.getString("conformance.mapping.rule.broadcast")) - } - - private def broadcastingMaxSizeMb: Int = { - conf.getInt("conformance.mapping.rule.max.broadcast.size.mb") - } - - /** - * Returns an effective value of a parameter according to the following priorities: - * - Command line arguments [highest] - * - Configuration file (application.conf) - * - Global default [lowest] - * - * @param cmdParameterOpt An optional value retrieved from command line arguments - * @param configKey A key in a configuration file - * @param defaultValue Global default value - * @return The effective value of the parameter - */ - private def getCmdOrConfigBoolean(cmdParameterOpt: Option[Boolean], - configKey: String, - defaultValue: Boolean): Boolean = { - val enabled = cmdParameterOpt match { - case Some(b) => b - case None => - conf.getOptionBoolean(configKey).getOrElse(defaultValue) - } - enabled - } - - private def obtainSparkSession()(implicit cmd: ConfCmdConfig): SparkSession = { - val enceladusVersion = ProjectMetadataTools.getEnceladusVersion - log.info(s"Enceladus version $enceladusVersion") - val reportVersion = cmd.reportVersion.map(_.toString).getOrElse("") - val spark: SparkSession = SparkSession.builder() - .appName(s"Dynamic Conformance $enceladusVersion ${cmd.datasetName} ${cmd.datasetVersion} ${cmd.reportDate} $reportVersion") - .getOrCreate() - - TimeZoneNormalizer.normalizeSessionTimeZone(spark) - spark - } - - private def inferVersion(hdfsPublishPath: String, reportDate: String) - (implicit fsUtils: FileSystemVersionUtils):Int = { - val newVersion = fsUtils.getLatestVersion(hdfsPublishPath, reportDate) + 1 - log.warn(s"Report version not provided, inferred report version: $newVersion") - log.warn("This is an EXPERIMENTAL feature.") - log.warn(" -> It can lead to issues when running multiple jobs on a dataset concurrently.") - log.warn(" -> It may not work as desired when there are gaps in the versions of the data being landed.") - newVersion - } - - private def initFunctionalExtensions(reportVersion: Int, pathCfg: PathCfg)(implicit spark: SparkSession, - dao: MenasDAO, - cmd: ConfCmdConfig): Unit = { - // Enable Spline - import za.co.absa.spline.core.SparkLineageInitializer._ - spark.enableLineageTracking() - - // Enable Control Framework - import za.co.absa.atum.AtumImplicits.SparkSessionWrapper - spark.enableControlMeasuresTracking(s"${pathCfg.stdPath}/_INFO") - .setControlMeasuresWorkflow("Conformance") - - // Enable control framework performance optimization for pipeline-like jobs - Atum.setAllowUnpersistOldDatasets(true) - - // Enable non-default persistence storage level if provided in the command line - cmd.persistStorageLevel.foreach(Atum.setCachingStorageLevel) - - // Enable Menas plugin for Control Framework - MenasPlugin.enableMenas(conf, cmd.datasetName, cmd.datasetVersion, cmd.reportDate, reportVersion) - } - - private def initPerformanceMeasurer(stdPath: String) - (implicit spark: SparkSession, fsUtils: FileSystemVersionUtils): PerformanceMeasurer = { - // init performance measurer - val performance = new PerformanceMeasurer(spark.sparkContext.appName) - val stdDirSize = fsUtils.getDirectorySize(stdPath) - performance.startMeasurement(stdDirSize) - performance - } - - private def conform(conformance: Dataset, inputData: sql.Dataset[Row], enableCF: Boolean, recordIdGenerationStrategy: IdType) - (implicit spark: SparkSession, cmd: ConfCmdConfig, fsUtils: FileSystemVersionUtils, dao: MenasDAO): DataFrame = { - implicit val featureSwitcher: 
FeatureSwitches = FeatureSwitches() - .setExperimentalMappingRuleEnabled(isExperimentalRuleEnabled()) - .setCatalystWorkaroundEnabled(isCatalystWorkaroundEnabled()) - .setControlFrameworkEnabled(enableCF) - .setBroadcastStrategyMode(broadcastingStrategyMode) - .setBroadcastMaxSizeMb(broadcastingMaxSizeMb) - - Try { - handleControlInfoValidation() - DynamicInterpreter.interpret(conformance, inputData) - } match { - case Failure(e: ValidationException) => - AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError("Conformance", e.getMessage, e.techDetails) - throw e - case Failure(NonFatal(e)) => - val sw = new StringWriter - e.printStackTrace(new PrintWriter(sw)) - AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError("Conformance", e.getMessage, sw.toString) - throw e - case Success(conformedDF) => - if (SchemaUtils.fieldExists(Constants.EnceladusRecordId, conformedDF.schema)) { - conformedDF // no new id regeneration - } else { - RecordIdGeneration.addRecordIdColumnByStrategy(conformedDF, Constants.EnceladusRecordId, recordIdGenerationStrategy) - } - + finishJob(cmd) } } - - private def processResult(result: DataFrame, - performance: PerformanceMeasurer, - pathCfg: PathCfg, - reportVersion: Int, - cmdLineArgs: String, - menasCredentials: MenasCredentials) - (implicit spark: SparkSession, cmd: ConfCmdConfig, fsUtils: FileSystemVersionUtils): Unit = { - val withPartCols = result - .withColumnIfDoesNotExist(InfoDateColumn, to_date(lit(cmd.reportDate), ReportDateFormat)) - .withColumnIfDoesNotExist(InfoDateColumnString, lit(cmd.reportDate)) - .withColumnIfDoesNotExist(InfoVersionColumn, lit(reportVersion)) - - val recordCount = result.lastCheckpointRowCount match { - case None => withPartCols.count - case Some(p) => p - } - if (recordCount == 0) { handleEmptyOutputAfterConformance() } - - // ensure the whole path but version exists - fsUtils.createAllButLastSubDir(pathCfg.publishPath) - - withPartCols.write.parquet(pathCfg.publishPath) - - val publishDirSize = fsUtils.getDirectorySize(pathCfg.publishPath) - performance.finishMeasurement(publishDirSize, recordCount) - PerformanceMetricTools.addPerformanceMetricsToAtumMetadata(spark, "conform", - pathCfg.stdPath, pathCfg.publishPath, menasCredentials.username, cmdLineArgs) - - withPartCols.writeInfoFile(pathCfg.publishPath) - cmd.performanceMetricsFile.foreach(fileName => { - try { - performance.writeMetricsToFile(fileName) - } catch { - case NonFatal(e) => log.error(s"Unable to write performance metrics to file '$fileName': ${e.getMessage}") - } - }) - - if (isAutocleanStdFolderEnabled()) { - fsUtils.deleteDirectoryRecursively(pathCfg.stdPath) - } - } - - private def handleEmptyOutputAfterConformance()(implicit spark: SparkSession): Unit = { - import za.co.absa.atum.core.Constants._ - - val areCountMeasurementsAllZero = Atum.getControlMeasure.checkpoints - .flatMap(checkpoint => - checkpoint.controls.filter(control => - control.controlName.equalsIgnoreCase(controlTypeRecordCount))) - .forall(m => Try(m.controlValue.toString.toDouble).toOption.contains(0D)) - - if (areCountMeasurementsAllZero) { - log.warn("Empty output after running Dynamic Conformance. 
Previous checkpoints show this is correct.") - } else { - val errMsg = "Empty output after running Dynamic Conformance, " + - "while previous checkpoints show non zero record count" - AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError("Standardization", errMsg, "") - throw new IllegalStateException(errMsg) - } - } - - private def handleControlInfoValidation(): Unit = { - ControlInfoValidation.addRawAndSourceRecordCountsToMetadata() match { - case Failure(ex: za.co.absa.enceladus.utils.validation.ValidationException) => { - val confEntry = "control.info.validation" - conf.getString(confEntry) match { - case "strict" => throw ex - case "warning" => log.warn(ex.msg) - case "none" => - case _ => throw new RuntimeException(s"Invalid $confEntry value") - } - } - case Failure(ex) => throw ex - case Success(_) => - } - } - - def buildPublishPath(infoDateCol: String, - infoVersionCol: String, - cmd: ConfCmdConfig, - ds: Dataset, - reportVersion: Int): String = { - (cmd.publishPathOverride, cmd.folderPrefix) match { - case (None, None) => - s"${ds.hdfsPublishPath}/$infoDateCol=${cmd.reportDate}/$infoVersionCol=$reportVersion" - case (None, Some(folderPrefix)) => - s"${ds.hdfsPublishPath}/$folderPrefix/$infoDateCol=${cmd.reportDate}/$infoVersionCol=$reportVersion" - case (Some(publishPathOverride), _) => - publishPathOverride - } - } - - private final case class PathCfg(publishPath: String, stdPath: String) } + diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/HyperConformance.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/HyperConformance.scala index 61a915830..255cf880a 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/HyperConformance.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/HyperConformance.scala @@ -21,11 +21,11 @@ import java.util.Date import org.apache.commons.configuration2.Configuration import org.apache.spark.SPARK_VERSION import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.DateType import org.apache.spark.sql.{DataFrame, SparkSession} import org.slf4j.{Logger, LoggerFactory} import za.co.absa.enceladus.common.Constants._ import za.co.absa.enceladus.common.version.SparkVersionGuard +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.{Always, DynamicInterpreter, FeatureSwitches} import za.co.absa.enceladus.conformance.streaming.InfoDateFactory import za.co.absa.enceladus.dao.MenasDAO @@ -34,7 +34,7 @@ import za.co.absa.enceladus.dao.rest.{MenasConnectionStringParser, RestDaoFactor import za.co.absa.enceladus.model.Dataset import za.co.absa.hyperdrive.ingestor.api.transformer.{StreamTransformer, StreamTransformerFactory} -class HyperConformance (implicit cmd: ConfCmdConfig, +class HyperConformance (implicit cmd: ConformanceConfig, featureSwitches: FeatureSwitches, menasBaseUrls: List[String], infoDateFactory: InfoDateFactory) extends StreamTransformer { @@ -77,7 +77,7 @@ class HyperConformance (implicit cmd: ConfCmdConfig, } @throws[IllegalArgumentException] - private def getReportVersion(implicit cmd: ConfCmdConfig): Int = { + private def getReportVersion(implicit cmd: ConformanceConfig): Int = { cmd.reportVersion match { case Some(version) => version case None => throw new IllegalArgumentException("Report version is not provided.") @@ -121,19 +121,14 @@ object HyperConformance extends StreamTransformerFactory with HyperConformanceAt val menasCredentialsFactory = getMenasCredentialsFactory(conf: 
Configuration) - implicit val cmd: ConfCmdConfig = ConfCmdConfig( - datasetName = conf.getString(datasetNameKey), - datasetVersion = conf.getInt(datasetVersionKey), - reportDate = new SimpleDateFormat(ReportDateFormat).format(new Date()), // Still need a report date for mapping table patterns - reportVersion = Option(getReportVersion(conf)), - menasCredentialsFactory = menasCredentialsFactory, - performanceMetricsFile = None, - publishPathOverride = None, - folderPrefix = None, + implicit val confConfig: ConformanceConfig = ConformanceConfig(publishPathOverride = None, experimentalMappingRule = Some(true), isCatalystWorkaroundEnabled = Some(true), autocleanStandardizedFolder = Some(false), - persistStorageLevel = None + datasetName = conf.getString(datasetNameKey), + datasetVersion = conf.getInt(datasetVersionKey), + reportDate = new SimpleDateFormat(ReportDateFormat).format(new Date()), + menasCredentialsFactory = menasCredentialsFactory ) implicit val featureSwitcher: FeatureSwitches = FeatureSwitches() @@ -172,13 +167,4 @@ object HyperConformance extends StreamTransformerFactory with HyperConformanceAt case (true, true) => throw new IllegalArgumentException("Either a credentials file or a keytab should be specified, but not both.") } } - - private def getReportVersion(conf: Configuration): Int = { - if (conf.containsKey(reportVersionKey)) { - conf.getInt(reportVersionKey) - } else { - defaultReportVersion - } - } } - diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/config/ConformanceConfig.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/config/ConformanceConfig.scala new file mode 100644 index 000000000..4224f871f --- /dev/null +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/config/ConformanceConfig.scala @@ -0,0 +1,87 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.enceladus.conformance.config + +import org.apache.spark.storage.StorageLevel +import scopt.OParser +import za.co.absa.enceladus.common.config.{ConfigError, JobConfigParser} +import za.co.absa.enceladus.dao.auth.{InvalidMenasCredentialsFactory, MenasCredentialsFactory} + +import scala.util.Try + + +/** + * This is a class for configuration provided by the command line parameters + * + * Note: scopt requires all fields to have default values. + * Even if a field is mandatory it needs a default value. 
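 *
 * (Editor's sketch, not part of this change.) scopt folds each parsed option into a starting
 * instance, which is why the companion object below can call OParser.parse(parser, args, ConformanceConfig()).
 * A minimal illustration with a hypothetical two-field config:
 * {{{
 *   import scopt.OParser
 *
 *   case class Demo(name: String = "", version: Int = 1)
 *
 *   val demoBuilder = OParser.builder[Demo]
 *   val demoParser = {
 *     import demoBuilder._
 *     OParser.sequence(
 *       opt[String]("name").action((v, c) => c.copy(name = v)),
 *       opt[Int]("version").action((v, c) => c.copy(version = v)))
 *   }
 *
 *   OParser.parse(demoParser, Array("--name", "MyDataset"), Demo())  // Some(Demo("MyDataset", 1))
 * }}}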
+ */ +case class ConformanceConfig(datasetName: String = "", + datasetVersion: Int = 1, + reportDate: String = "", + reportVersion: Option[Int] = None, + menasCredentialsFactory: MenasCredentialsFactory = InvalidMenasCredentialsFactory, + performanceMetricsFile: Option[String] = None, + folderPrefix: Option[String] = None, + persistStorageLevel: Option[StorageLevel] = None, + publishPathOverride: Option[String] = None, + experimentalMappingRule: Option[Boolean] = None, + isCatalystWorkaroundEnabled: Option[Boolean] = None, + autocleanStandardizedFolder: Option[Boolean] = None, + credsFile: Option[String] = None, + keytabFile: Option[String] = None) + extends ConformanceParser[ConformanceConfig] { + + override def withPublishPathOverride(value: Option[String]): ConformanceConfig = copy(publishPathOverride = value) + override def withExperimentalMappingRule(value: Option[Boolean]): ConformanceConfig = copy(experimentalMappingRule = value) + override def withIsCatalystWorkaroundEnabled(value: Option[Boolean]): ConformanceConfig = + copy(isCatalystWorkaroundEnabled = value) + override def withAutocleanStandardizedFolder(value: Option[Boolean]): ConformanceConfig = + copy(autocleanStandardizedFolder = value) + override def withDatasetName(value: String): ConformanceConfig = copy(datasetName = value) + override def withDatasetVersion(value: Int): ConformanceConfig = copy(datasetVersion = value) + override def withReportDate(value: String): ConformanceConfig = copy(reportDate = value) + override def withReportVersion(value: Option[Int]): ConformanceConfig = copy(reportVersion = value) + override def withCredsFile(value: Option[String], menasCredentialsFactory: MenasCredentialsFactory): ConformanceConfig = + copy(credsFile = value, menasCredentialsFactory = menasCredentialsFactory) + + override def withAuthKeytab(value: Option[String], menasCredentialsFactory: MenasCredentialsFactory): ConformanceConfig = + copy(keytabFile = value, menasCredentialsFactory = menasCredentialsFactory) + + override def withPerformanceMetricsFile(value: Option[String]): ConformanceConfig = copy(performanceMetricsFile = value) + override def withFolderPrefix(value: Option[String]): ConformanceConfig = copy(folderPrefix = value) + override def withPersistStorageLevel(value: Option[StorageLevel]): ConformanceConfig = copy(persistStorageLevel = value) +} + +object ConformanceConfig { + def tryFromArguments(args: Array[String]): Try[ConformanceConfig] = { + import za.co.absa.enceladus.utils.implicits.OptionImplicits._ + OParser.parse(conformanceJobParser, args, ConformanceConfig()).toTry(ConfigError("Command line parameters error")) + } + + def getFromArguments(args: Array[String]): ConformanceConfig = tryFromArguments(args).get + + val conformanceJobParser: OParser[_, ConformanceConfig] = { + val builder = OParser.builder[ConformanceConfig] + import builder._ + OParser.sequence( + programName("Conformance Job"), + ConformanceParser.conformanceParser, + JobConfigParser.jobConfigParser + ) + } + +} diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/config/ConformanceParser.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/config/ConformanceParser.scala new file mode 100644 index 000000000..4e3d05680 --- /dev/null +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/config/ConformanceParser.scala @@ -0,0 +1,58 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance 
with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.enceladus.conformance.config + +import scopt.OParser +import za.co.absa.enceladus.common.config.JobConfigParser + +trait ConformanceParser[R] extends JobConfigParser[R] { + def publishPathOverride: Option[String] + def experimentalMappingRule: Option[Boolean] + def isCatalystWorkaroundEnabled: Option[Boolean] + def autocleanStandardizedFolder: Option[Boolean] + + def withPublishPathOverride(value: Option[String]): R + def withExperimentalMappingRule(value: Option[Boolean]): R + def withIsCatalystWorkaroundEnabled(value: Option[Boolean]): R + def withAutocleanStandardizedFolder(value: Option[Boolean]): R +} + +object ConformanceParser { + + def conformanceParser[R <: ConformanceParser[R]]: OParser[_, R] = { + val builder = OParser.builder[R] + import builder._ + OParser.sequence( + head("Dynamic Conformance", ""), + + opt[String]("debug-set-publish-path").optional().hidden().action((value, config) => + config.withPublishPathOverride(Some(value))).text("override the path of the published data (used internally for testing)"), + + opt[Boolean]("experimental-mapping-rule").optional().action((value, config) => + config.withExperimentalMappingRule(Option(value))).text("Use experimental optimized mapping conformance rule"), + + opt[Boolean]("catalyst-workaround").optional().action((value, config) => + config.withIsCatalystWorkaroundEnabled(Some(value))).text("Turn on or off Catalyst workaround feature. " + + "This overrides 'conformance.catalyst.workaround' configuration value provided in 'application.conf'."), + + opt[Boolean]("autoclean-std-folder").optional().action((value, config) => + config.withAutocleanStandardizedFolder(Option(value))).text("Deletes standardized data from HDFS once " + + "it is successfully conformed. 
This overrides 'conformance.autoclean.standardized.hdfs.folder' configuration " + + " value provided in 'application.conf'.") + ) + } +} + diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/DynamicInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/DynamicInterpreter.scala index 10cc11e5f..34f55bb72 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/DynamicInterpreter.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/DynamicInterpreter.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.storage.StorageLevel import org.slf4j.LoggerFactory import za.co.absa.atum.AtumImplicits._ -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.datasource.PartitioningUtils import za.co.absa.enceladus.conformance.interpreter.rules._ import za.co.absa.enceladus.conformance.interpreter.rules.custom.CustomConformanceRule @@ -51,7 +51,7 @@ object DynamicInterpreter { * */ def interpret(conformance: ConfDataset, inputDf: Dataset[Row], jobShortName: String = "Conformance") - (implicit spark: SparkSession, dao: MenasDAO, progArgs: ConfCmdConfig, featureSwitches: FeatureSwitches): DataFrame = { + (implicit spark: SparkSession, dao: MenasDAO, progArgs: ConformanceConfig, featureSwitches: FeatureSwitches): DataFrame = { implicit val interpreterContext: InterpreterContext = InterpreterContext(inputDf.schema, conformance, featureSwitches, jobShortName, spark, dao, progArgs) @@ -76,7 +76,7 @@ object DynamicInterpreter { (implicit ictx: InterpreterContext): DataFrame = { implicit val spark: SparkSession = ictx.spark implicit val dao: MenasDAO = ictx.dao - implicit val progArgs: ConfCmdConfig = ictx.progArgs + implicit val progArgs: ConformanceConfig = ictx.progArgs implicit val udfLib: UDFLibrary = new UDFLibrary implicit val explosionState: ExplosionState = new ExplosionState() @@ -266,7 +266,8 @@ object DynamicInterpreter { val fsUtils = new FileSystemVersionUtils(ictx.spark.sparkContext.hadoopConfiguration) val mappingTableDef = ictx.dao.getMappingTable(rule.mappingTable, rule.mappingTableVersion) - val mappingTablePath = PartitioningUtils.getPartitionedPathName(mappingTableDef.hdfsPath, ictx.progArgs.reportDate) + val mappingTablePath = PartitioningUtils.getPartitionedPathName(mappingTableDef.hdfsPath, + ictx.progArgs.reportDate) val mappingTableSize = fsUtils.getDirectorySizeNoHidden(mappingTablePath) (mappingTableSize / (1024 * 1024)).toInt } diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterContext.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterContext.scala index 99cbd648a..87c0319cf 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterContext.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterContext.scala @@ -17,7 +17,7 @@ package za.co.absa.enceladus.conformance.interpreter import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.StructType -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.{Dataset => ConfDataset} @@ -29,5 +29,5 @@ case class InterpreterContext ( 
jobShortName: String, spark: SparkSession, dao: MenasDAO, - progArgs: ConfCmdConfig + progArgs: ConformanceConfig ) diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayCollapseInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayCollapseInterpreter.scala index ad7c4f066..7d13b05ec 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayCollapseInterpreter.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayCollapseInterpreter.scala @@ -16,7 +16,7 @@ package za.co.absa.enceladus.conformance.interpreter.rules import org.apache.spark.sql.{Dataset, Row, SparkSession} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.ExplosionState import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.ConformanceRule @@ -31,7 +31,7 @@ class ArrayCollapseInterpreter extends RuleInterpreter { override def conformanceRule: Option[ConformanceRule] = None override def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { val dfOut = ExplodeTools.revertAllExplosions(df, explosionState.explodeContext, Some(ErrorMessage.errorColumnName)) explosionState.explodeContext = ExplosionContext() dfOut diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayExplodeInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayExplodeInterpreter.scala index 09edc7453..92356ad19 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayExplodeInterpreter.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ArrayExplodeInterpreter.scala @@ -16,7 +16,7 @@ package za.co.absa.enceladus.conformance.interpreter.rules import org.apache.spark.sql.{Dataset, Row, SparkSession} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.ExplosionState import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.ConformanceRule @@ -29,7 +29,7 @@ class ArrayExplodeInterpreter(columnName: String) extends RuleInterpreter { override def conformanceRule: Option[ConformanceRule] = None override def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { val (dfOut, ctx) = ExplodeTools.explodeAllArraysInPath(columnName, df, explosionState.explodeContext) explosionState.explodeContext = ctx dfOut diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleInterpreter.scala index 79a20a58e..a505f92d5 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleInterpreter.scala +++ 
b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleInterpreter.scala @@ -19,8 +19,8 @@ import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{Dataset, Row, SparkSession} +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.spark.hats.Extensions._ -import za.co.absa.enceladus.conformance.ConfCmdConfig import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.{CastingConformanceRule, ConformanceRule} @@ -35,7 +35,7 @@ case class CastingRuleInterpreter(rule: CastingConformanceRule) extends RuleInte override def conformanceRule: Option[ConformanceRule] = Some(rule) def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { // Validate the rule parameters RuleValidators.validateInputField(progArgs.datasetName, ruleName, df.schema, rule.inputColumn) RuleValidators.validateOutputField(progArgs.datasetName, ruleName, df.schema, rule.outputColumn) diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ConcatenationRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ConcatenationRuleInterpreter.scala index 2cf9a0200..8d966f5f4 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ConcatenationRuleInterpreter.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/ConcatenationRuleInterpreter.scala @@ -18,8 +18,8 @@ package za.co.absa.enceladus.conformance.interpreter.rules import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{Dataset, Row, SparkSession} +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.spark.hats.Extensions._ -import za.co.absa.enceladus.conformance.ConfCmdConfig import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.{ConcatenationConformanceRule, ConformanceRule} @@ -31,7 +31,7 @@ case class ConcatenationRuleInterpreter(rule: ConcatenationConformanceRule) exte override def conformanceRule: Option[ConformanceRule] = Some(rule) def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { // Validate the rule parameters RuleValidators.validateSameParent(progArgs.datasetName, ruleName, rule.inputColumns :+ rule.outputColumn: _*) diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/DropRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/DropRuleInterpreter.scala index e19354359..8600b7af9 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/DropRuleInterpreter.scala +++ 
b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/DropRuleInterpreter.scala @@ -16,8 +16,8 @@ package za.co.absa.enceladus.conformance.interpreter.rules import org.apache.spark.sql.{Dataset, Row, SparkSession} +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.spark.hats.Extensions._ -import za.co.absa.enceladus.conformance.ConfCmdConfig import za.co.absa.enceladus.conformance.interpreter.ExplosionState import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, DropConformanceRule} @@ -28,7 +28,7 @@ case class DropRuleInterpreter(rule: DropConformanceRule) extends RuleInterprete override def conformanceRule: Option[ConformanceRule] = Some(rule) def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { if (SchemaUtils.fieldExists(rule.outputColumn, df.schema)) { if (rule.outputColumn.contains('.')) { conformNestedField(df) diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/LiteralRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/LiteralRuleInterpreter.scala index a48482818..436b4ec58 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/LiteralRuleInterpreter.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/LiteralRuleInterpreter.scala @@ -16,8 +16,8 @@ package za.co.absa.enceladus.conformance.interpreter.rules import org.apache.spark.sql.{Dataset, Row, SparkSession} +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.spark.hats.Extensions._ -import za.co.absa.enceladus.conformance.ConfCmdConfig import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, LiteralConformanceRule} @@ -29,7 +29,7 @@ case class LiteralRuleInterpreter(rule: LiteralConformanceRule) extends RuleInte override def conformanceRule: Option[ConformanceRule] = Some(rule) def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { // Validate the rule parameters RuleValidators.validateOutputField(progArgs.datasetName, ruleName, df.schema, rule.outputColumn) diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreter.scala index 688d4d880..dabdb4d89 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreter.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreter.scala @@ -19,7 +19,7 @@ import org.apache.spark.sql.api.java.UDF1 import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, Dataset, Row, SparkSession} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig 
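// Editor's sketch (not part of this diff): the same import swap and implicit-parameter change
// repeats across every rule interpreter in this package. Assuming conformanceRule and conform are
// the trait's only abstract members, a custom interpreter written against the new API would be:
//
//   import org.apache.spark.sql.{Dataset, Row, SparkSession}
//   import za.co.absa.enceladus.conformance.config.ConformanceConfig
//   import za.co.absa.enceladus.conformance.interpreter.ExplosionState
//   import za.co.absa.enceladus.conformance.interpreter.rules.RuleInterpreter
//   import za.co.absa.enceladus.dao.MenasDAO
//   import za.co.absa.enceladus.model.conformanceRule.ConformanceRule
//
//   class NoOpRuleInterpreter extends RuleInterpreter {
//     override def conformanceRule: Option[ConformanceRule] = None
//     override def conform(df: Dataset[Row])
//                         (implicit spark: SparkSession, explosionState: ExplosionState,
//                          dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = df
//   }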
import za.co.absa.enceladus.conformance.datasource.DataSource import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators} import za.co.absa.enceladus.dao.MenasDAO @@ -40,7 +40,7 @@ case class MappingRuleInterpreter(rule: MappingConformanceRule, conformance: Con override def conformanceRule: Option[ConformanceRule] = Some(rule) def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { log.info(s"Processing mapping rule to conform ${rule.outputColumn}...") import spark.implicits._ diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterBroadcast.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterBroadcast.scala index d42ee8698..3b3712d84 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterBroadcast.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterBroadcast.scala @@ -16,7 +16,7 @@ package za.co.absa.enceladus.conformance.interpreter.rules import org.apache.spark.sql.{Dataset, Row, SparkSession} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.datasource.DataSource import za.co.absa.enceladus.conformance.interpreter.ExplosionState import za.co.absa.enceladus.dao.MenasDAO @@ -31,7 +31,7 @@ case class MappingRuleInterpreterBroadcast(rule: MappingConformanceRule, conform override def conformanceRule: Option[ConformanceRule] = Some(rule) def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { log.info(s"Processing mapping rule to conform ${rule.outputColumn} (broadcast strategy)...") val mappingTableDef = dao.getMappingTable(rule.mappingTable, rule.mappingTableVersion) diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterGroupExplode.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterGroupExplode.scala index 7cf4389c4..8d66c20b2 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterGroupExplode.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/MappingRuleInterpreterGroupExplode.scala @@ -18,7 +18,7 @@ package za.co.absa.enceladus.conformance.interpreter.rules import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.datasource.DataSource import za.co.absa.enceladus.conformance.interpreter.rules.MappingRuleInterpreterGroupExplode._ import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators} @@ -42,7 +42,7 @@ case class MappingRuleInterpreterGroupExplode(rule: MappingConformanceRule, override def conformanceRule: Option[ConformanceRule] = Some(rule) def 
conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { log.info(s"Processing mapping rule (explode-optimized) to conform ${rule.outputColumn}...") val mappingTableDef = dao.getMappingTable(rule.mappingTable, rule.mappingTableVersion) diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleInterpreter.scala index 37adc6b25..95ac3288d 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleInterpreter.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleInterpreter.scala @@ -18,8 +18,8 @@ package za.co.absa.enceladus.conformance.interpreter.rules import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, Dataset, Row, SparkSession} +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.spark.hats.Extensions._ -import za.co.absa.enceladus.conformance.ConfCmdConfig import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, NegationConformanceRule} @@ -34,7 +34,7 @@ case class NegationRuleInterpreter(rule: NegationConformanceRule) extends RuleIn override def conformanceRule: Option[ConformanceRule] = Some(rule) override def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { NegationRuleInterpreter.validateInputField(progArgs.datasetName, df.schema, rule.inputColumn) val field = SchemaUtils.getField(rule.inputColumn, df.schema).get diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/RuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/RuleInterpreter.scala index 02eb840ab..f87744144 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/RuleInterpreter.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/RuleInterpreter.scala @@ -18,7 +18,7 @@ package za.co.absa.enceladus.conformance.interpreter.rules import org.apache.spark.sql.functions._ import org.apache.spark.sql.{Column, Dataset, Row, SparkSession} import org.slf4j.{Logger, LoggerFactory} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.ExplosionState import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.ConformanceRule @@ -45,7 +45,7 @@ trait RuleInterpreter { * @return A conformed DataFrame */ def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] protected val log: Logger = LoggerFactory.getLogger(this.getClass) diff 
--git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SingleColumnRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SingleColumnRuleInterpreter.scala index f44b392dd..4de101522 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SingleColumnRuleInterpreter.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SingleColumnRuleInterpreter.scala @@ -17,8 +17,8 @@ package za.co.absa.enceladus.conformance.interpreter.rules import org.apache.spark.sql.functions._ import org.apache.spark.sql.{Dataset, Row, SparkSession} +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.spark.hats.Extensions._ -import za.co.absa.enceladus.conformance.ConfCmdConfig import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, SingleColumnConformanceRule} @@ -30,7 +30,7 @@ case class SingleColumnRuleInterpreter(rule: SingleColumnConformanceRule) extend override def conformanceRule: Option[ConformanceRule] = Some(rule) def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { // Validate the rule parameters RuleValidators.validateFieldExistence(progArgs.datasetName,ruleName, df.schema, rule.inputColumn) RuleValidators.validateOutputField(progArgs.datasetName, ruleName, df.schema, rule.outputColumn) diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SparkSessionConfRuleInterpreter.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SparkSessionConfRuleInterpreter.scala index 29a0ed8fb..9f8384c48 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SparkSessionConfRuleInterpreter.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/SparkSessionConfRuleInterpreter.scala @@ -16,8 +16,8 @@ package za.co.absa.enceladus.conformance.interpreter.rules import org.apache.spark.sql.{Dataset, Row, SparkSession} +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.spark.hats.Extensions._ -import za.co.absa.enceladus.conformance.ConfCmdConfig import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, SparkSessionConfConformanceRule} @@ -29,7 +29,7 @@ case class SparkSessionConfRuleInterpreter(rule: SparkSessionConfConformanceRule override def conformanceRule: Option[ConformanceRule] = Some(rule) def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { // Validate the rule parameters RuleValidators.validateOutputField(ruleName, progArgs.datasetName, df.schema, rule.outputColumn) diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/UppercaseRuleInterpreter.scala 
b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/UppercaseRuleInterpreter.scala index 8e611aecc..ba39dd598 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/UppercaseRuleInterpreter.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/conformance/interpreter/rules/UppercaseRuleInterpreter.scala @@ -17,8 +17,8 @@ package za.co.absa.enceladus.conformance.interpreter.rules import org.apache.spark.sql._ import org.apache.spark.sql.functions._ +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.spark.hats.Extensions._ -import za.co.absa.enceladus.conformance.ConfCmdConfig import za.co.absa.enceladus.conformance.interpreter.{ExplosionState, RuleValidators} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, UppercaseConformanceRule} @@ -30,7 +30,7 @@ case class UppercaseRuleInterpreter(rule: UppercaseConformanceRule) extends Rule override def conformanceRule: Option[ConformanceRule] = Some(rule) def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { // Validate the rule parameters RuleValidators.validateInputField(progArgs.datasetName, ruleName, df.schema, rule.inputColumn) RuleValidators.validateOutputField(progArgs.datasetName, ruleName, df.schema, rule.outputColumn) diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationExecution.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationExecution.scala new file mode 100644 index 000000000..a39085688 --- /dev/null +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationExecution.scala @@ -0,0 +1,260 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.enceladus.standardization + +import java.io.{PrintWriter, StringWriter} +import java.util.UUID + +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.{Column, DataFrame, SparkSession} +import za.co.absa.atum.AtumImplicits +import za.co.absa.atum.core.Atum +import za.co.absa.enceladus.common.RecordIdGeneration.getRecordIdGenerationStrategyFromConfig +import za.co.absa.enceladus.common.config.{JobConfigParser, PathConfig} +import za.co.absa.enceladus.common.plugin.menas.MenasPlugin +import za.co.absa.enceladus.common.{CommonJobExecution, Constants} +import za.co.absa.enceladus.dao.MenasDAO +import za.co.absa.enceladus.dao.auth.MenasCredentials +import za.co.absa.enceladus.model.Dataset +import za.co.absa.enceladus.standardization.config.{StandardizationParser, StandardizationConfig} +import za.co.absa.enceladus.standardization.interpreter.StandardizationInterpreter +import za.co.absa.enceladus.standardization.interpreter.stages.PlainSchemaGenerator +import za.co.absa.enceladus.utils.fs.FileSystemVersionUtils +import za.co.absa.enceladus.utils.modules.SourcePhase +import za.co.absa.enceladus.utils.performance.PerformanceMetricTools +import za.co.absa.enceladus.utils.schema.{MetadataKeys, SchemaUtils, SparkUtils} +import za.co.absa.enceladus.utils.udf.UDFLibrary +import za.co.absa.enceladus.utils.validation.ValidationException + +import scala.util.control.NonFatal + +trait StandardizationExecution extends CommonJobExecution { + private val sourceId = SourcePhase.Standardization + + protected def prepareStandardization[T](args: Array[String], + menasCredentials: MenasCredentials, + preparationResult: PreparationResult + ) + (implicit dao: MenasDAO, + cmd: StandardizationParser[T], + fsUtils: FileSystemVersionUtils, + spark: SparkSession): StructType = { + + // Enable Control Framework + import za.co.absa.atum.AtumImplicits.SparkSessionWrapper + spark.enableControlMeasuresTracking(s"${preparationResult.pathCfg.inputPath}/_INFO") + .setControlMeasuresWorkflow(sourceId.toString) + + // Enable control framework performance optimization for pipeline-like jobs + Atum.setAllowUnpersistOldDatasets(true) + + // Enable Menas plugin for Control Framework + MenasPlugin.enableMenas( + conf, + cmd.datasetName, + cmd.datasetVersion, + cmd.reportDate, + preparationResult.reportVersion) + + // Add report date and version (aka Enceladus info date and version) to Atum's metadata + Atum.setAdditionalInfo(Constants.InfoDateColumn -> cmd.reportDate) + Atum.setAdditionalInfo(Constants.InfoVersionColumn -> preparationResult.reportVersion.toString) + + // Add the raw format of the input file(s) to Atum's metadata + Atum.setAdditionalInfo("raw_format" -> cmd.rawFormat) + + PerformanceMetricTools.addJobInfoToAtumMetadata("std", preparationResult.pathCfg.inputPath, preparationResult.pathCfg.outputPath, + menasCredentials.username, args.mkString(" ")) + + dao.getSchema(preparationResult.dataset.schemaName, preparationResult.dataset.schemaVersion) + } + + protected def readStandardizationInputData(schema: StructType, + cmd: StandardizationConfig, + path: String, + dataset: Dataset) + (implicit spark: SparkSession, + fsUtils: FileSystemVersionUtils, + dao: MenasDAO): DataFrame = { + val numberOfColumns = schema.fields.length + val standardizationReader = new StandardizationPropertiesProvider() + val dfReaderConfigured = standardizationReader.getFormatSpecificReader(cmd, dataset, numberOfColumns) + val readerWithOptSchema = cmd.rawFormat.toLowerCase() match { + 
case "parquet" | "cobol" => dfReaderConfigured + case _ => + val optColumnNameOfCorruptRecord = getColumnNameOfCorruptRecord(schema, cmd) + val inputSchema = PlainSchemaGenerator.generateInputSchema(schema, optColumnNameOfCorruptRecord) + dfReaderConfigured.schema(inputSchema) + } + val dfWithSchema = readerWithOptSchema.load(s"$path/*") + + ensureSplittable(dfWithSchema, path, schema) + } + + + private def getColumnNameOfCorruptRecord[R](schema: StructType, cmd: StandardizationParser[R]) + (implicit spark: SparkSession): Option[String] = { + // SparkUtils.setUniqueColumnNameOfCorruptRecord is called even if result is not used to avoid conflict + val columnNameOfCorruptRecord = SparkUtils.setUniqueColumnNameOfCorruptRecord(spark, schema) + if (cmd.rawFormat.equalsIgnoreCase("fixed-width") || cmd.failOnInputNotPerSchema) { + None + } else { + Option(columnNameOfCorruptRecord) + } + } + + protected def standardize(inputData: DataFrame, schema: StructType, cmd: StandardizationConfig) + (implicit spark: SparkSession, udfLib: UDFLibrary): DataFrame = { + //scalastyle:on parameter.number + val recordIdGenerationStrategy = getRecordIdGenerationStrategyFromConfig(conf) + + try { + handleControlInfoValidation() + StandardizationInterpreter.standardize(inputData, schema, cmd.rawFormat, + cmd.failOnInputNotPerSchema, recordIdGenerationStrategy) + } catch { + case e@ValidationException(msg, errors) => + val errorDescription = s"$msg\nDetails: ${errors.mkString("\n")}" + AtumImplicits.SparkSessionWrapper(spark) + .setControlMeasurementError("Schema Validation", errorDescription, "") + throw e + case NonFatal(e) if !e.isInstanceOf[ValidationException] => + val sw = new StringWriter + e.printStackTrace(new PrintWriter(sw)) + AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError(sourceId.toString, e.getMessage, sw.toString) + throw e + } + } + + protected def processStandardizationResult(args: Array[String], + standardizedDF: DataFrame, + preparationResult: PreparationResult, + schema: StructType, + cmd: StandardizationConfig, + menasCredentials: MenasCredentials) + (implicit spark: SparkSession, + fsUtils: FileSystemVersionUtils): Unit = { + import za.co.absa.atum.AtumImplicits._ + val fieldRenames = SchemaUtils.getRenamesInSchema(schema) + fieldRenames.foreach { + case (destinationName, sourceName) => standardizedDF.registerColumnRename(sourceName, destinationName) + } + + standardizedDF.setCheckpoint(s"$sourceId - End", persistInDatabase = false) + + val recordCount = standardizedDF.lastCheckpointRowCount match { + case None => standardizedDF.count + case Some(p) => p + } + if (recordCount == 0) { + handleEmptyOutput(sourceId) + } + + standardizedDF.write.parquet(preparationResult.pathCfg.outputPath) + // Store performance metrics + // (record count, directory sizes, elapsed time, etc. 
to _INFO file metadata and performance file) + val stdDirSize = fsUtils.getDirectorySize(preparationResult.pathCfg.outputPath) + preparationResult.performance.finishMeasurement(stdDirSize, recordCount) + PerformanceMetricTools.addPerformanceMetricsToAtumMetadata( + spark, + "std", + preparationResult.pathCfg.inputPath, + preparationResult.pathCfg.outputPath, + menasCredentials.username, + args.mkString(" ") + ) + + cmd.rowTag.foreach(rowTag => Atum.setAdditionalInfo("xml_row_tag" -> rowTag)) + cmd.csvDelimiter.foreach(delimiter => Atum.setAdditionalInfo("csv_delimiter" -> delimiter)) + + standardizedDF.writeInfoFile(preparationResult.pathCfg.outputPath) + writePerformanceMetrics(preparationResult.performance, cmd) + log.info(s"$sourceId finished successfully") + } + //scalastyle:off parameter.number + + override protected def getPathCfg[T](cmd: JobConfigParser[T], dataset: Dataset, reportVersion: Int): PathConfig = { + val stdCmd = cmd.asInstanceOf[StandardizationParser[T]] + PathConfig( + inputPath = buildRawPath(stdCmd, dataset, reportVersion), + outputPath = getStandardizationPath(cmd, reportVersion) + ) + } + + def buildRawPath[T](cmd: StandardizationParser[T], dataset: Dataset, reportVersion: Int): String = { + val dateTokens = cmd.reportDate.split("-") + cmd.rawPathOverride match { + case None => + val folderSuffix = s"/${dateTokens(0)}/${dateTokens(1)}/${dateTokens(2)}/v$reportVersion" + cmd.folderPrefix match { + case None => s"${dataset.hdfsPath}$folderSuffix" + case Some(folderPrefix) => s"${dataset.hdfsPath}/$folderPrefix$folderSuffix" + } + case Some(rawPathOverride) => rawPathOverride + } + } + + private def ensureSplittable(df: DataFrame, path: String, schema: StructType) + (implicit spark: SparkSession, fsUtils: FileSystemVersionUtils) = { + if (fsUtils.isNonSplittable(path)) { + convertToSplittable(df, schema) + } else { + df + } + } + + private def convertToSplittable(df: DataFrame, schema: StructType) + (implicit spark: SparkSession, fsUtils: FileSystemVersionUtils) = { + log.warn("Dataset is stored in a non-splittable format. 
This can have a severe performance impact.") + + val tempParquetDir = s"/tmp/nonsplittable-to-parquet-${UUID.randomUUID()}" + log.warn(s"Converting to Parquet in temporary dir: $tempParquetDir") + + // Handle renaming of source columns in case there are columns + // that will break because of issues in column names like spaces + df.select(schema.fields.map { field: StructField => + renameSourceColumn(df, field) + }: _*).write.parquet(tempParquetDir) + + fsUtils.deleteOnExit(tempParquetDir) + // Reload from temp parquet and reverse column renaming above + val dfTmp = spark.read.parquet(tempParquetDir) + dfTmp.select(schema.fields.map { field: StructField => + reverseRenameSourceColumn(dfTmp, field) + }: _*) + } + + private def renameSourceColumn(df: DataFrame, field: StructField): Column = { + if (field.metadata.contains(MetadataKeys.SourceColumn)) { + val sourceColumnName = field.metadata.getString(MetadataKeys.SourceColumn) + log.info(s"schema field : ${field.name} : rename : $sourceColumnName") + df.col(sourceColumnName).as(field.name, field.metadata) + } else { + df.col(field.name) + } + } + + private def reverseRenameSourceColumn(df: DataFrame, field: StructField): Column = { + if (field.metadata.contains(MetadataKeys.SourceColumn)) { + val sourceColumnName = field.metadata.getString(MetadataKeys.SourceColumn) + log.info(s"schema field : $sourceColumnName : reverse rename : ${field.name}") + df.col(field.name).as(sourceColumnName) + } else { + df.col(field.name) + } + } + +} diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationJob.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationJob.scala index 02ce0de5b..0fbec4d61 100644 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationJob.scala +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationJob.scala @@ -15,527 +15,38 @@ package za.co.absa.enceladus.standardization -import java.io.{PrintWriter, StringWriter} -import java.text.MessageFormat -import java.time.Instant -import java.util.UUID - -import com.typesafe.config.ConfigFactory -import org.apache.spark.SPARK_VERSION -import org.apache.spark.sql.types.{StructField, StructType} -import org.apache.spark.sql.{Column, DataFrame, DataFrameReader, SparkSession} -import org.slf4j.LoggerFactory -import za.co.absa.atum.AtumImplicits -import za.co.absa.atum.core.Atum -import za.co.absa.enceladus.common.RecordIdGeneration.{IdType, _} -import za.co.absa.enceladus.common._ -import za.co.absa.enceladus.common.plugin.PostProcessingService -import za.co.absa.enceladus.common.plugin.menas.{MenasPlugin, MenasRunUrl} -import za.co.absa.enceladus.common.version.SparkVersionGuard +import org.apache.spark.sql.SparkSession import za.co.absa.enceladus.dao.MenasDAO -import za.co.absa.enceladus.dao.auth.MenasCredentials -import za.co.absa.enceladus.dao.rest.{MenasConnectionStringParser, RestDaoFactory} -import za.co.absa.enceladus.model.Dataset -import za.co.absa.enceladus.standardization.interpreter.StandardizationInterpreter -import za.co.absa.enceladus.standardization.interpreter.stages.PlainSchemaGenerator -import za.co.absa.enceladus.utils.config.{ConfigReader, SecureConfig} +import za.co.absa.enceladus.dao.rest.RestDaoFactory +import za.co.absa.enceladus.standardization.config.StandardizationConfig import za.co.absa.enceladus.utils.fs.FileSystemVersionUtils -import za.co.absa.enceladus.utils.general.ProjectMetadataTools -import 
za.co.absa.enceladus.utils.performance.{PerformanceMeasurer, PerformanceMetricTools} -import za.co.absa.enceladus.utils.schema.{MetadataKeys, SchemaUtils, SparkUtils} -import za.co.absa.enceladus.utils.time.TimeZoneNormalizer +import za.co.absa.enceladus.utils.modules.SourcePhase import za.co.absa.enceladus.utils.udf.UDFLibrary -import za.co.absa.enceladus.utils.unicode.ParameterConversion._ -import za.co.absa.enceladus.utils.validation.ValidationException - -import scala.collection.immutable.HashMap -import scala.util.control.NonFatal -import scala.util.{Failure, Success, Try} -object StandardizationJob { - TimeZoneNormalizer.normalizeJVMTimeZone() - - private val log = LoggerFactory.getLogger(this.getClass) - private val conf = ConfigFactory.load() - private val confReader: ConfigReader = new ConfigReader(conf) - private val menasBaseUrls = MenasConnectionStringParser.parse(conf.getString("menas.rest.uri")) - private final val SparkCSVReaderMaxColumnsDefault: Int = 20480 +object StandardizationJob extends StandardizationExecution { def main(args: Array[String]) { - // This should be the first thing the app does to make secure Kafka work with our CA. - // After Spring activates JavaX, it will be too late. - SecureConfig.setSecureKafkaProperties(conf) - - SparkVersionGuard.fromDefaultSparkCompatibilitySettings.ensureSparkVersionCompatibility(SPARK_VERSION) + initialValidation() - confReader.logEffectiveConfigProps(Constants.ConfigKeysToRedact) - - implicit val cmd: StdCmdConfig = StdCmdConfig.getCmdLineArguments(args) + implicit val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(args) implicit val spark: SparkSession = obtainSparkSession() implicit val fsUtils: FileSystemVersionUtils = new FileSystemVersionUtils(spark.sparkContext.hadoopConfiguration) implicit val udfLib: UDFLibrary = new UDFLibrary val menasCredentials = cmd.menasCredentialsFactory.getInstance() implicit val dao: MenasDAO = RestDaoFactory.getInstance(menasCredentials, menasBaseUrls) - dao.authenticate() - - val dataset = dao.getDataset(cmd.datasetName, cmd.datasetVersion) - val schema: StructType = dao.getSchema(dataset.schemaName, dataset.schemaVersion) - val reportVersion = getReportVersion(cmd, dataset) - val pathCfg = getPathCfg(cmd, dataset, reportVersion) - val recordIdGenerationStrategy = getRecordIdGenerationStrategyFromConfig(conf) - - log.info(s"input path: ${pathCfg.inputPath}") - log.info(s"output path: ${pathCfg.outputPath}") - // die if the output path exists - if (fsUtils.hdfsExists(pathCfg.outputPath)) { - throw new IllegalStateException( - s"Path ${pathCfg.outputPath} already exists. 
Increment the run version, or delete ${pathCfg.outputPath}" - ) - } - - // Enable Spline - import za.co.absa.spline.core.SparkLineageInitializer._ - spark.enableLineageTracking() - - // Enable Control Framework - enableControlFramework(pathCfg, cmd, reportVersion) - - // init performance measurer - val performance = new PerformanceMeasurer(spark.sparkContext.appName) - val dfAll: DataFrame = prepareDataFrame(schema, cmd, pathCfg.inputPath, dataset) - - try { - executeStandardization(performance, dfAll, schema, cmd, menasCredentials, pathCfg, recordIdGenerationStrategy) - cmd.performanceMetricsFile.foreach(this.writePerformanceMetrics(performance, _)) - log.info("Standardization finished successfully") - - // read written data from parquet directly - val standardizedDf = spark.read.parquet(pathCfg.outputPath) - val postProcessingService = getPostProcessingService(cmd, pathCfg, dataset, MenasPlugin.runNumber, Atum.getControlMeasure.runUniqueId) - postProcessingService.onSaveOutput(standardizedDf) // all enabled postProcessors will be run with the std df - } finally { - postStandardizationSteps(cmd) - } - } - - private def getPostProcessingService(cmd: StdCmdConfig, pathCfg: PathCfg, dataset: Dataset, - runNumber: Option[Int], uniqueRunId: Option[String] - )(implicit fsUtils: FileSystemVersionUtils): PostProcessingService = { - val runId = MenasPlugin.runNumber - - if (runId.isEmpty) { - log.warn("No run number found, the Run URL cannot be properly reported!") - } - - // reporting the UI url(s) - if more than one, its comma-separated - val runUrl: Option[String] = runId.map { runNumber => - menasBaseUrls.map { menasBaseUrl => - MenasRunUrl.getMenasUiRunUrl(menasBaseUrl, dataset.name, dataset.version, runNumber) - }.mkString(",") - } - - PostProcessingService.forStandardization(conf, dataset.name, dataset.version, cmd.reportDate, - getReportVersion(cmd, dataset), pathCfg.outputPath, Atum.getControlMeasure.metadata.sourceApplication, runUrl, - runId, uniqueRunId, Instant.now) - } - - private def getReportVersion(cmd: StdCmdConfig, dataset: Dataset)(implicit fsUtils: FileSystemVersionUtils): Int = { - cmd.reportVersion match { - case Some(version) => version - case None => - val newVersion = fsUtils.getLatestVersion(dataset.hdfsPublishPath, cmd.reportDate) + 1 - log.warn(s"Report version not provided, inferred report version: $newVersion") - log.warn("This is an EXPERIMENTAL feature.") - log.warn(" -> It can lead to issues when running multiple jobs on a dataset concurrently.") - log.warn(" -> It may not work as desired when there are gaps in the versions of the data being landed.") - newVersion - } - } - - private def getPathCfg(cmd: StdCmdConfig, dataset: Dataset, reportVersion: Int): PathCfg = { - val dateTokens = cmd.reportDate.split("-") - PathCfg( - inputPath = buildRawPath(cmd, dataset, dateTokens, reportVersion), - outputPath = MessageFormat.format(conf.getString("standardized.hdfs.path"), - cmd.datasetName, - cmd.datasetVersion.toString, - cmd.reportDate, - reportVersion.toString) - ) - } - - private def obtainSparkSession()(implicit cmd: StdCmdConfig): SparkSession = { - val enceladusVersion = ProjectMetadataTools.getEnceladusVersion - log.info(s"Enceladus version $enceladusVersion") - val reportVersion = cmd.reportVersion.map(_.toString).getOrElse("") - val spark = SparkSession.builder() - .appName(s"Standardisation $enceladusVersion ${cmd.datasetName} ${cmd.datasetVersion} ${cmd.reportDate} $reportVersion") - .getOrCreate() - TimeZoneNormalizer.normalizeSessionTimeZone(spark) - spark 
- } - - /** - * Returns a Spark reader with all format-specific options applied. - * Options are provided by command line parameters. - * - * @param cmd Command line parameters containing format-specific options - * @param dataset A dataset definition - * @param numberOfColumns (Optional) number of columns, enables reading CSV files with the number of columns - * larger than Spark default - * @return The updated dataframe reader - */ - def getFormatSpecificReader(cmd: StdCmdConfig, dataset: Dataset, numberOfColumns: Int = 0) - (implicit spark: SparkSession, dao: MenasDAO): DataFrameReader = { - val dfReader = spark.read.format(cmd.rawFormat) - // applying format specific options - val options = getCobolOptions(cmd, dataset) ++ - getGenericOptions(cmd) ++ - getXmlOptions(cmd) ++ - getCsvOptions(cmd, numberOfColumns) ++ - getFixedWidthOptions(cmd) - - // Applying all the options - options.foldLeft(dfReader) { (df, optionPair) => - optionPair match { - case (key, Some(value)) => - value match { - // Handle all .option() overloads - case StringParameter(s) => df.option(key, s) - case BooleanParameter(b) => df.option(key, b) - case LongParameter(l) => df.option(key, l) - case DoubleParameter(d) => df.option(key, d) - } - case (_, None) => df - } - } - } - - private def getGenericOptions(cmd: StdCmdConfig): HashMap[String, Option[RawFormatParameter]] = { - val mode = if (cmd.failOnInputNotPerSchema) { - "FAILFAST" - } else { - "PERMISSIVE" - } - HashMap( - "charset" -> cmd.charset.map(StringParameter), - "mode" -> Option(StringParameter(mode)) - ) - } - - private def getXmlOptions(cmd: StdCmdConfig): HashMap[String, Option[RawFormatParameter]] = { - if (cmd.rawFormat.equalsIgnoreCase("xml")) { - HashMap("rowtag" -> cmd.rowTag.map(StringParameter)) - } else { - HashMap() - } - } - - private def getCsvOptions(cmd: StdCmdConfig, numberOfColumns: Int = 0): HashMap[String, Option[RawFormatParameter]] = { - if (cmd.rawFormat.equalsIgnoreCase("csv")) { - HashMap( - "delimiter" -> cmd.csvDelimiter.map(s => StringParameter(s.includingUnicode.includingNone)), - "header" -> cmd.csvHeader.map(BooleanParameter), - "quote" -> cmd.csvQuote.map(s => StringParameter(s.includingUnicode.includingNone)), - "escape" -> cmd.csvEscape.map(s => StringParameter(s.includingUnicode.includingNone)), - // increase the default limit on the number of columns if needed - // default is set at org.apache.spark.sql.execution.datasources.csv.CSVOptions maxColumns - "maxColumns" -> {if (numberOfColumns > SparkCSVReaderMaxColumnsDefault) Some(LongParameter(numberOfColumns)) else None} - ) - } else { - HashMap() - } - } - - private def getFixedWidthOptions(cmd: StdCmdConfig): HashMap[String, Option[RawFormatParameter]] = { - if (cmd.rawFormat.equalsIgnoreCase("fixed-width")) { - HashMap("trimValues" -> cmd.fixedWidthTrimValues.map(BooleanParameter)) - } else { - HashMap() - } - } - - private def getCobolOptions(cmd: StdCmdConfig, dataset: Dataset)(implicit dao: MenasDAO): HashMap[String, Option[RawFormatParameter]] = { - if (cmd.rawFormat.equalsIgnoreCase("cobol")) { - val cobolOptions = cmd.cobolOptions.getOrElse(CobolOptions()) - val isXcomOpt = if (cobolOptions.isXcom) Some(true) else None - val isTextOpt = if (cobolOptions.isText) Some(true) else None - val isAscii = cobolOptions.encoding.exists(_.equalsIgnoreCase("ascii")) - // For ASCII files --charset is converted into Cobrix "ascii_charset" option - // For EBCDIC files --charset is converted into Cobrix "ebcdic_code_page" option - HashMap( - getCopybookOption(cobolOptions, 
dataset), - "is_xcom" -> isXcomOpt.map(BooleanParameter), - "is_text" -> isTextOpt.map(BooleanParameter), - "string_trimming_policy" -> cobolOptions.trimmingPolicy.map(StringParameter), - "encoding" -> cobolOptions.encoding.map(StringParameter), - "ascii_charset" -> cmd.charset.flatMap(charset => if (isAscii) Option(StringParameter(charset)) else None), - "ebcdic_code_page" -> cmd.charset.flatMap(charset => if (!isAscii) Option(StringParameter(charset)) else None), - "schema_retention_policy" -> Some(StringParameter("collapse_root")) - ) - } else { - HashMap() - } - } - - private def getCopybookOption(opts: CobolOptions, dataset: Dataset)(implicit dao: MenasDAO): (String, Option[RawFormatParameter]) = { - val copybook = opts.copybook - if (copybook.isEmpty) { - log.info("Copybook location is not provided via command line - fetching the copybook attached to the schema...") - val copybookContents = dao.getSchemaAttachment(dataset.schemaName, dataset.schemaVersion) - log.info(s"Applying the following copybook:\n$copybookContents") - ("copybook_contents", Option(StringParameter(copybookContents))) - } else { - log.info(s"Use copybook at $copybook") - ("copybook", Option(StringParameter(copybook))) - } - } - - private def prepareDataFrame(schema: StructType, - cmd: StdCmdConfig, - path: String, - dataset: Dataset) - (implicit spark: SparkSession, - fsUtils: FileSystemVersionUtils, - dao: MenasDAO): DataFrame = { - val numberOfColumns = schema.fields.length - val dfReaderConfigured = getFormatSpecificReader(cmd, dataset, numberOfColumns) - - val readerWithOptSchema = cmd.rawFormat.toLowerCase() match { - case "parquet" | "cobol" => - dfReaderConfigured - case _ => - val optColumnNameOfCorruptRecord = getColumnNameOfCorruptRecord(schema, cmd) - val inputSchema = PlainSchemaGenerator.generateInputSchema(schema, optColumnNameOfCorruptRecord) - dfReaderConfigured.schema(inputSchema) - } - - val dfWithSchema = readerWithOptSchema.load(s"$path/*") - ensureSplittable(dfWithSchema, path, schema) - } - - private def getColumnNameOfCorruptRecord(schema: StructType, cmd: StdCmdConfig) - (implicit spark: SparkSession): Option[String] = { - // SparkUtils.setUniqueColumnNameOfCorruptRecord is called even if result is not used to avoid conflict - val columnNameOfCorruptRecord = SparkUtils.setUniqueColumnNameOfCorruptRecord(spark, schema) - if (cmd.rawFormat.equalsIgnoreCase("fixed-width") || cmd.failOnInputNotPerSchema) { - None - } else { - Option(columnNameOfCorruptRecord) - } - } - - //scalastyle:off parameter.number - private def executeStandardization(performance: PerformanceMeasurer, - dfAll: DataFrame, - schema: StructType, - cmd: StdCmdConfig, - menasCredentials: MenasCredentials, - pathCfg: PathCfg, - recordIdGenerationStrategy: IdType) - (implicit spark: SparkSession, udfLib: UDFLibrary, fsUtils: FileSystemVersionUtils): Unit = { - //scalastyle:on parameter.number - val rawDirSize: Long = fsUtils.getDirectorySize(pathCfg.inputPath) - performance.startMeasurement(rawDirSize) - - handleControlInfoValidation() - - PerformanceMetricTools.addJobInfoToAtumMetadata("std", pathCfg.inputPath, pathCfg.outputPath, - menasCredentials.username, cmd.cmdLineArgs.mkString(" ")) - val standardizedDF = try { - StandardizationInterpreter.standardize(dfAll, schema, cmd.rawFormat, cmd.failOnInputNotPerSchema, recordIdGenerationStrategy) - } catch { - case e@ValidationException(msg, errors) => - AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError("Schema Validation", s"$msg\nDetails: ${ - 
errors.mkString("\n") - }", "") - throw e - case NonFatal(e) if !e.isInstanceOf[ValidationException] => - val sw = new StringWriter - e.printStackTrace(new PrintWriter(sw)) - AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError("Standardization", e.getMessage, sw.toString) - throw e - } - - //register renames with ATUM - import za.co.absa.atum.AtumImplicits._ - val fieldRenames = SchemaUtils.getRenamesInSchema(schema) - fieldRenames.foreach { - case (destinationName, sourceName) => standardizedDF.registerColumnRename(sourceName, destinationName) - } - - standardizedDF.setCheckpoint("Standardization - End", persistInDatabase = false) - - val recordCount = standardizedDF.lastCheckpointRowCount match { - case None => standardizedDF.count - case Some(p) => p - } - if (recordCount == 0) { handleEmptyOutputAfterStandardization() } - - standardizedDF.write.parquet(pathCfg.outputPath) - // Store performance metrics - // (record count, directory sizes, elapsed time, etc. to _INFO file metadata and performance file) - val stdDirSize = fsUtils.getDirectorySize(pathCfg.outputPath) - performance.finishMeasurement(stdDirSize, recordCount) - cmd.rowTag.foreach(rowTag => Atum.setAdditionalInfo("xml_row_tag" -> rowTag)) - if (cmd.csvDelimiter.isDefined) { - cmd.csvDelimiter.foreach(delimiter => Atum.setAdditionalInfo("csv_delimiter" -> delimiter)) - } - PerformanceMetricTools.addPerformanceMetricsToAtumMetadata(spark, "std", pathCfg.inputPath, pathCfg.outputPath, - menasCredentials.username, cmd.cmdLineArgs.mkString(" ")) - standardizedDF.writeInfoFile(pathCfg.outputPath) - } - - private def handleControlInfoValidation(): Unit = { - ControlInfoValidation.addRawAndSourceRecordCountsToMetadata() match { - case Failure(ex: za.co.absa.enceladus.utils.validation.ValidationException) => { - val confEntry = "control.info.validation" - conf.getString(confEntry) match { - case "strict" => throw ex - case "warning" => log.warn(ex.msg) - case "none" => - case _ => throw new RuntimeException(s"Invalid $confEntry value") - } - } - case Failure(ex) => throw ex - case Success(_) => - } - } + val preparationResult = prepareJob() + val schema = prepareStandardization(args, menasCredentials, preparationResult) + val inputData = readStandardizationInputData(schema, cmd, preparationResult.pathCfg.inputPath, preparationResult.dataset) - private def handleEmptyOutputAfterStandardization()(implicit spark: SparkSession): Unit = { - import za.co.absa.atum.core.Constants._ - - val areCountMeasurementsAllZero = Atum.getControlMeasure.checkpoints - .flatMap(checkpoint => - checkpoint.controls.filter(control => - control.controlName.equalsIgnoreCase(controlTypeRecordCount))) - .forall(m => Try(m.controlValue.toString.toDouble).toOption.contains(0D)) - - if (areCountMeasurementsAllZero) { - log.warn("Empty output after running Standardization. 
Previous checkpoints show this is correct.") - } else { - val errMsg = "Empty output after running Standardization, while previous checkpoints show non zero record count" - AtumImplicits.SparkSessionWrapper(spark).setControlMeasurementError("Standardization", errMsg, "") - throw new IllegalStateException(errMsg) - } - } - - private def ensureSplittable(df: DataFrame, path: String, schema: StructType) - (implicit spark: SparkSession, fsUtils: FileSystemVersionUtils) = { - if (fsUtils.isNonSplittable(path)) { - convertToSplittable(df, path, schema) - } else { - df - } - } - - private def convertToSplittable(df: DataFrame, path: String, schema: StructType) - (implicit spark: SparkSession, fsUtils: FileSystemVersionUtils) = { - log.warn("Dataset is stored in a non-splittable format. This can have a severe performance impact.") - - val tempParquetDir = s"/tmp/nonsplittable-to-parquet-${UUID.randomUUID()}" - log.warn(s"Converting to Parquet in temporary dir: $tempParquetDir") - - // Handle renaming of source columns in case there are columns - // that will break because of issues in column names like spaces - df.select(schema.fields.map { field: StructField => - renameSourceColumn(df, field) - }: _*).write.parquet(tempParquetDir) - - fsUtils.deleteOnExit(tempParquetDir) - // Reload from temp parquet and reverse column renaming above - val dfTmp = spark.read.parquet(tempParquetDir) - dfTmp.select(schema.fields.map { field: StructField => - reverseRenameSourceColumn(dfTmp, field) - }: _*) - } - - private def renameSourceColumn(df: DataFrame, field: StructField): Column = { - if (field.metadata.contains(MetadataKeys.SourceColumn)) { - val sourceColumnName = field.metadata.getString(MetadataKeys.SourceColumn) - log.info(s"schema field : ${field.name} : rename : $sourceColumnName") - df.col(sourceColumnName).as(field.name, field.metadata) - } else { - df.col(field.name) - } - } - - private def reverseRenameSourceColumn(df: DataFrame, field: StructField): Column = { - if (field.metadata.contains(MetadataKeys.SourceColumn)) { - val sourceColumnName = field.metadata.getString(MetadataKeys.SourceColumn) - log.info(s"schema field : $sourceColumnName : reverse rename : ${field.name}") - df.col(field.name).as(sourceColumnName) - } else { - df.col(field.name) - } - } - - private def enableControlFramework(pathCfg: PathCfg, cmd: StdCmdConfig, reportVersion: Int) - (implicit spark: SparkSession, dao: MenasDAO): Unit = { - // Enable Control Framework - import za.co.absa.atum.AtumImplicits.SparkSessionWrapper - spark.enableControlMeasuresTracking(s"${pathCfg.inputPath}/_INFO").setControlMeasuresWorkflow("Standardization") - - // Enable control framework performance optimization for pipeline-like jobs - Atum.setAllowUnpersistOldDatasets(true) - - // Enable non-default persistence storage level if provided in the command line - cmd.persistStorageLevel.foreach(Atum.setCachingStorageLevel) - - // Enable Menas plugin for Control Framework - MenasPlugin.enableMenas( - conf, - cmd.datasetName, - cmd.datasetVersion, - cmd.reportDate, - reportVersion, - isJobStageOnly = true, - generateNewRun = true) - - // Add report date and version (aka Enceladus info date and version) to Atum's metadata - Atum.setAdditionalInfo(Constants.InfoDateColumn -> cmd.reportDate) - Atum.setAdditionalInfo(Constants.InfoVersionColumn -> reportVersion.toString) - - // Add the raw format of the input file(s) to Atum's metadta as well - Atum.setAdditionalInfo("raw_format" -> cmd.rawFormat) - } - - private def writePerformanceMetrics(performance: 
PerformanceMeasurer, fileName: String): Unit = { try { - performance.writeMetricsToFile(fileName) - } catch { - case NonFatal(e) => log.error(s"Unable to write performance metrics to file '$fileName': ${e.getMessage}") - } - } + val result = standardize(inputData, schema, cmd) - private def postStandardizationSteps(cmd: StdCmdConfig): Unit = { - Atum.getControlMeasure.runUniqueId + processStandardizationResult(args, result, preparationResult, schema, cmd, menasCredentials) - val name = cmd.datasetName - val version = cmd.datasetVersion - MenasPlugin.runNumber.foreach { runNumber => - menasBaseUrls.foreach { menasBaseUrl => - val apiUrl = MenasRunUrl.getMenasApiRunUrl(menasBaseUrl, name, version, runNumber) - val uiUrl = MenasRunUrl.getMenasUiRunUrl(menasBaseUrl, name, version, runNumber) - - log.info(s"Menas API Run URL: $apiUrl") - log.info(s"Menas UI Run URL: $uiUrl") - } - } - } - - def buildRawPath(cmd: StdCmdConfig, dataset: Dataset, dateTokens: Array[String], reportVersion: Int): String = { - cmd.rawPathOverride match { - case None => - val folderSuffix = s"/${dateTokens(0)}/${dateTokens(1)}/${dateTokens(2)}/v$reportVersion" - cmd.folderPrefix match { - case None => s"${dataset.hdfsPath}$folderSuffix" - case Some(folderPrefix) => s"${dataset.hdfsPath}/$folderPrefix$folderSuffix" - } - case Some(rawPathOverride) => rawPathOverride + runPostProcessing(SourcePhase.Standardization, preparationResult, cmd) + } finally { + finishJob(cmd) } } - - private final case class PathCfg(inputPath: String, outputPath: String) } diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationPropertiesProvider.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationPropertiesProvider.scala new file mode 100644 index 000000000..832cfda1d --- /dev/null +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StandardizationPropertiesProvider.scala @@ -0,0 +1,153 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.enceladus.standardization + +import org.apache.spark.sql.{DataFrameReader, SparkSession} +import org.slf4j.{Logger, LoggerFactory} +import za.co.absa.enceladus.common._ +import za.co.absa.enceladus.dao.MenasDAO +import za.co.absa.enceladus.model.Dataset +import za.co.absa.enceladus.standardization.config.StandardizationConfig +import za.co.absa.enceladus.utils.unicode.ParameterConversion._ + +import scala.collection.immutable.HashMap + +/** + * Reads standardization properties from the configuration file + */ +class StandardizationPropertiesProvider { + private val log: Logger = LoggerFactory.getLogger(this.getClass) + private final val SparkCSVReaderMaxColumnsDefault: Int = 20480 + + /** + * Returns a Spark reader with all format-specific options applied. + * Options are provided by command line parameters. 
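// Illustrative usage sketch (hedged) of the reader factory documented here, mirroring the call
// sequence used by readStandardizationInputData in StandardizationExecution. The helper name
// loadRawCsv, the option values, the numberOfColumns value and the raw path are hypothetical;
// `dataset`, `spark` and `dao` are assumed to be available to the caller.
import org.apache.spark.sql.{DataFrame, SparkSession}
import za.co.absa.enceladus.dao.MenasDAO
import za.co.absa.enceladus.model.Dataset
import za.co.absa.enceladus.standardization.StandardizationPropertiesProvider
import za.co.absa.enceladus.standardization.config.StandardizationConfig

def loadRawCsv(dataset: Dataset, rawPath: String)
              (implicit spark: SparkSession, dao: MenasDAO): DataFrame = {
  // Hypothetical CSV-flavoured configuration; only the format-specific fields matter here
  val cmd = StandardizationConfig(
    rawFormat = "csv",
    csvDelimiter = Some("|"),
    csvHeader = Some(true)
  )
  // Build the format-specific reader, then load "<path>/*" as readStandardizationInputData does
  new StandardizationPropertiesProvider()
    .getFormatSpecificReader(cmd, dataset, numberOfColumns = 250)
    .load(s"$rawPath/*")
}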
+ * + * @param cmd Command line parameters containing format-specific options + * @param dataset A dataset definition + * @param numberOfColumns (Optional) number of columns, enables reading CSV files with the number of columns + * larger than Spark default + * @return The updated dataframe reader + */ + def getFormatSpecificReader(cmd: StandardizationConfig, dataset: Dataset, numberOfColumns: Int = 0) + (implicit spark: SparkSession, dao: MenasDAO): DataFrameReader = { + val dfReader = spark.read.format(cmd.rawFormat) + // applying format specific options + val options = getCobolOptions(cmd, dataset) ++ + getGenericOptions(cmd) ++ + getXmlOptions(cmd) ++ + getCsvOptions(cmd, numberOfColumns) ++ + getFixedWidthOptions(cmd) + + // Applying all the options + options.foldLeft(dfReader) { (df, optionPair) => + optionPair match { + case (key, Some(value)) => + value match { + // Handle all .option() overloads + case StringParameter(s) => df.option(key, s) + case BooleanParameter(b) => df.option(key, b) + case LongParameter(l) => df.option(key, l) + case DoubleParameter(d) => df.option(key, d) + } + case (_, None) => df + } + } + } + + private def getGenericOptions(cmd: StandardizationConfig): HashMap[String, Option[RawFormatParameter]] = { + val mode = if (cmd.failOnInputNotPerSchema) { + "FAILFAST" + } else { + "PERMISSIVE" + } + HashMap( + "charset" -> cmd.charset.map(StringParameter), + "mode" -> Option(StringParameter(mode)) + ) + } + + private def getXmlOptions(cmd: StandardizationConfig): HashMap[String, Option[RawFormatParameter]] = { + if (cmd.rawFormat.equalsIgnoreCase("xml")) { + HashMap("rowtag" -> cmd.rowTag.map(StringParameter)) + } else { + HashMap() + } + } + + private def getCsvOptions(cmd: StandardizationConfig, numberOfColumns: Int = 0): HashMap[String, Option[RawFormatParameter]] = { + if (cmd.rawFormat.equalsIgnoreCase("csv")) { + HashMap( + "delimiter" -> cmd.csvDelimiter.map(s => StringParameter(s.includingUnicode.includingNone)), + "header" -> cmd.csvHeader.map(BooleanParameter), + "quote" -> cmd.csvQuote.map(s => StringParameter(s.includingUnicode.includingNone)), + "escape" -> cmd.csvEscape.map(s => StringParameter(s.includingUnicode.includingNone)), + // increase the default limit on the number of columns if needed + // default is set at org.apache.spark.sql.execution.datasources.csv.CSVOptions maxColumns + "maxColumns" -> { + if (numberOfColumns > SparkCSVReaderMaxColumnsDefault) Some(LongParameter(numberOfColumns)) else None + } + ) + } else { + HashMap() + } + } + + private def getFixedWidthOptions(cmd: StandardizationConfig): HashMap[String, Option[RawFormatParameter]] = { + if (cmd.rawFormat.equalsIgnoreCase("fixed-width")) { + HashMap("trimValues" -> cmd.fixedWidthTrimValues.map(BooleanParameter)) + } else { + HashMap() + } + } + + private def getCobolOptions(cmd: StandardizationConfig, dataset: Dataset)(implicit dao: MenasDAO): HashMap[String, Option[RawFormatParameter]] = { + if (cmd.rawFormat.equalsIgnoreCase("cobol")) { + val cobolOptions = cmd.cobolOptions.getOrElse(CobolOptions()) + val isXcomOpt = if (cobolOptions.isXcom) Some(true) else None + val isTextOpt = if (cobolOptions.isText) Some(true) else None + val isAscii = cobolOptions.encoding.exists(_.equalsIgnoreCase("ascii")) + // For ASCII files --charset is converted into Cobrix "ascii_charset" option + // For EBCDIC files --charset is converted into Cobrix "ebcdic_code_page" option + HashMap( + getCopybookOption(cobolOptions, dataset), + "is_xcom" -> isXcomOpt.map(BooleanParameter), + "is_text" -> 
isTextOpt.map(BooleanParameter), + "string_trimming_policy" -> cobolOptions.trimmingPolicy.map(StringParameter), + "encoding" -> cobolOptions.encoding.map(StringParameter), + "ascii_charset" -> cmd.charset.flatMap(charset => if (isAscii) Option(StringParameter(charset)) else None), + "ebcdic_code_page" -> cmd.charset.flatMap(charset => if (!isAscii) Option(StringParameter(charset)) else None), + "schema_retention_policy" -> Some(StringParameter("collapse_root")) + ) + } else { + HashMap() + } + } + + private def getCopybookOption(opts: CobolOptions, dataset: Dataset)(implicit dao: MenasDAO): (String, Option[RawFormatParameter]) = { + val copybook = opts.copybook + if (copybook.isEmpty) { + log.info("Copybook location is not provided via command line - fetching the copybook attached to the schema...") + val copybookContents = dao.getSchemaAttachment(dataset.schemaName, dataset.schemaVersion) + log.info(s"Applying the following copybook:\n$copybookContents") + ("copybook_contents", Option(StringParameter(copybookContents))) + } else { + log.info(s"Use copybook at $copybook") + ("copybook", Option(StringParameter(copybook))) + } + } + +} diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StdCmdConfig.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StdCmdConfig.scala deleted file mode 100644 index ce7dabd34..000000000 --- a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/StdCmdConfig.scala +++ /dev/null @@ -1,316 +0,0 @@ -/* - * Copyright 2018 ABSA Group Limited - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package za.co.absa.enceladus.standardization - -import org.apache.spark.storage.StorageLevel -import scopt.OptionParser -import za.co.absa.enceladus.dao.auth._ - -import scala.util.matching.Regex - -/** - * This is a class for configuration provided by the command line parameters - * - * Note: scopt requires all fields to have default values. - * Even if a field is mandatory it needs a default value. 
- */ -case class StdCmdConfig( - cmdLineArgs: Array[String], - datasetName: String = "", - datasetVersion: Int = 1, - reportDate: String = "", - reportVersion: Option[Int] = None, - rawFormat: String = "xml", - menasCredentialsFactory: MenasCredentialsFactory = InvalidMenasCredentialsFactory, - charset: Option[String] = None, - rowTag: Option[String] = None, - csvDelimiter: Option[String] = None, - csvHeader: Option[Boolean] = Some(false), - csvQuote: Option[String] = None, - csvEscape: Option[String] = None, - cobolOptions: Option[CobolOptions] = None, - fixedWidthTrimValues: Option[Boolean] = Some(false), - performanceMetricsFile: Option[String] = None, - rawPathOverride: Option[String] = None, - folderPrefix: Option[String] = None, - persistStorageLevel: Option[StorageLevel] = None, - failOnInputNotPerSchema: Boolean = false - ) - -object StdCmdConfig { - - def getCmdLineArguments(args: Array[String]): StdCmdConfig = { - val parser = new CmdParser("spark-submit [spark options] StandardizationBundle.jar") - - val optionCmd = parser.parse(args, StdCmdConfig(args)) - if (optionCmd.isEmpty) { - // Wrong arguments provided, the message is already displayed - System.exit(1) - } - optionCmd.get - } - - private class CmdParser(programName: String) extends OptionParser[StdCmdConfig](programName) { - head("\nStandardization", "") - var rawFormat: Option[String] = None - - opt[String]('D', "dataset-name").required().action((value, config) => - config.copy(datasetName = value)).text("Dataset name") - - opt[Int]('d', "dataset-version").required().action((value, config) => - config.copy(datasetVersion = value)).text("Dataset version") - .validate(value => - if (value > 0) { - success - } else { - failure("Option --dataset-version must be >0") - }) - - val reportDateMatcher: Regex = "^\\d{4}-\\d{2}-\\d{2}$".r - opt[String]('R', "report-date").required().action((value, config) => - config.copy(reportDate = value)).text("Report date in 'yyyy-MM-dd' format") - .validate(value => - reportDateMatcher.findFirstIn(value) match { - case None => failure(s"Match error in '$value'. Option --report-date expects a date in 'yyyy-MM-dd' format") - case _ => success - }) - - private var credsFile: Option[String] = None - private var keytabFile: Option[String] = None - opt[String]("menas-credentials-file").hidden.optional().action({ (file, config) => - credsFile = Some(file) - config.copy(menasCredentialsFactory = new MenasPlainCredentialsFactory(file)) - }).text("Path to Menas credentials config file.").validate(path => - if (keytabFile.isDefined) { - failure("Only one authentication method is allow at a time") - } else { - success - }) - - opt[String]("menas-auth-keytab").optional().action({ (file, config) => - keytabFile = Some(file) - config.copy(menasCredentialsFactory = new MenasKerberosCredentialsFactory(file)) - }).text("Path to keytab file used for authenticating to menas").validate({ file => - if (credsFile.isDefined) { - failure("Only one authentication method is allowed at a time") - } else { - success - } - }) - - opt[Int]('r', "report-version").optional().action((value, config) => - config.copy(reportVersion = Some(value))) - .text("Report version. 
If not provided, it is inferred based on the publish path (it's an EXPERIMENTAL feature)") - .validate(value => - if (value > 0) { - success - } else { - failure("Option --report-version must be >0") - }) - - opt[String]('f', "raw-format").required().action((value, config) => { - rawFormat = Some(value) - config.copy(rawFormat = value) - }).text("format of the raw data (csv, xml, parquet,fixed-width, etc.)") - - opt[String]("charset").optional().action((value, config) => - config.copy(charset = Some(value))).text("use the specific charset (default is UTF-8)") - .validate(value => - if (rawFormat.isDefined && - (rawFormat.get.equalsIgnoreCase("xml") || - rawFormat.get.equalsIgnoreCase("csv") || - rawFormat.get.equalsIgnoreCase("json") || - rawFormat.get.equalsIgnoreCase("cobol"))) { - success - } else { - failure("The --charset option is supported only for CSV, JSON, XML and COBOL") - }) - - opt[String]("row-tag").optional().action((value, config) => - config.copy(rowTag = Some(value))).text("use the specific row tag instead of 'ROW' for XML format") - .validate(value => - if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("xml")) { - success - } else { - failure("The --row-tag option is supported only for XML raw data format") - }) - - opt[String]("delimiter").optional().action((value, config) => - config.copy(csvDelimiter = Some(value))).text("use the specific delimiter instead of ',' for CSV format") - .validate(value => - if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("csv")) { - success - } else { - failure("The --delimiter option is supported only for CSV raw data format") - }) - - opt[String]("csv-quote").optional().action((value, config) => - config.copy(csvQuote = Some(value))) - .text("use the specific quote character for creating CSV fields that may contain delimiter character(s) (default is '\"')") - .validate(value => - if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("csv")) { - success - } else { - failure("The --csv-quote option is supported only for CSV raw data format") - }) - - opt[String]("csv-escape").optional().action((value, config) => - config.copy(csvEscape = Some(value))) - .text("use the specific escape character for CSV fields (default is '\\')") - .validate(value => - if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("csv")) { - success - } else { - failure("The --csv-escape option is supported only for CSV raw data format") - }) - - // no need for validation for boolean since scopt itself will do - opt[Boolean]("header").optional().action((value, config) => - config.copy(csvHeader = Some(value))).text("use the header option to consider CSV header") - .validate(value => - if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("csv")) { - success - } else { - failure("The --header option is supported only for CSV ") - }) - - opt[Boolean]("trimValues").optional().action((value, config) => - config.copy(fixedWidthTrimValues = Some(value))).text("use --trimValues option to trim values in fixed width file") - .validate(value => - if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("fixed-width")) { - success - } else { - failure("The --trimValues option is supported only for fixed-width files ") - }) - - opt[Boolean]("strict-schema-check").optional().action((value, config) => - config.copy(failOnInputNotPerSchema = value)) - .text("use --strict-schema-check option to fail or proceed over rows not adhering to the schema (with error in errCol)") - - processCobolCmdOptions() - - 
opt[String]("performance-file").optional().action((value, config) => - config.copy(performanceMetricsFile = Some(value))).text("produce a performance metrics file at the given location (local filesystem)") - - opt[String]("debug-set-raw-path").optional().hidden().action((value, config) => - config.copy(rawPathOverride = Some(value))).text("override the path of the raw data (used internally for performance tests)") - - opt[String]("folder-prefix").optional().action((value, config) => - config.copy(folderPrefix = Some(value))).text("Adds a folder prefix before the date tokens") - - opt[String]("persist-storage-level").optional().action((value, config) => - config.copy(persistStorageLevel = Some(StorageLevel.fromString(value)))) - .text("Specifies persistence storage level to use when processing data. Spark's default is MEMORY_AND_DISK.") - - help("help").text("prints this usage text") - - checkConfig { config => - config.menasCredentialsFactory match { - case InvalidMenasCredentialsFactory => failure("No authentication method specified (e.g. --menas-auth-keytab)") - case _ => success - } - } - - private def processCobolCmdOptions(): Unit = { - opt[String]("copybook").optional().action((value, config) => { - config.copy(cobolOptions = cobolSetCopybook(config.cobolOptions, value)) - }).text("Path to a copybook for COBOL data format") - .validate(value => - if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("cobol")) { - success - } else { - failure("The --copybook option is supported only for COBOL data format") - } - ) - - opt[Boolean]("is-xcom").optional().action((value, config) => { - config.copy(cobolOptions = cobolSetIsXcom(config.cobolOptions, value)) - }).text("Does a mainframe file in COBOL format contain XCOM record headers") - .validate(value => - if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("cobol")) { - success - } else { - failure("The --is-xcom option is supported only for COBOL data format") - }) - - opt[Boolean]("cobol-is-text").optional().action((value, config) => { - config.copy(cobolOptions = cobolSetIsText(config.cobolOptions, value)) - }).text("Specifies if the mainframe file is ASCII text file") - .validate(value => - if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("cobol")) { - success - } else { - failure("The --cobol-is-text option is supported only for COBOL data format") - }) - - opt[String]("cobol-encoding").optional().action((value, config) => { - config.copy(cobolOptions = cobolSetEncoding(config.cobolOptions, value)) - }).text("Specify encoding of mainframe files (ascii or ebcdic)") - .validate(value => - if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("cobol")) { - success - } else { - failure("The --cobol-encoding option is supported only for COBOL data format") - }) - - opt[String]("cobol-trimming-policy").optional().action((value, config) => { - config.copy(cobolOptions = cobolSetTrimmingPolicy(config.cobolOptions, value)) - }).text("Specify string trimming policy for mainframe files (none, left, right, both)") - .validate(value => - if (rawFormat.isDefined && rawFormat.get.equalsIgnoreCase("cobol")) { - success - } else { - failure("The --cobol-trimming-policy option is supported only for COBOL data format") - }) - } - - private def cobolSetCopybook(cobolOptions: Option[CobolOptions], newCopybook: String): Option[CobolOptions] = { - cobolOptions match { - case Some(a) => Some(a.copy(copybook = newCopybook)) - case None => Some(CobolOptions(newCopybook)) - } - } - - private def cobolSetIsText(cobolOptions: 
Option[CobolOptions], newIsText: Boolean): Option[CobolOptions] = { - cobolOptions match { - case Some(a) => Some(a.copy(isText = newIsText)) - case None => Some(CobolOptions(isText = newIsText)) - } - } - - private def cobolSetIsXcom(cobolOptions: Option[CobolOptions], newIsXCom: Boolean): Option[CobolOptions] = { - cobolOptions match { - case Some(a) => Some(a.copy(isXcom = newIsXCom)) - case None => Some(CobolOptions(isXcom = newIsXCom)) - } - } - - private def cobolSetEncoding(cobolOptions: Option[CobolOptions], newEncoding: String): Option[CobolOptions] = { - cobolOptions match { - case Some(a) => Some(a.copy(encoding = Option(newEncoding))) - case None => Some(CobolOptions(encoding = Option(newEncoding))) - } - } - - private def cobolSetTrimmingPolicy(cobolOptions: Option[CobolOptions], newTrimmingPolicy: String): Option[CobolOptions] = { - cobolOptions match { - case Some(a) => Some(a.copy(trimmingPolicy = Option(newTrimmingPolicy))) - case None => Some(CobolOptions(trimmingPolicy = Option(newTrimmingPolicy))) - } - } - } - -} diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/config/StandardizationConfig.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/config/StandardizationConfig.scala new file mode 100644 index 000000000..9bb4bc0a3 --- /dev/null +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/config/StandardizationConfig.scala @@ -0,0 +1,101 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.enceladus.standardization.config + +import org.apache.spark.storage.StorageLevel +import scopt.OParser +import za.co.absa.enceladus.common.config.{ConfigError, JobConfigParser} +import za.co.absa.enceladus.dao.auth.{InvalidMenasCredentialsFactory, MenasCredentialsFactory} +import za.co.absa.enceladus.standardization.CobolOptions + +import scala.util.Try + +/** + * This is a class for configuration provided by the command line parameters + * + * Note: scopt requires all fields to have default values. + * Even if a field is mandatory it needs a default value. 
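// Usage sketch (hedged) for the command-line handling defined below: because every field has a
// default, an empty StandardizationConfig() is refined purely by the composed OParser. The long
// option spellings for the dataset/report/credentials flags are assumed to be carried over
// unchanged from the old StdCmdConfig parser (they now come from JobConfigParser); the argument
// values themselves are hypothetical.
import scala.util.{Failure, Success}

val argsExample = Array(
  "--dataset-name", "my_dataset",
  "--dataset-version", "7",
  "--report-date", "2020-07-23",
  "--menas-auth-keytab", "/path/to/user.keytab",
  "--raw-format", "csv",
  "--header", "true"
)

StandardizationConfig.tryFromArguments(argsExample) match {
  case Success(cfg) => println(s"Standardizing ${cfg.datasetName} v${cfg.datasetVersion}, raw format ${cfg.rawFormat}")
  case Failure(err) => println(s"Command line parameters error: ${err.getMessage}")
}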
+ */ +case class StandardizationConfig(rawFormat: String = "xml", + charset: Option[String] = None, + rowTag: Option[String] = None, + csvDelimiter: Option[String] = None, + csvHeader: Option[Boolean] = Some(false), + csvQuote: Option[String] = None, + csvEscape: Option[String] = None, + cobolOptions: Option[CobolOptions] = None, + fixedWidthTrimValues: Option[Boolean] = Some(false), + rawPathOverride: Option[String] = None, + failOnInputNotPerSchema: Boolean = false, + datasetName: String = "", + datasetVersion: Int = 1, + reportDate: String = "", + reportVersion: Option[Int] = None, + menasCredentialsFactory: MenasCredentialsFactory = InvalidMenasCredentialsFactory, + performanceMetricsFile: Option[String] = None, + folderPrefix: Option[String] = None, + persistStorageLevel: Option[StorageLevel] = None, + credsFile: Option[String] = None, + keytabFile: Option[String] = None + ) + extends StandardizationParser[StandardizationConfig]{ + override def withRawFormat(value: String): StandardizationConfig = copy(rawFormat = value) + override def withCharset(value: Option[String]): StandardizationConfig = copy(charset = value) + override def withRowTag(value: Option[String]): StandardizationConfig = copy(rowTag = value) + override def withCsvDelimiter(value: Option[String]): StandardizationConfig = copy(csvDelimiter = value) + override def withCsvHeader(value: Option[Boolean]): StandardizationConfig = copy(csvHeader = value) + override def withCsvQuote(value: Option[String]): StandardizationConfig = copy(csvQuote = value) + override def withCsvEscape(value: Option[String]): StandardizationConfig = copy(csvEscape = value) + override def withCobolOptions(value: Option[CobolOptions]): StandardizationConfig = copy(cobolOptions = value) + override def withFixedWidthTrimValues(value: Option[Boolean]): StandardizationConfig = copy(fixedWidthTrimValues = value) + override def withRawPathOverride(value: Option[String]): StandardizationConfig = copy(rawPathOverride = value) + override def withFailOnInputNotPerSchema(value: Boolean): StandardizationConfig = copy(failOnInputNotPerSchema = value) + + override def withDatasetName(value: String): StandardizationConfig = copy(datasetName = value) + override def withDatasetVersion(value: Int): StandardizationConfig = copy(datasetVersion = value) + override def withReportDate(value: String): StandardizationConfig = copy(reportDate = value) + override def withReportVersion(value: Option[Int]): StandardizationConfig = copy(reportVersion = value) + override def withCredsFile(value: Option[String], menasCredentialsFactory: MenasCredentialsFactory): StandardizationConfig = { + copy(credsFile = value, menasCredentialsFactory = menasCredentialsFactory) + } + override def withAuthKeytab(value: Option[String], menasCredentialsFactory: MenasCredentialsFactory): StandardizationConfig = { + copy(keytabFile = value, menasCredentialsFactory = menasCredentialsFactory) + } + override def withPerformanceMetricsFile(value: Option[String]): StandardizationConfig = copy(performanceMetricsFile = value) + override def withFolderPrefix(value: Option[String]): StandardizationConfig = copy(folderPrefix = value) + override def withPersistStorageLevel(value: Option[StorageLevel]): StandardizationConfig = copy(persistStorageLevel = value) +} + +object StandardizationConfig { + + def tryFromArguments(args: Array[String]): Try[StandardizationConfig] = { + import za.co.absa.enceladus.utils.implicits.OptionImplicits._ + OParser.parse(standardizationJobParser, args, 
StandardizationConfig()).toTry(ConfigError("Command line parameters error")) + } + + def getFromArguments(args: Array[String]): StandardizationConfig = tryFromArguments(args).get + + private val standardizationJobParser: OParser[_, StandardizationConfig] = { + val builder = OParser.builder[StandardizationConfig] + import builder._ + OParser.sequence( + programName("Standardization Job"), + head("Standardization", ""), + StandardizationParser.standardizationParser, + JobConfigParser.jobConfigParser + ) + } +} diff --git a/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/config/StandardizationParser.scala b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/config/StandardizationParser.scala new file mode 100644 index 000000000..8b1da3bd8 --- /dev/null +++ b/spark-jobs/src/main/scala/za/co/absa/enceladus/standardization/config/StandardizationParser.scala @@ -0,0 +1,215 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.enceladus.standardization.config + +import scopt.{OParser, OParserBuilder} +import za.co.absa.enceladus.common.config.JobConfigParser +import za.co.absa.enceladus.standardization.CobolOptions + +trait StandardizationParser[R] extends JobConfigParser[R] { + def withRawFormat(value: String): R + def withCharset(value: Option[String] = None): R + def withRowTag(value: Option[String] = None): R + def withCsvDelimiter(value: Option[String] = None): R + def withCsvHeader(value: Option[Boolean] = Some(false)): R + def withCsvQuote(value: Option[String] = None): R + def withCsvEscape(value: Option[String] = None): R + def withCobolOptions(value: Option[CobolOptions] = None): R + def withFixedWidthTrimValues(value: Option[Boolean] = Some(false)): R + def withRawPathOverride(value: Option[String]): R + def withFailOnInputNotPerSchema(value: Boolean): R + + def rawFormat: String + def charset: Option[String] + def rowTag: Option[String] + def csvDelimiter: Option[String] + def csvHeader: Option[Boolean] + def csvQuote: Option[String] + def csvEscape: Option[String] + def cobolOptions: Option[CobolOptions] + def fixedWidthTrimValues: Option[Boolean] + def rawPathOverride: Option[String] + def failOnInputNotPerSchema: Boolean +} + +object StandardizationParser { + + //scalastyle:off method.length the length is legit for parsing input paramters + def standardizationParser[R <: StandardizationParser[R]]: OParser[_, R] = { + val builder = OParser.builder[R] + import builder._ + OParser.sequence( + head("\nStandardization", ""), + + opt[String]('f', "raw-format").required().action((value, config) => { + config.withRawFormat(value.toLowerCase()) + }).text("format of the raw data (csv, xml, parquet, fixed-width, etc.)"), + + opt[String]("charset").optional().action((value, config) => + config.withCharset(Some(value))).text("use the specific charset (default is UTF-8)"), + + opt[String]("row-tag").optional().action((value, config) => + config.withRowTag(Some(value))).text("use the specific row tag 
instead of 'ROW' for XML format"), + + opt[String]("delimiter").optional().action((value, config) => + config.withCsvDelimiter(Some(value))).text("use the specific delimiter instead of ',' for CSV format"), + + opt[String]("csv-quote").optional().action((value, config) => + config.withCsvQuote(Some(value))) + .text("use the specific quote character for creating CSV fields that may contain delimiter character(s) (default is '\"')"), + + opt[String]("csv-escape").optional().action((value, config) => + config.withCsvEscape(Some(value))) + .text("use the specific escape character for CSV fields (default is '\\')"), + + // no need for validation for boolean since scopt itself will do + opt[Boolean]("header").optional().action((value, config) => + config.withCsvHeader(Some(value))).text("use the header option to consider CSV header"), + + opt[Boolean]("trimValues").optional().action((value, config) => + config.withFixedWidthTrimValues(Some(value))).text("use --trimValues option to trim values in fixed width file"), + + opt[Boolean]("strict-schema-check").optional().action((value, config) => + config.withFailOnInputNotPerSchema(value)) + .text("use --strict-schema-check option to fail or proceed over rows not adhering to the schema (with error in errCol)"), + + opt[String]("copybook").optional().action((value, config) => { + val newOptions = config.cobolOptions match { + case Some(a) => Some(a.copy(copybook = value)) + case None => Some(CobolOptions(value)) + } + config.withCobolOptions(newOptions) + + }).text("Path to a copybook for COBOL data format"), + + opt[Boolean]("is-xcom").optional().action((value, config) => { + val newOptions = config.cobolOptions match { + case Some(a) => Some(a.copy(isXcom = value)) + case None => Some(CobolOptions(isXcom = value)) + } + config.withCobolOptions(newOptions) + }).text("Does a mainframe file in COBOL format contain XCOM record headers"), + + opt[Boolean]("cobol-is-text").optional().action((value, config) => { + val newOptions = config.cobolOptions match { + case Some(a) => Some(a.copy(isText = value)) + case None => Some(CobolOptions(isText = value)) + } + config.withCobolOptions(newOptions) + }).text("Specifies if the mainframe file is ASCII text file"), + + opt[String]("cobol-encoding").optional().action((value, config) => { + val newOptions = config.cobolOptions match { + case Some(a) => Some(a.copy(encoding = Option(value))) + case None => Some(CobolOptions(encoding = Option(value))) + } + config.withCobolOptions(newOptions) + }).text("Specify encoding of mainframe files (ascii or ebcdic)"), + + opt[String]("cobol-trimming-policy").optional().action((value, config) => { + val newOptions = config.cobolOptions match { + case Some(a) => Some(a.copy(trimmingPolicy = Option(value))) + case None => Some(CobolOptions(trimmingPolicy = Option(value))) + } + config.withCobolOptions(newOptions) + }).text("Specify string trimming policy for mainframe files (none, left, right, both)"), + + opt[String]("debug-set-raw-path").optional().hidden().action((value, config) => + config.withRawPathOverride(Some(value))) + .text("override the path of the raw data (used internally for performance tests)"), + + checkConfig(checkConfigX(_, builder)) + ) + } + //scalastyle:on method.length + + private val formatsSupportingCharset = List("xml", "csv", "json", "cobol") + + private def typicalError(field: String, format: String): String = { + s"The $field option is supported only for $format format" + } + + private def checkCharset[R <: StandardizationParser[R]](config: R): 
List[String] = { + if (!formatsSupportingCharset.contains(config.rawFormat) && config.charset.isDefined) { + List(typicalError("--charset", "CSV, JSON, XML and COBOL")) + } else { + List.empty + } + } + + private def checkXMLFields[R <: StandardizationParser[R]](config: R): List[String] = { + if (config.rowTag.isDefined && config.rawFormat != "xml") { + List(typicalError("--row-tag", "XML raw data")) + } else { + List.empty + } + } + + private def checkCSVFields[R <: StandardizationParser[R]](config: R): List[String] = { + def csvFieldsThatShouldNotBePresent(config: R): List[String] = { + val format = "CSV" + val definedFields = Map( + typicalError("--delimiter", format) -> config.csvDelimiter.isDefined, + typicalError("--escape", format) -> config.csvEscape.isDefined, + typicalError("--header", s"$format raw data") -> config.csvHeader.contains(true), + typicalError("--quote", format) -> config.csvQuote.isDefined + ) + definedFields.filter { case (_, value) => value }.keys.toList + } + + if (config.rawFormat == "csv") { + List.empty + } else { + csvFieldsThatShouldNotBePresent(config) + } + } + + private def checkCobolFields[R <: StandardizationParser[R]](config: R): Seq[String] = { + def cobolFieldsThatShouldNotBePresent(cobolOptions: CobolOptions): List[String] = { + val format = "COBOL" + val definedFields = Map( + typicalError("--copybook", format) -> (cobolOptions.copybook != ""), + typicalError("--cobol-encoding", format) -> cobolOptions.encoding.isDefined, + typicalError("--is-xcom", format) -> cobolOptions.isXcom, + typicalError("--is-text", format) -> cobolOptions.isText + ) + definedFields.filter { case (_, value) => value }.keys.toList + } + + + if (config.rawFormat == "cobol") { + List.empty + } else { + config.cobolOptions + .map(cobolFieldsThatShouldNotBePresent) + .getOrElse(List.empty) + } + } + + private def checkConfigX[R <: StandardizationParser[R]](config: R, builder: OParserBuilder[R]): Either[String, Unit] = { + val allErrors:List[String] = checkCharset(config) ++ + checkXMLFields(config) ++ + checkCSVFields(config) ++ + checkCobolFields(config) + + if (allErrors.isEmpty) { + builder.success + } else { + builder.failure(allErrors.mkString("\n")) + } + } + +} diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/ConfConfigSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/config/ConformanceParserSuite.scala similarity index 82% rename from spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/ConfConfigSuite.scala rename to spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/config/ConformanceParserSuite.scala index 3f13b2b04..76d2eb664 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/ConfConfigSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/config/ConformanceParserSuite.scala @@ -13,16 +13,17 @@ * limitations under the License. 
*/ -package za.co.absa.enceladus.conformance +package za.co.absa.enceladus.conformance.config import java.time.ZonedDateTime import org.scalatest.FunSuite +import za.co.absa.enceladus.conformance.ConformanceExecution import za.co.absa.enceladus.dao.auth.{MenasKerberosCredentials, MenasPlainCredentials} import za.co.absa.enceladus.model.Dataset import za.co.absa.enceladus.utils.testUtils.SparkTestBase -class ConfConfigSuite extends FunSuite with SparkTestBase { +class ConformanceParserSuite extends FunSuite with SparkTestBase { private val year = "2018" private val month = "12" @@ -49,11 +50,12 @@ class ConfConfigSuite extends FunSuite with SparkTestBase { private val disabled = false private val dateDisabled = None private val userDisabled = None - private val rawFormat = "parquet" private val folderPrefix = s"year=$year/month=$month/day=$day" private val infoDateColumn = "enceladus_info_date" private val infoVersionColumn = "enceladus_info_version" + private object TestDynamicConformance extends ConformanceExecution + test("Test credentials file parsing "){ val credentials = MenasPlainCredentials.fromFile(menasCredentialsFile) @@ -68,7 +70,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase { } test("folder-prefix parameter") { - val cmdConfigNoFolderPrefix = ConfCmdConfig.getCmdLineArguments( + val cmdConfigNoFolderPrefix = ConformanceConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -86,7 +88,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase { assert(cmdConfigNoFolderPrefix.publishPathOverride.isEmpty) assert(actualPlainMenasCredentials === menasCredentials) - val cmdConfigFolderPrefix = ConfCmdConfig.getCmdLineArguments( + val cmdConfigFolderPrefix = ConformanceConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -106,7 +108,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase { assert(cmdConfigFolderPrefix.publishPathOverride.isEmpty) assert(actualMenasKerberosCredentials === menasKeytab) - val cmdConfigPublishPathOverrideAndFolderPrefix = ConfCmdConfig.getCmdLineArguments( + val cmdConfigPublishPathOverrideAndFolderPrefix = ConformanceConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -144,7 +146,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase { userDisabled, List() ) - val cmdConfigNoFolderPrefix = ConfCmdConfig.getCmdLineArguments( + val cmdConfigNoFolderPrefix = ConformanceConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -152,7 +154,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase { "--report-version", reportVersion.toString, "--menas-credentials-file", menasCredentialsFile )) - val cmdConfigFolderPrefix = ConfCmdConfig.getCmdLineArguments( + val cmdConfigFolderPrefix = ConformanceConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -160,7 +162,7 @@ class ConfConfigSuite extends FunSuite with SparkTestBase { "--report-version", reportVersion.toString, "--menas-credentials-file", menasCredentialsFile, "--folder-prefix", folderPrefix)) - val cmdConfigPublishPathOverride = ConfCmdConfig.getCmdLineArguments( + val cmdConfigPublishPathOverride = ConformanceConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -168,7 +170,7 @@ class ConfConfigSuite extends FunSuite with 
SparkTestBase { "--report-version", reportVersion.toString, "--menas-credentials-file", menasCredentialsFile, "--debug-set-publish-path", hdfsPublishPathOverride)) - val cmdConfigPublishPathOverrideAndFolderPrefix = ConfCmdConfig.getCmdLineArguments( + val cmdConfigPublishPathOverrideAndFolderPrefix = ConformanceConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -177,17 +179,18 @@ class ConfConfigSuite extends FunSuite with SparkTestBase { "--folder-prefix", folderPrefix, "--menas-credentials-file", menasCredentialsFile, "--debug-set-publish-path", hdfsPublishPathOverride)) - val publishPathNoFolderPrefix = DynamicConformanceJob.buildPublishPath(infoDateColumn, infoVersionColumn, - cmdConfigNoFolderPrefix, conformanceDataset, cmdConfigNoFolderPrefix.reportVersion.get) + val publishPathNoFolderPrefix = TestDynamicConformance.buildPublishPath(cmdConfigNoFolderPrefix, + conformanceDataset, cmdConfigNoFolderPrefix.reportVersion.get) assert(publishPathNoFolderPrefix === s"$hdfsPublishPath/$infoDateColumn=$reportDate/$infoVersionColumn=$reportVersion") - val publishPathFolderPrefix = DynamicConformanceJob.buildPublishPath(infoDateColumn, infoVersionColumn, - cmdConfigFolderPrefix, conformanceDataset, cmdConfigFolderPrefix.reportVersion.get) + val publishPathFolderPrefix = TestDynamicConformance.buildPublishPath(cmdConfigFolderPrefix, + conformanceDataset, cmdConfigFolderPrefix.reportVersion.get) assert(publishPathFolderPrefix === s"$hdfsPublishPath/$folderPrefix/$infoDateColumn=$reportDate/$infoVersionColumn=$reportVersion") - val publishPathPublishPathOverride = DynamicConformanceJob.buildPublishPath(infoDateColumn, infoVersionColumn, - cmdConfigPublishPathOverride, conformanceDataset, cmdConfigPublishPathOverride.reportVersion.get) + val publishPathPublishPathOverride = TestDynamicConformance.buildPublishPath(cmdConfigPublishPathOverride, conformanceDataset, cmdConfigPublishPathOverride.reportVersion.get) assert(publishPathPublishPathOverride === hdfsPublishPathOverride) - val publishPathPublishPathOverrideAndFolderPrefix = DynamicConformanceJob.buildPublishPath(infoDateColumn, infoVersionColumn, - cmdConfigPublishPathOverrideAndFolderPrefix, conformanceDataset, cmdConfigPublishPathOverrideAndFolderPrefix.reportVersion.get) + + val publishPathPublishPathOverrideAndFolderPrefix = + TestDynamicConformance.buildPublishPath(cmdConfigPublishPathOverrideAndFolderPrefix, + conformanceDataset, cmdConfigPublishPathOverrideAndFolderPrefix.reportVersion.get) assert(publishPathPublishPathOverrideAndFolderPrefix === hdfsPublishPathOverride) } diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ArrayConformanceSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ArrayConformanceSuite.scala index cb3155a71..dcb6e46a6 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ArrayConformanceSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ArrayConformanceSuite.scala @@ -18,7 +18,7 @@ package za.co.absa.enceladus.conformance.interpreter import org.apache.spark.sql.functions._ import org.mockito.Mockito.{mock, when => mockWhen} import org.scalatest.{BeforeAndAfterAll, FunSuite} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.datasource.DataSource import za.co.absa.enceladus.dao.MenasDAO import 
za.co.absa.enceladus.conformance.samples._ @@ -30,7 +30,7 @@ class ArrayConformanceSuite extends FunSuite with SparkTestBase with BeforeAndAf // spark.enableControlFrameworkTracking() implicit var dao: MenasDAO = _ - implicit var progArgs: ConfCmdConfig = _ + implicit var progArgs: ConformanceConfig = _ private val enableCF = false private val isCatalystWorkaroundEnabled = true @@ -40,7 +40,7 @@ class ArrayConformanceSuite extends FunSuite with SparkTestBase with BeforeAndAf val mapDF = spark.createDataFrame(MappingsSamples.mapping) dao = mock(classOf[MenasDAO]) - progArgs = new ConfCmdConfig(reportDate = "2017-11-01") + progArgs = new ConformanceConfig(reportDate = "2017-11-01") mockWhen(dao.getMappingTable("mapping", 0)) thenReturn MappingsSamples.mappingTable diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ChorusMockSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ChorusMockSuite.scala index be75f9317..2d4873474 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ChorusMockSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/ChorusMockSuite.scala @@ -17,7 +17,7 @@ package za.co.absa.enceladus.conformance.interpreter import org.mockito.Mockito.{mock, when => mockWhen} import org.scalatest.FunSuite -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.datasource.DataSource import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.MappingConformanceRule @@ -43,7 +43,7 @@ class ChorusMockSuite extends FunSuite with SparkTestBase with LoggerTestBase { val inputDf = spark.createDataFrame(d) val mappingDf = spark.createDataFrame(mapping) - implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2018-03-23") // here we may need to specify some parameters (for certain rules) + implicit val progArgs: ConformanceConfig = ConformanceConfig(reportDate = "2018-03-23") // here we may need to specify some parameters (for certain rules) implicit val dao: MenasDAO = mock(classOf[MenasDAO]) // you may have to hard-code your own implementation here (if not working with menas) val enableCF = false val isCatalystWorkaroundEnabled = true diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterSuite.scala index f4422d44d..cd8e28e50 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/InterpreterSuite.scala @@ -20,7 +20,7 @@ import org.json4s.native.JsonParser._ import org.mockito.Mockito.{mock, when => mockWhen} import org.scalatest.{BeforeAndAfterAll, FunSuite} import za.co.absa.atum.model.ControlMeasure -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.datasource.DataSource import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.conformance.samples._ @@ -49,7 +49,8 @@ class InterpreterSuite extends FunSuite with SparkTestBase with BeforeAndAfterAl spark.sessionState.conf.setConfString("co.za.absa.enceladus.confTest", "hello :)") implicit val dao: MenasDAO = mock(classOf[MenasDAO]) - implicit val progArgs: ConfCmdConfig = 
ConfCmdConfig(reportDate = "2017-11-01", experimentalMappingRule = Option(useExperimentalMappingRule)) + implicit val progArgs: ConformanceConfig = ConformanceConfig( + experimentalMappingRule = Option(useExperimentalMappingRule),reportDate = "2017-11-01") val enableCF = true val isCatalystWorkaroundEnabled = true @@ -104,7 +105,9 @@ class InterpreterSuite extends FunSuite with SparkTestBase with BeforeAndAfterAl spark.enableControlMeasuresTracking("src/test/testData/_tradeData/2017/11/01/_INFO", "src/test/testData/_tradeOutput/_INFO") implicit val dao: MenasDAO = mock(classOf[MenasDAO]) - implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2017-11-01", experimentalMappingRule = Option(useExperimentalMappingRule)) + implicit val progArgs: ConformanceConfig = ConformanceConfig( + experimentalMappingRule = Option(useExperimentalMappingRule), + reportDate = "2017-11-01") val enableCF = true val isCatalystWorkaroundEnabled = true diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/LiteralJoinMappingRuleTest.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/LiteralJoinMappingRuleTest.scala index 346613159..fa78f5a87 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/LiteralJoinMappingRuleTest.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/LiteralJoinMappingRuleTest.scala @@ -17,7 +17,7 @@ package za.co.absa.enceladus.conformance.interpreter import org.mockito.Mockito.{mock, when => mockWhen} import org.scalatest.FunSuite -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.datasource.DataSource import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.{DropConformanceRule, LiteralConformanceRule, MappingConformanceRule} @@ -33,7 +33,7 @@ class LiteralJoinMappingRuleTest extends FunSuite with SparkTestBase with Logger val inputDf = spark.read.option("header", "true").csv("src/test/resources/interpreter/literalJoin/data") val mappingDf = spark.read.option("header", "true").csv("src/test/resources/interpreter/literalJoin/mapping") - implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2018-03-23") + implicit val progArgs: ConformanceConfig = ConformanceConfig(reportDate = "2018-03-23") implicit val dao: MenasDAO = mock(classOf[MenasDAO]) val enableCF = false val isCatalystWorkaroundEnabled = true diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/NestedStructsFixture.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/NestedStructsFixture.scala index 28f73a669..731a1ffab 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/NestedStructsFixture.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/NestedStructsFixture.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.{DataFrame, SaveMode} import org.mockito.Mockito.{mock, when => mockWhen} import org.scalatest.{BeforeAndAfterAll, Suite} import org.slf4j.{Logger, LoggerFactory} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.datasource.DataSource import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.Dataset @@ -38,7 +38,7 @@ trait NestedStructsFixture extends 
BeforeAndAfterAll with SparkTestBase { protected var standardizedDf: DataFrame = _ implicit protected val dao: MenasDAO = mock(classOf[MenasDAO]) - implicit protected val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2017-11-01") + implicit protected val progArgs: ConformanceConfig = ConformanceConfig(reportDate = "2017-11-01") protected val upperRule1 = UppercaseConformanceRule(order = 1, inputColumn = "strings.with_new_lines", controlCheckpoint = false, outputColumn = "strings.with_new_lines_upper") diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/StreamingFixture.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/StreamingFixture.scala index d979b2fce..91f241c88 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/StreamingFixture.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/fixtures/StreamingFixture.scala @@ -15,22 +15,22 @@ package za.co.absa.enceladus.conformance.interpreter.fixtures -import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.execution.streaming.MemoryStream -import org.apache.spark.sql.streaming.Trigger +import org.apache.spark.sql.{DataFrame, Row} import org.scalatest.FunSuite import org.scalatest.mockito.MockitoSugar -import za.co.absa.enceladus.conformance.{ConfCmdConfig, HyperConformance} import za.co.absa.enceladus.conformance.interpreter.FeatureSwitches -import za.co.absa.enceladus.conformance.streaming.{InfoDateFactory, InfoDateLiteralFactory} +import za.co.absa.enceladus.conformance.streaming.InfoDateFactory +import za.co.absa.enceladus.conformance.HyperConformance +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.Dataset import za.co.absa.enceladus.utils.testUtils.SparkTestBase trait StreamingFixture extends FunSuite with SparkTestBase with MockitoSugar { implicit val menasBaseUrls: List[String] = List.empty - implicit val cmd: ConfCmdConfig = ConfCmdConfig.apply(reportVersion = Some(1)) + implicit val cmd: ConformanceConfig = ConformanceConfig(reportVersion = Some(1)) protected def testHyperConformance(input: DataFrame, sinkTableName: String, diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleSuite.scala index 9e24009ee..dbc56aba3 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/CastingRuleSuite.scala @@ -19,7 +19,7 @@ import org.apache.spark.sql.types._ import org.mockito.Mockito.{mock, when => mockWhen} import org.scalatest.FunSuite import org.slf4j.event.Level.ERROR -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches, RuleValidators} import za.co.absa.enceladus.conformance.samples.CastingRuleSamples import za.co.absa.enceladus.dao.MenasDAO @@ -36,7 +36,7 @@ class CastingRuleSuite extends FunSuite with SparkTestBase with LoggerTestBase { val inputDf = spark.read.schema(CastingRuleSamples.ordersSchema).json(CastingRuleSamples.ordersData.toDS) implicit val dao: MenasDAO = 
mock(classOf[MenasDAO]) - implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2017-11-01") + implicit val progArgs: ConformanceConfig = ConformanceConfig(reportDate = "2017-11-01") val experimentalMR = true val isCatalystWorkaroundEnabled = true val enableCF: Boolean = false diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleSuite.scala index 4662e2296..515738963 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/NegationRuleSuite.scala @@ -19,7 +19,7 @@ import org.apache.spark.sql.Dataset import org.mockito.Mockito.{mock, when => mockWhen} import org.scalatest.FunSuite import org.slf4j.event.Level.ERROR -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches} import za.co.absa.enceladus.conformance.samples.NegationRuleSamples import za.co.absa.enceladus.dao.MenasDAO @@ -110,7 +110,7 @@ class NegationRuleSuite extends FunSuite with SparkTestBase with LoggerTestBase{ val inputDf = spark.read.schema(schema).json(inputDataset) implicit val dao: MenasDAO = mock(classOf[MenasDAO]) - implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2017-11-01") + implicit val progArgs: ConformanceConfig = ConformanceConfig(reportDate = "2017-11-01") val experimentalMR = true val isCatalystWorkaroundEnabled = true val enableCF: Boolean = false diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/RulesSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/RulesSuite.scala index fdfb5a208..725cd6172 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/RulesSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/RulesSuite.scala @@ -19,7 +19,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.scalatest.FunSuite -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.ExplosionState import za.co.absa.enceladus.conformance.samples.EmployeeConformance import za.co.absa.enceladus.dao.MenasDAO @@ -32,7 +32,7 @@ class RulesSuite extends FunSuite with SparkTestBase { private val dummyInterpreter = new RuleInterpreter { override def conformanceRule: Option[ConformanceRule] = None def conform(df: Dataset[Row]) - (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = df + (implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = df } test("Test country code join condition") { diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/TestRuleBehaviors.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/TestRuleBehaviors.scala index c85de793f..4fd1bdb07 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/TestRuleBehaviors.scala +++ 
b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/TestRuleBehaviors.scala @@ -19,7 +19,7 @@ import org.apache.spark.sql.DataFrame import org.mockito.Mockito.{mock, when => mockWhen} import org.scalatest.FunSuite import org.slf4j.event.Level._ -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, FeatureSwitches} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.Dataset @@ -30,7 +30,7 @@ trait TestRuleBehaviors extends FunSuite with SparkTestBase with LoggerTestBase def conformanceRuleShouldMatchExpected(inputDf: DataFrame, inputDataset: Dataset, expectedJSON: String) { implicit val dao: MenasDAO = mock(classOf[MenasDAO]) - implicit val progArgs: ConfCmdConfig = ConfCmdConfig(reportDate = "2017-11-01") + implicit val progArgs: ConformanceConfig = ConformanceConfig(reportDate = "2017-11-01") val experimentalMR = true val isCatalystWorkaroundEnabled = true val enableCF: Boolean = false diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/custom/CustomRuleSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/custom/CustomRuleSuite.scala index 3dbde2dca..1121245ef 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/custom/CustomRuleSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/custom/CustomRuleSuite.scala @@ -19,7 +19,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.mockito.Mockito.mock import org.scalatest.FunSuite -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.rules.RuleInterpreter import za.co.absa.enceladus.conformance.interpreter.{DynamicInterpreter, ExplosionState, FeatureSwitches} import za.co.absa.enceladus.dao.MenasDAO @@ -41,7 +41,7 @@ case class MyCustomRule( case class MyCustomRuleInterpreter(rule: MyCustomRule) extends RuleInterpreter { override def conformanceRule: Option[ConformanceRule] = Some(rule) - def conform(df: Dataset[Row])(implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConfCmdConfig): Dataset[Row] = { + def conform(df: Dataset[Row])(implicit spark: SparkSession, explosionState: ExplosionState, dao: MenasDAO, progArgs: ConformanceConfig): Dataset[Row] = { import spark.implicits._ // we have to do this if this rule is to support arrays handleArrays(rule.outputColumn, df) { flattened => @@ -60,7 +60,7 @@ class CustomRuleSuite extends FunSuite with SparkTestBase { // we may WANT to enable control framework & spline here - implicit val progArgs: ConfCmdConfig = ConfCmdConfig() // here we may need to specify some parameters (for certain rules) + implicit val progArgs: ConformanceConfig = ConformanceConfig() // here we may need to specify some parameters (for certain rules) implicit val dao: MenasDAO = mock(classOf[MenasDAO]) // you may have to hard-code your own implementation here (if not working with menas) val experimentalMR = true val isCatalystWorkaroundEnabled = true diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/NestedTestCaseFactory.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/NestedTestCaseFactory.scala 
index ee264ed31..fbf547f10 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/NestedTestCaseFactory.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/NestedTestCaseFactory.scala @@ -19,7 +19,7 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, types} import org.mockito.Mockito.{mock, when => mockWhen} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.{Always, FeatureSwitches, Never} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, MappingConformanceRule} @@ -228,14 +228,14 @@ class NestedTestCaseFactory(implicit spark: SparkSession) { */ def getTestCase(experimentalMappingRule: Boolean, enableMappingRuleBroadcasting: Boolean, - conformanceRules: ConformanceRule*): (DataFrame, Dataset, MenasDAO, ConfCmdConfig, FeatureSwitches) = { + conformanceRules: ConformanceRule*): (DataFrame, Dataset, MenasDAO, ConformanceConfig, FeatureSwitches) = { val inputDf = spark.read .schema(testCaseSchema) .json(getClass.getResource("/interpreter/mappingCases/nestedDf.json").getPath) val dataset = getDataSetWithConformanceRules(testCaseDataset, conformanceRules: _*) - val cmdConfig = ConfCmdConfig(reportDate = reportDate) + val cmdConfig = ConformanceConfig(reportDate = reportDate) val dao = mock(classOf[MenasDAO]) mockWhen(dao.getDataset(testCaseName, 1)) thenReturn testCaseDataset diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/SimpleTestCaseFactory.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/SimpleTestCaseFactory.scala index 1eae015de..84f508c1b 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/SimpleTestCaseFactory.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/conformance/interpreter/rules/testcasefactories/SimpleTestCaseFactory.scala @@ -19,7 +19,7 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} import org.mockito.Mockito.{mock, when => mockWhen} -import za.co.absa.enceladus.conformance.ConfCmdConfig +import za.co.absa.enceladus.conformance.config.ConformanceConfig import za.co.absa.enceladus.conformance.interpreter.{Always, FeatureSwitches, Never} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.conformanceRule.{ConformanceRule, MappingConformanceRule} @@ -138,10 +138,10 @@ class SimpleTestCaseFactory(implicit spark: SparkSession) { */ def getTestCase(experimentalMappingRule: Boolean, enableMappingRuleBroadcasting: Boolean, - conformanceRules: ConformanceRule*): (DataFrame, Dataset, MenasDAO, ConfCmdConfig, FeatureSwitches) = { + conformanceRules: ConformanceRule*): (DataFrame, Dataset, MenasDAO, ConformanceConfig, FeatureSwitches) = { val inputDf = spark.read.schema(testCaseSchema).json(testCaseDataJson.toDS) val dataset = getDataSetWithConformanceRules(testCaseDataset, conformanceRules: _*) - val cmdConfig = ConfCmdConfig(reportDate = reportDate) + val cmdConfig = ConformanceConfig(reportDate = reportDate) val dao = mock(classOf[MenasDAO]) mockWhen(dao.getDataset(testCaseName, 1)) thenReturn 
testCaseDataset diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolAsciiSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolAsciiSuite.scala index c114f2a49..2081ac3ef 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolAsciiSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolAsciiSuite.scala @@ -23,6 +23,7 @@ import org.scalatest.mockito.MockitoSugar import org.scalatest.{Outcome, fixture} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.Dataset +import za.co.absa.enceladus.standardization.config.StandardizationConfig import za.co.absa.enceladus.standardization.fixtures.TempFileFixture import za.co.absa.enceladus.utils.testUtils.SparkTestBase @@ -32,6 +33,8 @@ class StandardizationCobolAsciiSuite extends fixture.FunSuite with SparkTestBase private implicit val dao: MenasDAO = mock[MenasDAO] + private val standardizationReader = new StandardizationPropertiesProvider() + private val tmpFilePrefix = "cobol-fix-ascii-" private val tmpFileSuffix = ".dat" @@ -69,8 +72,8 @@ class StandardizationCobolAsciiSuite extends fixture.FunSuite with SparkTestBase private def getTestDataFrame(tmpFileName: String, args: Array[String] ): DataFrame = { - val cmd: StdCmdConfig = StdCmdConfig.getCmdLineArguments(argumentsBase ++ args) - val cobolReader = StandardizationJob.getFormatSpecificReader(cmd, dataSet, schema.fields.length) + val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(argumentsBase ++ args) + val cobolReader = standardizationReader.getFormatSpecificReader(cmd, dataSet, schema.fields.length) cobolReader .option("copybook_contents", copybook) .load(tmpFileName) diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolEbcdicSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolEbcdicSuite.scala index 46e0b6fd2..59a3ec86c 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolEbcdicSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationCobolEbcdicSuite.scala @@ -21,6 +21,7 @@ import org.scalatest.mockito.MockitoSugar import org.scalatest.{Outcome, fixture} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.Dataset +import za.co.absa.enceladus.standardization.config.StandardizationConfig import za.co.absa.enceladus.standardization.fixtures.TempFileFixture import za.co.absa.enceladus.utils.testUtils.SparkTestBase @@ -30,6 +31,8 @@ class StandardizationCobolEbcdicSuite extends fixture.FunSuite with SparkTestBas private implicit val dao: MenasDAO = mock[MenasDAO] + private val standardizationReader = new StandardizationPropertiesProvider() + private val tmpFilePrefix = "cobol-fix-ebcdic-" private val tmpFileSuffix = ".dat" @@ -70,8 +73,8 @@ class StandardizationCobolEbcdicSuite extends fixture.FunSuite with SparkTestBas private def getTestDataFrame(tmpFileName: String, args: Array[String] ): DataFrame = { - val cmd: StdCmdConfig = StdCmdConfig.getCmdLineArguments(argumentsBase ++ args) - val cobolReader = StandardizationJob.getFormatSpecificReader(cmd, dataSet, schema.fields.length) + val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(argumentsBase ++ args) + val cobolReader = standardizationReader.getFormatSpecificReader(cmd, dataSet, schema.fields.length) cobolReader 
.option("copybook_contents", copybook) .load(tmpFileName) diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationFixedWidthSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationFixedWidthSuite.scala index e77298cb0..44e51f04a 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationFixedWidthSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationFixedWidthSuite.scala @@ -1,10 +1,27 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package za.co.absa.enceladus.standardization import org.apache.spark.sql.types.{DataType, StructType} import org.scalatest.FunSuite import org.scalatest.mockito.MockitoSugar +import org.slf4j.{Logger, LoggerFactory} import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.Dataset +import za.co.absa.enceladus.standardization.config.StandardizationConfig import za.co.absa.enceladus.standardization.interpreter.StandardizationInterpreter import za.co.absa.enceladus.standardization.interpreter.stages.PlainSchemaGenerator import za.co.absa.enceladus.utils.fs.FileReader @@ -14,7 +31,7 @@ import za.co.absa.enceladus.utils.udf.UDFLibrary class StandardizationFixedWidthSuite extends FunSuite with SparkTestBase with MockitoSugar{ private implicit val udfLibrary:UDFLibrary = new UDFLibrary() - + private val log: Logger = LoggerFactory.getLogger(this.getClass) private val argsBase = ("--dataset-name Foo --dataset-version 1 --report-date 2020-06-22 --report-version 1 " + "--menas-auth-keytab src/test/resources/user.keytab.example " + "--raw-format fixed-width").split(" ") @@ -28,9 +45,9 @@ class StandardizationFixedWidthSuite extends FunSuite with SparkTestBase with Mo ).asInstanceOf[StructType] test("Reading data from FixedWidth input") { - val cmd = StdCmdConfig.getCmdLineArguments(argsBase) + val cmd = StandardizationConfig.getFromArguments(argsBase) - val fixedWidthReader = StandardizationJob.getFormatSpecificReader(cmd, dataSet) + val fixedWidthReader = new StandardizationPropertiesProvider().getFormatSpecificReader(cmd, dataSet) val inputSchema = PlainSchemaGenerator.generateInputSchema(baseSchema) val reader = fixedWidthReader.schema(inputSchema) @@ -47,9 +64,9 @@ class StandardizationFixedWidthSuite extends FunSuite with SparkTestBase with Mo } test("Reading data from FixedWidth input trimmed") { - val cmd = StdCmdConfig.getCmdLineArguments(argsBase ++ Array("--trimValues", "true")) + val cmd = StandardizationConfig.getFromArguments(argsBase ++ Array("--trimValues", "true")) - val fixedWidthReader = StandardizationJob.getFormatSpecificReader(cmd, dataSet) + val fixedWidthReader = new StandardizationPropertiesProvider().getFormatSpecificReader(cmd, dataSet) val inputSchema = PlainSchemaGenerator.generateInputSchema(baseSchema) val reader = fixedWidthReader.schema(inputSchema) diff --git 
a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationJsonSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationJsonSuite.scala index f5cd49663..278076f22 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationJsonSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationJsonSuite.scala @@ -18,8 +18,10 @@ package za.co.absa.enceladus.standardization import org.apache.spark.sql.types.{DataType, StructType} import org.scalatest.FunSuite import org.scalatest.mockito.MockitoSugar +import org.slf4j.Logger import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.Dataset +import za.co.absa.enceladus.standardization.config.StandardizationConfig import za.co.absa.enceladus.standardization.interpreter.StandardizationInterpreter import za.co.absa.enceladus.standardization.interpreter.stages.PlainSchemaGenerator import za.co.absa.enceladus.utils.fs.FileReader @@ -30,6 +32,8 @@ import za.co.absa.enceladus.utils.udf.UDFLibrary class StandardizationJsonSuite extends FunSuite with SparkTestBase with MockitoSugar{ private implicit val udfLibrary:UDFLibrary = new UDFLibrary() + private val standardizationReader = new StandardizationPropertiesProvider() + test("Reading data from JSON input, also such that don't adhere to desired schema") { implicit val dao: MenasDAO = mock[MenasDAO] @@ -39,9 +43,9 @@ class StandardizationJsonSuite extends FunSuite with SparkTestBase with MockitoS "--raw-format json").split(" ") val dataSet = Dataset("SpecialChars", 1, None, "", "", "SpecialChars", 1, conformance = Nil) - val cmd = StdCmdConfig.getCmdLineArguments(args) + val cmd = StandardizationConfig.getFromArguments(args) - val csvReader = StandardizationJob.getFormatSpecificReader(cmd, dataSet) + val csvReader = standardizationReader.getFormatSpecificReader(cmd, dataSet) val baseSchema: StructType = DataType.fromJson( FileReader.readFileAsString("src/test/resources/data/standardization_json_suite_schema.json") diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationParquetSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationParquetSuite.scala index cf502913a..0e758922e 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationParquetSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationParquetSuite.scala @@ -21,9 +21,11 @@ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types._ import org.scalatest.mockito.MockitoSugar import org.scalatest.{Outcome, fixture} +import org.slf4j.Logger import za.co.absa.enceladus.common.RecordIdGeneration.IdType import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.Dataset +import za.co.absa.enceladus.standardization.config.StandardizationConfig import za.co.absa.enceladus.standardization.fixtures.TempFileFixture import za.co.absa.enceladus.standardization.interpreter.StandardizationInterpreter import za.co.absa.enceladus.standardization.interpreter.stages.TypeParserException @@ -38,6 +40,7 @@ class StandardizationParquetSuite extends fixture.FunSuite with SparkTestBase wi import spark.implicits._ import za.co.absa.enceladus.utils.implicits.DataFrameImplicits.DataFrameEnhancements + private val standardizationReader = new StandardizationPropertiesProvider() private implicit val dao: MenasDAO = mock[MenasDAO] private implicit val udfLibrary:UDFLibrary = 
new UDFLibrary() @@ -62,9 +65,9 @@ class StandardizationParquetSuite extends fixture.FunSuite with SparkTestBase wi /** Creates a dataframe from an input file name path and command line arguments to Standardization */ private def getTestDataFrame(tmpFileName: String, - args: Array[String]): (StdCmdConfig, DataFrame) = { - val cmd: StdCmdConfig = StdCmdConfig.getCmdLineArguments(args) - val csvReader = StandardizationJob.getFormatSpecificReader(cmd, dataSet) + args: Array[String]): (StandardizationConfig, DataFrame) = { + val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(args) + val csvReader = standardizationReader.getFormatSpecificReader(cmd, dataSet) (cmd, csvReader.load(tmpFileName).orderBy("id")) } diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationRerunSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationRerunSuite.scala index a261214e0..a57e50a70 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationRerunSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StandardizationRerunSuite.scala @@ -22,8 +22,10 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.scalatest.mockito.MockitoSugar import org.scalatest.{Outcome, fixture} +import org.slf4j.Logger import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.Dataset +import za.co.absa.enceladus.standardization.config.StandardizationConfig import za.co.absa.enceladus.standardization.fixtures.TempFileFixture import za.co.absa.enceladus.standardization.interpreter.StandardizationInterpreter import za.co.absa.enceladus.utils.error.ErrorMessage @@ -38,6 +40,8 @@ class StandardizationRerunSuite extends fixture.FunSuite with SparkTestBase with private implicit val udfLib: UDFLibrary = new UDFLibrary private implicit val dao: MenasDAO = mock[MenasDAO] + private val standardizationReader = new StandardizationPropertiesProvider() + private val tmpDirPrefix = "StdRerunTest" private val tmpFilePrefix = "test-input-" private val tmpFileSuffix = ".csv" @@ -65,8 +69,8 @@ class StandardizationRerunSuite extends fixture.FunSuite with SparkTestBase with "--menas-auth-keytab src/test/resources/user.keytab.example " + "--raw-format csv --header false --delimiter |").split(" ") - val cmd: StdCmdConfig = StdCmdConfig.getCmdLineArguments(args) - StandardizationJob + val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(args) + standardizationReader .getFormatSpecificReader(cmd, dataSet, schemaWithStringType.fields.length) .schema(schemaWithStringType) .load(tmpFileName) diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StdConfigSuite.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/config/StandardizationParserSuite.scala similarity index 82% rename from spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StdConfigSuite.scala rename to spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/config/StandardizationParserSuite.scala index ffa432660..524e0180b 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/StdConfigSuite.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/config/StandardizationParserSuite.scala @@ -13,16 +13,17 @@ * limitations under the License. 
*/ -package za.co.absa.enceladus.standardization +package za.co.absa.enceladus.standardization.config import java.time.ZonedDateTime import org.scalatest.FunSuite import za.co.absa.enceladus.dao.auth.{MenasKerberosCredentials, MenasPlainCredentials} import za.co.absa.enceladus.model.Dataset +import za.co.absa.enceladus.standardization.StandardizationExecution import za.co.absa.enceladus.utils.testUtils.SparkTestBase -class StdConfigSuite extends FunSuite with SparkTestBase { +class StandardizationParserSuite extends FunSuite with SparkTestBase { private val year = "2018" private val month = "12" @@ -52,6 +53,8 @@ class StdConfigSuite extends FunSuite with SparkTestBase { private val rawFormat = "parquet" private val folderPrefix = s"year=$year/month=$month/day=$day" + private object TestStandardization extends StandardizationExecution + test("Test credentials file parsing "){ val credentials = MenasPlainCredentials.fromFile(menasCredentialsFile) @@ -66,7 +69,7 @@ class StdConfigSuite extends FunSuite with SparkTestBase { } test("folder-prefix parameter") { - val cmdConfigNoFolderPrefix = StdCmdConfig.getCmdLineArguments( + val cmdConfigNoFolderPrefix = StandardizationConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -86,7 +89,7 @@ class StdConfigSuite extends FunSuite with SparkTestBase { assert(cmdConfigNoFolderPrefix.rawPathOverride.isEmpty) assert(actualPlainMenasCredentials === menasCredentials) - val cmdConfigFolderPrefix = StdCmdConfig.getCmdLineArguments( + val cmdConfigFolderPrefix = StandardizationConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -127,7 +130,7 @@ class StdConfigSuite extends FunSuite with SparkTestBase { userDisabled, List() ) - val cmdConfigNoFolderPrefix = StdCmdConfig.getCmdLineArguments( + val cmdConfigNoFolderPrefix = StandardizationConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -135,7 +138,7 @@ class StdConfigSuite extends FunSuite with SparkTestBase { "--report-version", reportVersion.toString, "--menas-credentials-file", menasCredentialsFile, "--raw-format", rawFormat)) - val cmdConfigFolderPrefix = StdCmdConfig.getCmdLineArguments( + val cmdConfigFolderPrefix = StandardizationConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -144,7 +147,7 @@ class StdConfigSuite extends FunSuite with SparkTestBase { "--menas-credentials-file", menasCredentialsFile, "--folder-prefix", folderPrefix, "--raw-format", rawFormat)) - val cmdConfigRawPathOverride = StdCmdConfig.getCmdLineArguments( + val cmdConfigRawPathOverride = StandardizationConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -153,7 +156,7 @@ class StdConfigSuite extends FunSuite with SparkTestBase { "--menas-credentials-file", menasCredentialsFile, "--debug-set-raw-path", hdfsRawPathOverride, "--raw-format", rawFormat)) - val cmdConfigRawPathOverrideAndFolderPrefix = StdCmdConfig.getCmdLineArguments( + val cmdConfigRawPathOverrideAndFolderPrefix = StandardizationConfig.getFromArguments( Array( "--dataset-name", datasetName, "--dataset-version", datasetVersion.toString, @@ -164,18 +167,17 @@ class StdConfigSuite extends FunSuite with SparkTestBase { "--debug-set-raw-path", hdfsRawPathOverride, "--raw-format", rawFormat)) - - val publishPathNoFolderPrefix = 
StandardizationJob.buildRawPath(cmdConfigNoFolderPrefix, standardiseDataset, - dateTokens, cmdConfigNoFolderPrefix.reportVersion.get) + val publishPathNoFolderPrefix = TestStandardization.buildRawPath(cmdConfigNoFolderPrefix, standardiseDataset, + cmdConfigNoFolderPrefix.reportVersion.get) assert(publishPathNoFolderPrefix === s"${standardiseDataset.hdfsPath}/${dateTokens(0)}/${dateTokens(1)}/${dateTokens(2)}/v${cmdConfigNoFolderPrefix.reportVersion.get}") - val publishPathFolderPrefix = StandardizationJob.buildRawPath(cmdConfigFolderPrefix, standardiseDataset, - dateTokens, cmdConfigFolderPrefix.reportVersion.get) + val publishPathFolderPrefix = TestStandardization.buildRawPath(cmdConfigFolderPrefix, standardiseDataset, + cmdConfigFolderPrefix.reportVersion.get) assert(publishPathFolderPrefix === s"${standardiseDataset.hdfsPath}/$folderPrefix/${dateTokens(0)}/${dateTokens(1)}/${dateTokens(2)}/v${cmdConfigFolderPrefix.reportVersion.get}") - val publishPathRawPathOverride = StandardizationJob.buildRawPath(cmdConfigRawPathOverride, standardiseDataset, - dateTokens, cmdConfigRawPathOverride.reportVersion.get) + val publishPathRawPathOverride = TestStandardization.buildRawPath(cmdConfigRawPathOverride, standardiseDataset, + cmdConfigRawPathOverride.reportVersion.get) assert(publishPathRawPathOverride === hdfsRawPathOverride) - val publishPathRawPathOverrideAndFolderPrefix = StandardizationJob.buildRawPath(cmdConfigRawPathOverrideAndFolderPrefix, - standardiseDataset, dateTokens, cmdConfigRawPathOverrideAndFolderPrefix.reportVersion.get) + val publishPathRawPathOverrideAndFolderPrefix = TestStandardization.buildRawPath(cmdConfigRawPathOverrideAndFolderPrefix, + standardiseDataset, cmdConfigRawPathOverrideAndFolderPrefix.reportVersion.get) assert(publishPathRawPathOverrideAndFolderPrefix === hdfsRawPathOverride) } diff --git a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/fixtures/CsvFileFixture.scala b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/fixtures/CsvFileFixture.scala index e54cb9036..2d59a6dad 100644 --- a/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/fixtures/CsvFileFixture.scala +++ b/spark-jobs/src/test/scala/za/co/absa/enceladus/standardization/fixtures/CsvFileFixture.scala @@ -17,16 +17,19 @@ package za.co.absa.enceladus.standardization.fixtures import java.io.File import java.nio.charset.{Charset, StandardCharsets} + import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.scalatest.mockito.MockitoSugar import za.co.absa.enceladus.dao.MenasDAO import za.co.absa.enceladus.model.Dataset -import za.co.absa.enceladus.standardization.{StandardizationJob, StdCmdConfig} +import za.co.absa.enceladus.standardization.StandardizationPropertiesProvider +import za.co.absa.enceladus.standardization.config.StandardizationConfig import za.co.absa.enceladus.utils.testUtils.SparkTestBase trait CsvFileFixture extends MockitoSugar with TempFileFixture with SparkTestBase { private implicit val dao: MenasDAO = mock[MenasDAO] + private val standardizationReader = new StandardizationPropertiesProvider() type FixtureParam = String private val tmpFilePrefix = "special-characters" @@ -57,11 +60,11 @@ trait CsvFileFixture extends MockitoSugar with TempFileFixture with SparkTestBas dataSet: Dataset, schema: StructType ): DataFrame = { - val cmd: StdCmdConfig = StdCmdConfig.getCmdLineArguments(args) + val cmd: StandardizationConfig = StandardizationConfig.getFromArguments(args) val 
csvReader = if (checkMaxColumns) { - StandardizationJob.getFormatSpecificReader(cmd, dataSet, schema.fields.length) + standardizationReader.getFormatSpecificReader(cmd, dataSet, schema.fields.length) } else { - StandardizationJob.getFormatSpecificReader(cmd, dataSet) + standardizationReader.getFormatSpecificReader(cmd, dataSet) } csvReader .schema(schema) diff --git a/utils/src/main/scala/za/co/absa/enceladus/utils/modules/SourcePhase.scala b/utils/src/main/scala/za/co/absa/enceladus/utils/modules/SourcePhase.scala new file mode 100644 index 000000000..643b2a85e --- /dev/null +++ b/utils/src/main/scala/za/co/absa/enceladus/utils/modules/SourcePhase.scala @@ -0,0 +1,43 @@ +/* + * Copyright 2018 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.enceladus.utils.modules + +/** + * Stands to represent the source part (standardization or conformance) regardless of the Job class + */ +sealed trait SourcePhase { + val value: String + + def asIdentifier: String = value.toLowerCase +} + +object SourcePhase { + def withIdentifier(name: String): SourcePhase = { + name match { + case "conformance" => SourcePhase.Conformance + case "standardization" => SourcePhase.Standardization + case _ => throw new NoSuchElementException(s"No value found for '$name'") + } + } + + case object Standardization extends SourcePhase { + val value = "Standardization" + } + + case object Conformance extends SourcePhase { + val value = "Conformance" + } +}
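The new SourcePhase enumeration above closes the patch. Below is a minimal usage sketch of the identifier round-trip it provides; the wrapper object and its main method are illustrative only and are not part of the change.

import za.co.absa.enceladus.utils.modules.SourcePhase

object SourcePhaseUsageSketch {
  def main(args: Array[String]): Unit = {
    // withIdentifier resolves the lower-case identifier back to its case object.
    val phase: SourcePhase = SourcePhase.withIdentifier("standardization")
    assert(phase == SourcePhase.Standardization)

    // asIdentifier lower-cases the value, so identifier and phase round-trip.
    assert(phase.asIdentifier == "standardization")

    // Any other identifier is rejected with a NoSuchElementException:
    // SourcePhase.withIdentifier("publish")  // would throw
  }
}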
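Stepping back to the new StandardizationConfig / StandardizationParser pair earlier in the patch: it replaces StdCmdConfig.getCmdLineArguments with tryFromArguments / getFromArguments, composing the standardization-specific scopt options with JobConfigParser and cross-checking format-specific flags in checkConfigX. A rough sketch of how that entry point might be exercised is shown below; the wrapper object and the concrete argument values are illustrative (they mirror the arguments used in the test suites above), not part of the change.

import scala.util.{Failure, Success}
import za.co.absa.enceladus.standardization.config.StandardizationConfig

object StandardizationConfigUsageSketch {
  def main(args: Array[String]): Unit = {
    val goodArgs = ("--dataset-name Foo --dataset-version 1 --report-date 2020-06-22 --report-version 1 " +
      "--menas-auth-keytab src/test/resources/user.keytab.example " +
      "--raw-format csv --header true").split(" ")

    // tryFromArguments wraps scopt's OParser.parse in a Try; a parse or cross-field
    // validation failure surfaces as ConfigError("Command line parameters error").
    StandardizationConfig.tryFromArguments(goodArgs) match {
      case Success(cfg)   => println(s"raw format = ${cfg.rawFormat}, csv header = ${cfg.csvHeader}")
      case Failure(error) => println(s"rejected: ${error.getMessage}")
    }

    // Format-specific options are validated against the chosen raw format: for example,
    // --row-tag is only accepted together with --raw-format xml, so this parse fails.
    val badArgs = goodArgs ++ Array("--row-tag", "ROW")
    assert(StandardizationConfig.tryFromArguments(badArgs).isFailure)
  }
}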