# ---- SAMPLE YAML OF JOB CONFIG FILE ---- #
# !MANDATORY! Metrics and metric directories to be executed
metrics:
  - /path/to/metric-1
  - /path/to/metric-2
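# Each entry above points to a metric file (or a directory of metric files).
# For reference, a minimal metric file might look roughly like this -
# a sketch only, based on the metorikku docs, not part of this job config:
#   steps:
#     - dataFrameName: df1
#       sql: SELECT * FROM input_1
#   output:
#     - dataFrameName: df1
#       outputType: Parquet
#       outputOptions:
#         saveMode: Overwrite
#         path: df1.parquet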
# Input configuration
inputs:
  input_1:
    file:
      path: parquet/input_1.parquet
  input_2:
    file:
      # The path of the file; multiple files can be passed, separated by , (see the example below)
      path: json/input_2.csv
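      # e.g. to read several files at once (hypothetical file names):
      # path: json/part-0.json,json/part-1.json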
      # Optional: if omitted, the format is guessed from the extension (falling back to parquet)
      format: csv
      # Optional: define a custom schema via a JSON schema file (https://json-schema.org/)
      schemaPath: schema/schema.json
      # Optional: pass any Spark-supported option to the reader
      options:
        quoteAll: false
      # Optional: define a stream reader that can be used to read streaming data
      isStream: true
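      # Streaming inputs such as this one are controlled by the `streaming`
      # section at the bottom of this file.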
  input_3:
    file_date_range:
      template: parquet/%s/input_1.parquet
      date_range:
        format: yyyy/MM/dd
        startDate: 2017/09/01
        endDate: 2017/09/03
      # Below are optional (check out the file input example above)
      format: parquet
      schemaPath: schema/schema.json
      options:
        opt: val
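      # With the template and date_range above, %s is replaced by each date
      # between startDate and endDate (presumably inclusive), so this input reads:
      #   parquet/2017/09/01/input_1.parquet
      #   parquet/2017/09/02/input_1.parquet
      #   parquet/2017/09/03/input_1.parquet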
  input_4:
    jdbc:
      connectionUrl: jdbc:mysql://localhost/db?zeroDateTimeBehavior=convertToNull
      user: user
      password: pass
      table: some_table
      # You can optionally add here any supported option from https://spark.apache.org/docs/latest/sql-programming-guide.html#jdbc-to-other-databases
      options:
        numPartitions: 100
        driver: com.mysql.jdbc.Driver
  input_5:
    kafka:
      servers:
        - localhost:9092
      topic: some_topic
      schemaRegistryUrl: https://schema-registry-url # optional
      schemaSubject: subject # optional
  input_6:
    cassandra:
      host: 127.0.0.1
      user: user
      password: password
      table: table
      keySpace: keySpace
      options:
  input_7:
    elasticsearch:
      nodes: localhost:9200
      user: user
      password: password
      index: index
  input_8:
    mongo:
      uri: mongodb://localhost:27017
      database: test
      collection: users
# Set custom variables that will be accessible from the SQL
variables:
  StartDate: 2017/09/01
  EndDate: 2017/09/20
  TrimmedDateFormat: yyyy/MM/dd
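# Variables can then be referenced from metric SQL, e.g. (assuming metorikku's
# ${...} substitution syntax):
#   SELECT * FROM input_1 WHERE date BETWEEN '${StartDate}' AND '${EndDate}'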
output:
  # elasticsearch database arguments: host (host:port, under the nodes option) is mandatory.
  elasticsearch:
    nodes: localhost:9200
    user: user
    password: password
  # cassandra database arguments: host is mandatory; username and password are supported
  cassandra:
    host: example.cassandra.db
    username: user
    password: password
  # Redshift database arguments: jdbcURL and tempS3Dir are mandatory.
  redshift:
    jdbcURL: jdbc:redshift://<IP>:<PORT>/file?user=username&password=pass
    tempS3Dir: s3://path/to/redshift/temp/dir/
  # Redis database arguments: host is mandatory; port, auth and db are supported
  redis:
    host: hostname
    port: port-number
    auth: authentication
    db: database
  # Segment API key
  segment:
    apiKey: apikey
  # Output file directory
  file:
    dir: /path/to/parquet/output
  # JDBC database
  jdbc:
    connectionUrl: "jdbc:postgresql://localhost:5432/databasename"
    user: username
    password: password
    driver: "org.postgresql.Driver"
  # Apache Hudi
  hudi:
    dir: /path/to/parquet/output
    # Optional: controls the parallelism of hudi writes (should be similar to shuffle partitions; default is 1500)
    parallelism: 1
    # Optional: upsert/insert/bulkinsert (default is upsert)
    operation: upsert
    # Optional: COPY_ON_WRITE/MERGE_ON_READ (default is COPY_ON_WRITE)
    storageType: COPY_ON_WRITE
    # Optional: maximum number of versions to retain
    maxVersions: 1
    # Optional: Hive database to use when writing (default is default)
    hiveDB: default
    # Hive server URL
    hiveJDBCURL: jdbc:hive2://hive:10000
    # Optional: credentials for hive
    hiveUserName: root
    hivePassword: pass
    # Optional: toggle hudi hive sync
    hiveSync: false
    # Optional: let metorikku take control of the hive sync process (used to support Hive1)
    manualHiveSync: true
    # Optional: when manualHiveSync is enabled, define your partitions manually here
    manualHiveSyncPartitions:
      part: 0
    # Optional: extra options (http://hudi.incubator.apache.org/configurations.html)
    options:
      ....
# You can also use named outputs (all outputs above are supported)
outputs:
  fileDir1:
    file:
      dir: /path/to/parquet/output
  fileDir2:
    file:
      dir: /path/to/parquet/output2
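# A metric's output section can then target one of these by name - sketch below;
# the exact `name` field is an assumption based on the metorikku docs:
#   output:
#     - dataFrameName: df1
#       outputType: File
#       name: fileDir1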
# If set to true, runs explain before saving
explain: true
# Shows a preview of the output (number of lines to show)
showPreviewLines: 42
# Prints the query after running it
showQuery: true
# Caches the step before each preview
cacheOnPreview: true
# Set log level: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
logLevel: WARN
# Set an application name to prefix spark instrumentation counters
appName: appName
# Set instrumentation writer (default is spark metrics)
instrumentation:
  influxdb:
    url: http://localhost:8086
    username: username
    password: password
    dbName: test
# Optionally set catalog parameters (for hive support)
catalog:
  database: some_database
# Set options for streaming writes
streaming:
  # Set the trigger mode (ProcessingTime, Once, Continuous)
  triggerMode: ProcessingTime
  # If the trigger is ProcessingTime/Continuous, set the trigger duration
  triggerDuration: 10 seconds
  # Possible values are append/update/complete
  outputMode: append
  # Where to save Spark's checkpoint
  checkpointLocation: /tmp/checkpoint
  # Optionally set streaming to use foreachBatch when writing streams.
  # This enables writing with all available writers and writing to multiple outputs.
  batchMode: true
  # Add any other options supported by the DataStreamWriter
  extraOptions:
    opt: val
# Optional: controls caching and counting on each output (default is true)
cacheCountOnOutput: false
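# A job config like this is typically passed to metorikku through spark-submit,
# e.g. (jar path is a placeholder):
#   spark-submit --class com.yotpo.metorikku.Metorikku metorikku.jar -c job_config_sample.yaml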