# ---- SAMPLE YAML OF JOB CONFIG FILE ---- #
# !MANDATORY! Metrics and metric directories to be executed
metrics:
  - /path/to/metric-1
  - /path/to/metric-2
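# Each entry above points to a metric file (or a directory of metric files).
# For reference, a minimal metric file might look roughly like this -
# a sketch only, based on the metorikku docs, not part of this job config:
#   steps:
#     - dataFrameName: df1
#       sql: SELECT * FROM input_1
#   output:
#     - dataFrameName: df1
#       outputType: Parquet
#       outputOptions:
#         saveMode: Overwrite
#         path: df1.parquet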
# Input configuration
inputs:
  input_1:
    file:
      path: parquet/input_1.parquet
  input_2:
    file:
      # The path of the file; multiple files can be passed, separated by , (see the example below)
      path: json/input_2.csv
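      # e.g. to read several files at once (hypothetical file names):
      # path: json/part-0.json,json/part-1.json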
      # Optional: if omitted, the format is guessed from the extension (falling back to parquet)
      format: csv
      # Optional: define a custom schema via a JSON schema file (https://json-schema.org/)
      schemaPath: schema/schema.json
      # Optional: pass any Spark-supported option to the reader
      options:
        quoteAll: false
      # Optional: define a stream reader that can be used to read streaming data
      isStream: true
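      # Streaming inputs such as this one are controlled by the `streaming`
      # section at the bottom of this file.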
  input_3:
    file_date_range:
      template: parquet/%s/input_1.parquet
      date_range:
        format: yyyy/MM/dd
        startDate: 2017/09/01
        endDate: 2017/09/03
      # Below are optional (check out the file input example above)
      format: parquet
      schemaPath: schema/schema.json
      options:
        opt: val
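      # With the template and date_range above, %s is replaced by each date
      # between startDate and endDate (presumably inclusive), so this input reads:
      #   parquet/2017/09/01/input_1.parquet
      #   parquet/2017/09/02/input_1.parquet
      #   parquet/2017/09/03/input_1.parquet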
  input_4:
    jdbc:
      connectionUrl: jdbc:mysql://localhost/db?zeroDateTimeBehavior=convertToNull
      user: user
      password: pass
      table: some_table
      # You can optionally add here any supported option from https://spark.apache.org/docs/latest/sql-programming-guide.html#jdbc-to-other-databases
      options:
        numPartitions: 100
        driver: com.mysql.jdbc.Driver
  input_5:
    kafka:
      servers:
        - localhost:9092
      topic: some_topic
      schemaRegistryUrl: https://schema-registry-url # optional
      schemaSubject: subject # optional
  input_6:
    cassandra:
      host: 127.0.0.1
      user: user
      password: password
      table: table
      keySpace: keySpace
      options:
  input_7:
    elasticsearch:
      nodes: localhost:9200
      user: user
      password: password
      index: index
  input_8:
    mongo:
      uri: mongodb://localhost:27017
      database: test
      collection: users
# Set custom variables that will be accessible from the SQL
variables:
  StartDate: 2017/09/01
  EndDate: 2017/09/20
  TrimmedDateFormat: yyyy/MM/dd
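# Variables can then be referenced from metric SQL, e.g. (assuming metorikku's
# ${...} substitution syntax):
#   SELECT * FROM input_1 WHERE date BETWEEN '${StartDate}' AND '${EndDate}'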
output:
  # elasticsearch database arguments: host (host:port, under the nodes option) is mandatory.
  elasticsearch:
    nodes: localhost:9200
    user: user
    password: password
  # cassandra database arguments: host is mandatory; username and password are supported
  cassandra:
    host: example.cassandra.db
    username: user
    password: password
  # Redshift database arguments: jdbcURL and tempS3Dir are mandatory.
  redshift:
    jdbcURL: jdbc:redshift://<IP>:<PORT>/file?user=username&password=pass
    tempS3Dir: s3://path/to/redshift/temp/dir/
  # Redis database arguments: host is mandatory; port, auth and db are supported
  redis:
    host: hostname
    port: port-number
    auth: authentication
    db: database
  # Segment API key
  segment:
    apiKey: apikey
  # Output file directory
  file:
    dir: /path/to/parquet/output
  # JDBC database
  jdbc:
    connectionUrl: "jdbc:postgresql://localhost:5432/databasename"
    user: username
    password: password
    driver: "org.postgresql.Driver"
  # Apache Hudi
  hudi:
    dir: /path/to/parquet/output
    # Optional: controls the parallelism of hudi writes (should be similar to shuffle partitions; default is 1500)
    parallelism: 1
    # Optional: upsert/insert/bulkinsert (default is upsert)
    operation: upsert
    # Optional: COPY_ON_WRITE/MERGE_ON_READ (default is COPY_ON_WRITE)
    storageType: COPY_ON_WRITE
    # Optional: maximum number of versions to retain
    maxVersions: 1
    # Optional: Hive database to use when writing (default is default)
    hiveDB: default
    # Hive server URL
    hiveJDBCURL: jdbc:hive2://hive:10000
    # Optional: credentials for hive
    hiveUserName: root
    hivePassword: pass
    # Optional: toggle hudi hive sync
    hiveSync: false
    # Optional: let metorikku take control of the hive sync process (used to support Hive1)
    manualHiveSync: true
    # Optional: when manualHiveSync is enabled, define your partitions manually here
    manualHiveSyncPartitions:
      part: 0
    # Optional: extra options (http://hudi.incubator.apache.org/configurations.html)
    options:
      ....
# You can also use named outputs (all outputs above are supported)
outputs:
  fileDir1:
    file:
      dir: /path/to/parquet/output
  fileDir2:
    file:
      dir: /path/to/parquet/output2
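# A metric's output section can then target one of these by name - sketch below;
# the exact `name` field is an assumption based on the metorikku docs:
#   output:
#     - dataFrameName: df1
#       outputType: File
#       name: fileDir1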
# If set to true, runs explain before saving
explain: true
# Shows a preview of the output (number of lines to show)
showPreviewLines: 42
# Prints the query after running it
showQuery: true
# Caches the step before each preview
cacheOnPreview: true
# Set log level: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
logLevel: WARN
# Set an application name to prefix spark instrumentation counters
appName: appName
# Set instrumentation writer (default is spark metrics)
instrumentation:
  influxdb:
    url: http://localhost:8086
    username: username
    password: password
    dbName: test
# Optionally set catalog parameters (for hive support)
catalog:
  database: some_database
# Set options for streaming writes
streaming:
  # Set the trigger mode (ProcessingTime, Once, Continuous)
  triggerMode: ProcessingTime
  # If the trigger is ProcessingTime/Continuous, set the trigger duration
  triggerDuration: 10 seconds
  # Possible values are append/update/complete
  outputMode: append
  # Where to save Spark's checkpoint
  checkpointLocation: /tmp/checkpoint
  # Optionally set streaming to use foreachBatch when writing streams.
  # This enables writing with all available writers and writing to multiple outputs.
  batchMode: true
  # Add any other options supported by the DataStreamWriter
  extraOptions:
    opt: val
# Optional: controls caching and counting on each output (default is true)
cacheCountOnOutput: false
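# A job config like this is typically passed to metorikku through spark-submit,
# e.g. (jar path is a placeholder):
#   spark-submit --class com.yotpo.metorikku.Metorikku metorikku.jar -c job_config_sample.yaml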