117 changes: 117 additions & 0 deletions acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml
@@ -0,0 +1,117 @@
bundle:
  name: wal-chain-test

resources:
  jobs:
    # Linear chain: job_01 -> job_02 -> ... -> job_10
    # Execution order: job_01 first, job_10 last
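    # Each job's description references ${resources.jobs.<previous>.id}, which
    # is what creates the deploy-order dependency between consecutive jobs
    # (see the depends_on labels recorded in the WAL output below).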
    job_01:
      name: "job-01"
      description: "first in chain"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_02:
      name: "job-02"
      description: "depends on ${resources.jobs.job_01.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_03:
      name: "job-03"
      description: "depends on ${resources.jobs.job_02.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_04:
      name: "job-04"
      description: "depends on ${resources.jobs.job_03.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_05:
      name: "job-05"
      description: "depends on ${resources.jobs.job_04.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_06:
      name: "job-06"
      description: "depends on ${resources.jobs.job_05.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_07:
      name: "job-07"
      description: "depends on ${resources.jobs.job_06.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_08:
      name: "job-08"
      description: "depends on ${resources.jobs.job_07.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_09:
      name: "job-09"
      description: "depends on ${resources.jobs.job_08.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_10:
      name: "job-10"
      description: "depends on ${resources.jobs.job_09.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
5 changes: 5 additions & 0 deletions acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml

(generated file; contents not rendered)

73 changes: 73 additions & 0 deletions acceptance/bundle/deploy/wal/chain-10-jobs/output.txt
@@ -0,0 +1,73 @@
=== First deploy (crashes on job_10) ===

>>> errcode [CLI] bundle deploy
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files...
Deploying resources...
[PROCESS_KILLED]

Exit code: [KILLED]

=== WAL content after crash ===
{"lineage":"[UUID]","serial": [SERIAL]}
{"k":"resources.jobs.job_01","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"first in chain","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-01","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]}}}
{"k":"resources.jobs.job_02","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-02","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_01","label":"${resources.jobs.job_01.id}"}]}}
{"k":"resources.jobs.job_03","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-03","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_02","label":"${resources.jobs.job_02.id}"}]}}
{"k":"resources.jobs.job_04","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-04","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_03","label":"${resources.jobs.job_03.id}"}]}}
{"k":"resources.jobs.job_05","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-05","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_04","label":"${resources.jobs.job_04.id}"}]}}
{"k":"resources.jobs.job_06","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-06","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_05","label":"${resources.jobs.job_05.id}"}]}}
{"k":"resources.jobs.job_07","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-07","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_06","label":"${resources.jobs.job_06.id}"}]}}
{"k":"resources.jobs.job_08","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-08","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_07","label":"${resources.jobs.job_07.id}"}]}}
{"k":"resources.jobs.job_09","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-09","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_08","label":"${resources.jobs.job_08.id}"}]}}

=== Number of jobs saved in WAL ===
9

=== Bundle summary (reads from WAL) ===
Name: wal-chain-test
Target: default
Workspace:
  User: [USERNAME]
  Path: /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default
Resources:
  Jobs:
    job_01:
      Name: job-01
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_02:
      Name: job-02
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_03:
      Name: job-03
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_04:
      Name: job-04
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_05:
      Name: job-05
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_06:
      Name: job-06
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_07:
      Name: job-07
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_08:
      Name: job-08
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_09:
      Name: job-09
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_10:
      Name: job-10
      URL: (not deployed)

=== Second deploy (recovery) ===

>>> [CLI] bundle deploy --force-lock
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files...
Deploying resources...
Updating deployment state...
Deployment complete!

=== WAL after successful deploy ===
WAL deleted (expected)
22 changes: 22 additions & 0 deletions acceptance/bundle/deploy/wal/chain-10-jobs/script
@@ -0,0 +1,22 @@
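# Deploy a 10-job chain. test.toml kills the CLI on the 10th jobs/create call,
# so the first nine jobs should already be recorded in the WAL when the
# process dies.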
echo "=== First deploy (crashes on job_10) ==="
trace errcode $CLI bundle deploy

echo ""
echo "=== WAL content after crash ==="
cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "No WAL file"

echo ""
echo "=== Number of jobs saved in WAL ==="
grep -c '"k":"resources.jobs' .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "0"

echo ""
echo "=== Bundle summary (reads from WAL) ==="
$CLI bundle summary

echo ""
echo "=== Second deploy (recovery) ==="
trace $CLI bundle deploy --force-lock

echo ""
echo "=== WAL after successful deploy ==="
cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "WAL deleted (expected)"
2 changes: 2 additions & 0 deletions acceptance/bundle/deploy/wal/chain-10-jobs/test.py
@@ -0,0 +1,2 @@
# Placeholder for Spark task
print("Hello from test job")
21 changes: 21 additions & 0 deletions acceptance/bundle/deploy/wal/chain-10-jobs/test.toml
@@ -0,0 +1,21 @@
# Linear chain: job_01 -> job_02 -> ... -> job_10
# Let the first nine jobs/create calls succeed, then kill the CLI on the tenth
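# (Assumption about the test-server knobs: KillCallerOffset = 9 presumably lets
# the first nine matching requests through, and KillCaller = 1 then kills the
# calling CLI process, simulating a crash mid-deploy.)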

[[Server]]
Pattern = "POST /api/2.2/jobs/create"
KillCallerOffset = 9
KillCaller = 1
Response.Body = '{"job_id": 1001}'

[[Server]]
Pattern = "POST /api/2.2/jobs/reset"
Response.Body = '{}'

[[Server]]
Pattern = "GET /api/2.2/jobs/get"
Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}'

# Strip single-node cluster warnings
[[Repls]]
Old = '(?s)Warning: Single node cluster is not correctly configured.*?ResourceClass: SingleNode\n \n\n'
New = ''
25 changes: 25 additions & 0 deletions acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml
@@ -0,0 +1,25 @@
bundle:
  name: wal-corrupted-test

resources:
  jobs:
    valid_job:
      name: "valid-job"
      tasks:
        - task_key: "task-a"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    another_valid:
      name: "another-valid"
      tasks:
        - task_key: "task-b"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0

56 changes: 56 additions & 0 deletions acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt
@@ -0,0 +1,56 @@
=== Creating state file with serial 5 ===
=== Creating WAL with corrupted entry ===
=== WAL content ===
{"lineage":"test-lineage-123","serial": [SERIAL]}
{"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}}
not valid json - this line should be skipped
{"k":"resources.jobs.another_valid","v":{"__id__": "[ID]","state":{"name":"another-valid"}}}
=== Deploy (should recover valid entries, skip corrupted) ===

>>> [CLI] bundle deploy
Warning: Single node cluster is not correctly configured
  at resources.jobs.another_valid.tasks[0].new_cluster
  in databricks.yml:23:13

num_workers should be 0 only for single-node clusters. To create a
valid single node cluster please ensure that the following properties
are correctly set in the cluster specification:

  spark_conf:
    spark.databricks.cluster.profile: singleNode
    spark.master: local[*]

  custom_tags:
    ResourceClass: SingleNode


Warning: Single node cluster is not correctly configured
  at resources.jobs.valid_job.tasks[0].new_cluster
  in databricks.yml:13:13

num_workers should be 0 only for single-node clusters. To create a
valid single node cluster please ensure that the following properties
are correctly set in the cluster specification:

  spark_conf:
    spark.databricks.cluster.profile: singleNode
    spark.master: local[*]

  custom_tags:
    ResourceClass: SingleNode


Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files...
Deploying resources...
Updating deployment state...
Deployment complete!
=== Final state (should have recovered entries) ===
{
  "serial": [SERIAL],
  "state_keys": [
    "resources.jobs.another_valid",
    "resources.jobs.valid_job"
  ]
}
=== WAL after successful deploy ===
WAL deleted (expected)
35 changes: 35 additions & 0 deletions acceptance/bundle/deploy/wal/corrupted-wal-entry/script
@@ -0,0 +1,35 @@
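# Seed a committed state file (serial 5) and a WAL at serial 6 that contains
# one corrupted line; the deploy should replay the two valid entries and skip
# the bad one.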
echo "=== Creating state file with serial 5 ==="
mkdir -p .databricks/bundle/default
cat > .databricks/bundle/default/resources.json << 'EOF'
{
  "state_version": 1,
  "cli_version": "0.0.0",
  "lineage": "test-lineage-123",
  "serial": 5,
  "state": {}
}
EOF

echo "=== Creating WAL with corrupted entry ==="
cat > .databricks/bundle/default/resources.json.wal << 'EOF'
{"lineage":"test-lineage-123","serial":6}
{"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}}
not valid json - this line should be skipped
{"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}}
EOF

echo "=== WAL content ==="
cat .databricks/bundle/default/resources.json.wal

echo "=== Deploy (should recover valid entries, skip corrupted) ==="
trace $CLI bundle deploy 2>&1 | python3 sort_warnings.py

echo "=== Final state (should have recovered entries) ==="
cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}'

echo "=== WAL after successful deploy ==="
if [ -f ".databricks/bundle/default/resources.json.wal" ]; then
  echo "WAL exists (unexpected)"
else
  echo "WAL deleted (expected)"
fi