Skip to content

Commit

Permalink
fix(k8s): boot node long sync (#9610)
Browse files Browse the repository at this point in the history
If the boot node has a lot to catch up on during a long sync, it can be
killed by the liveness probe before it has a chance to finish.

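Hit a major, somewhat unexplained gotcha: for some reason, once a
startupProbe was introduced, self-introspecting DNS (the pod resolving its
own DNS name) did NOT work. I couldn't find documentation for it, but it
seems that DNS waits for the startupProbe to pass (presumably because the
pod's DNS record is only published once the pod is Ready). I agreed with
Mitch that using POD_IP instead works just fine.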
  • Loading branch information
ludamad authored Oct 31, 2024
1 parent 04dd2c4 commit 1b85840
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 36 deletions.
28 changes: 13 additions & 15 deletions spartan/aztec-network/templates/boot-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,16 @@ spec:
"-c",
"sleep 30 && source /shared/contracts.env && env && node --no-warnings /usr/src/yarn-project/aztec/dest/bin/index.js start --node --archiver --sequencer --pxe",
]
startupProbe:
httpGet:
path: /status
port: {{ .Values.bootNode.service.nodePort }}
periodSeconds: {{ .Values.bootNode.startupProbe.periodSeconds }}
failureThreshold: {{ .Values.bootNode.startupProbe.failureThreshold }}
livenessProbe:
exec:
command:
- /bin/sh
- -c
- curl -fSs http://127.0.0.1:{{ .Values.bootNode.service.nodePort }}/status
httpGet:
path: /status
port: {{ .Values.bootNode.service.nodePort }}
initialDelaySeconds: 30
periodSeconds: 5
timeoutSeconds: 30
Expand All @@ -89,16 +93,10 @@ spec:
subPath: contracts.env
{{- end }}
env:
- name: POD_NAME
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_DNS_NAME
value: "$(POD_NAME).{{ include "aztec-network.fullname" . }}-boot-node.$(POD_NAMESPACE).svc.cluster.local"
fieldPath: status.podIP
- name: PORT
value: "{{ .Values.bootNode.service.nodePort }}"
- name: LOG_LEVEL
Expand All @@ -123,13 +121,13 @@ spec:
{{- if .Values.bootNode.externalTcpHost }}
value: "{{ .Values.bootNode.externalTcpHost }}:{{ .Values.bootNode.service.p2pTcpPort }}"
{{- else }}
value: "$(POD_DNS_NAME):{{ .Values.bootNode.service.p2pTcpPort }}"
value: "$(POD_IP):{{ .Values.bootNode.service.p2pTcpPort }}"
{{- end }}
- name: P2P_UDP_ANNOUNCE_ADDR
{{- if .Values.bootNode.externalUdpHost }}
value: "{{ .Values.bootNode.externalUdpHost }}:{{ .Values.bootNode.service.p2pUdpPort }}"
{{- else }}
value: "$(POD_DNS_NAME):{{ .Values.bootNode.service.p2pUdpPort }}"
value: "$(POD_IP):{{ .Values.bootNode.service.p2pUdpPort }}"
{{- end }}
- name: P2P_TCP_LISTEN_ADDR
value: "0.0.0.0:{{ .Values.bootNode.service.p2pTcpPort }}"
Expand Down
12 changes: 3 additions & 9 deletions spartan/aztec-network/templates/prover-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,10 @@ spec:
- name: shared-volume
mountPath: /shared
env:
- name: POD_NAME
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_DNS_NAME
value: "$(POD_NAME).{{ include "aztec-network.fullname" . }}-prover-node.$(POD_NAMESPACE).svc.cluster.local"
fieldPath: status.podIP
- name: PORT
value: "{{ .Values.proverNode.service.nodePort }}"
- name: LOG_LEVEL
Expand All @@ -102,7 +96,7 @@ spec:
- name: PROVER_COORDINATION_NODE_URL
value: {{ include "aztec-network.bootNodeUrl" . | quote }}
- name: PROVER_JOB_SOURCE_URL
value: "http://$(POD_DNS_NAME):{{ .Values.proverNode.service.nodePort }}"
value: "http://$(POD_IP):{{ .Values.proverNode.service.nodePort }}"
ports:
- containerPort: {{ .Values.proverNode.service.nodePort }}
resources:
Expand Down
30 changes: 20 additions & 10 deletions spartan/aztec-network/templates/validator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,20 +78,30 @@ spec:
- "/bin/bash"
- "-c"
- "sleep 10 && source /shared/contracts.env && env && node --no-warnings /usr/src/yarn-project/aztec/dest/bin/index.js start --node --archiver --sequencer"
startupProbe:
httpGet:
path: /status
port: {{ .Values.validator.service.nodePort }}
failureThreshold: {{ .Values.validator.startupProbe.failureThreshold }}
periodSeconds: {{ .Values.validator.startupProbe.periodSeconds }}
livenessProbe:
exec:
command:
- /bin/sh
- -c
- curl -fSs http://127.0.0.1:{{ .Values.validator.service.nodePort }}/status
initialDelaySeconds: 30
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 3
volumeMounts:
- name: shared-volume
mountPath: /shared
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_DNS_NAME
value: "$(POD_NAME).{{ include "aztec-network.fullname" . }}-validator.$(POD_NAMESPACE).svc.cluster.local"
fieldPath: status.podIP
- name: PORT
value: "{{ .Values.validator.service.nodePort }}"
- name: LOG_LEVEL
Expand All @@ -114,13 +124,13 @@ spec:
{{- if .Values.validator.externalTcpHost }}
value: "{{ .Values.validator.externalTcpHost }}:{{ .Values.validator.service.p2pTcpPort }}"
{{- else }}
value: "$(POD_DNS_NAME):{{ .Values.validator.service.p2pTcpPort }}"
value: "$(POD_IP):{{ .Values.validator.service.p2pTcpPort }}"
{{- end }}
- name: P2P_UDP_ANNOUNCE_ADDR
{{- if .Values.validator.externalUdpHost }}
value: "{{ .Values.validator.externalUdpHost }}:{{ .Values.validator.service.p2pUdpPort }}"
{{- else }}
value: "$(POD_DNS_NAME):{{ .Values.validator.service.p2pUdpPort }}"
value: "$(POD_IP):{{ .Values.validator.service.p2pUdpPort }}"
{{- end }}
- name: P2P_TCP_LISTEN_ADDR
value: "0.0.0.0:{{ .Values.validator.service.p2pTcpPort }}"
Expand Down
10 changes: 10 additions & 0 deletions spartan/aztec-network/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ bootNode:
memory: "2Gi"
cpu: "200m"
deployContracts: true # Set to false to use manual contract addresses
startupProbe:
periodSeconds: 10
# Only call startup botched after 20 minutes of consecutive probe failures
# (failureThreshold 120 x periodSeconds 10s = 1200s). This gives enough time to sync.
failureThreshold: 120
contracts:
rollupAddress: ""
registryAddress: ""
Expand Down Expand Up @@ -75,6 +80,11 @@ validator:
disabled: false
p2p:
enabled: "true"
startupProbe:
periodSeconds: 10
# Only call startup botched after 20 minutes of consecutive probe failures
# (failureThreshold 120 x periodSeconds 10s = 1200s). This gives enough time to sync.
failureThreshold: 120
resources:
requests:
memory: "2Gi"
Expand Down
3 changes: 1 addition & 2 deletions yarn-project/aztec/src/cli/cmds/start_node.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ export const startNode = async (
options: any,
signalHandlers: (() => Promise<void>)[],
userLog: LogFn,
// ): Promise<ServerList> => {
) => {
): Promise<ServerList> => {
// Services that will be started in a single multi-rpc server
const services: ServerList = [];

Expand Down

0 comments on commit 1b85840

Please sign in to comment.