
(maint) Decrease healthcheck max wait to 5m
 - Reduce the start period by 2m and increase the retry window by 1m,
   reducing the overall wait time from 6m to 5m (see the timing sketch
   below)

 - This time only needs to be longer than the Postgres startup; Postgres
   has its healthcheck timeout defined in docker-compose (given it's the
   stock container), but typically starts under LCOW in about 2m

   The additional time allotted covers migrations

 - Restructure the scripted waiter to wait for services in the correct
   order (matching pe-puppetdb), and to ensure that it roughly accounts
   for time already elapsed rather than continually adding to the total
   wait time.

   Even though SSL is not enabled here, the order is kept the same as
   pe-puppetdb for consistency:

   puppetserver
   postgres

   Once Postgres is up, this container can start!
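
   A minimal sketch of the timing arithmetic referenced above, using the
   HEALTHCHECK values from the Dockerfile diff below (a container is only
   marked unhealthy once the start period has elapsed and all retries
   have failed):

   # maximum wait ~= start-period + retries * interval
   old=$(( 5*60 + 6*10 ))    # 300s + 60s  = 360s (6m)
   new=$(( 3*60 + 12*10 ))   # 180s + 120s = 300s (5m)
   echo "old=${old}s new=${new}s"
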
Iristyle committed Oct 19, 2019
1 parent b9be170 commit e2c7220
Showing 2 changed files with 25 additions and 15 deletions.
8 changes: 4 additions & 4 deletions puppetdb/Dockerfile
@@ -94,11 +94,11 @@ CMD ["services"]

COPY docker/puppetdb/healthcheck.sh /
RUN chmod +x /healthcheck.sh
# The start-period is just a wild guess how long it takes PuppetDB to come
# up in the worst case. The other timing parameters are set so that it
# takes at most a minute to realize that PuppetDB has failed.
# The start-period describes how long it takes PuppetDB to come
# up in the worst case (LCOW). The other timing parameters are set so that it
# takes at most 2 minutes to realize that PuppetDB has failed.
# Probe failure during --start-period will not be counted towards the maximum number of retries
HEALTHCHECK --start-period=5m --interval=10s --timeout=10s --retries=6 CMD ["/healthcheck.sh"]
HEALTHCHECK --start-period=3m --interval=10s --timeout=10s --retries=12 CMD ["/healthcheck.sh"]

# VOLUME definitions are always at end of Dockerfile to address an LCOW bug
# https://github.com/moby/moby/issues/39892
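To observe how these HEALTHCHECK parameters behave in practice, Docker's
recorded health state can be polled; a hypothetical example, assuming a
running container named puppetdb (the name is illustrative):

# reports "starting" during --start-period, then "healthy" or "unhealthy"
docker inspect --format '{{.State.Health.Status}}' puppetdb
# shows the most recent probe results, including /healthcheck.sh output
docker inspect --format '{{json .State.Health.Log}}' puppetdb
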
32 changes: 21 additions & 11 deletions puppetdb/docker-entrypoint.d/20-wait-for-hosts.sh
@@ -4,10 +4,16 @@
#
#
# Optional environment variables:
# PUPPETDB_WAITFORHOST_SECONDS Number of seconds to wait for host, defaults to 30
# PUPPETDB_WAITFORPOSTGRES_SECONDS Number of seconds to wait on Postgres, defaults to 150
# PUPPETDB_WAITFORHOST_SECONDS Number of seconds to wait for DNS names of
# Postgres and Puppetserver to resolve, defaults to 30
# PUPPETDB_WAITFORHEALTH_SECONDS Number of seconds to wait for health
# checks of Consul / Puppetserver to succeed, defaults to 600
# checks of Consul / Puppetserver to succeed, defaults to 360
# to match puppetserver healthcheck max wait
# PUPPETDB_WAITFORPOSTGRES_SECONDS Additional number of seconds to wait on Postgres,
# after PuppetServer is healthy, defaults to 60
# PUPPETDB_POSTGRES_HOSTNAME Specified in Dockerfile, defaults to postgres
# PUPPETSERVER_HOSTNAME DNS name of puppetserver to wait on, defaults to puppet


msg() {
echo "($0) $1"
@@ -25,7 +31,7 @@ wait_for_host_name_resolution() {
# k8s nodes may not be reachable with a ping
# performing a dig prior to a host may help prime the cache in Alpine
# https://github.com/Microsoft/opengcs/issues/303
/wtfc.sh --timeout=$PUPPETDB_WAITFORHOST_SECONDS --interval=1 --progress "dig $1 && host -t A $1"
/wtfc.sh --timeout="${2}" --interval=1 --progress "dig $1 && host -t A $1"
# additionally log the DNS lookup information for diagnostic purposes
NAME_RESOLVED=$?
dig $1
@@ -36,30 +42,31 @@

wait_for_host_port() {
# -v verbose -w connect / final net read timeout -z scan and don't send data
/wtfc.sh --timeout=${3:-$PUPPETDB_WAITFORHOST_SECONDS} --interval=1 --progress "nc -v -w 1 -z '${1}' ${2}"
/wtfc.sh --timeout=${3} --interval=1 --progress "nc -v -w 1 -z '${1}' ${2}"
if [ $? -ne 0 ]; then
error "host $1:$2 does not appear to be listening"
fi
}

PUPPETDB_WAITFORHOST_SECONDS=${PUPPETDB_WAITFORHOST_SECONDS:-30}
PUPPETDB_WAITFORPOSTGRES_SECONDS=${PUPPETDB_WAITFORPOSTGRES_SECONDS:-150}
PUPPETDB_WAITFORHEALTH_SECONDS=${PUPPETDB_WAITFORHEALTH_SECONDS:-600}
PUPPETDB_WAITFORPOSTGRES_SECONDS=${PUPPETDB_WAITFORPOSTGRES_SECONDS:-60}
PUPPETDB_WAITFORHEALTH_SECONDS=${PUPPETDB_WAITFORHEALTH_SECONDS:-360}
PUPPETDB_POSTGRES_HOSTNAME="${PUPPETDB_POSTGRES_HOSTNAME:-postgres}"
PUPPETSERVER_HOSTNAME="${PUPPETSERVER_HOSTNAME:-puppet}"
CONSUL_HOSTNAME="${CONSUL_HOSTNAME:-consul}"
CONSUL_PORT="${CONSUL_PORT:-8500}"

wait_for_host_name_resolution $PUPPETDB_POSTGRES_HOSTNAME
wait_for_host_port $PUPPETDB_POSTGRES_HOSTNAME "${PUPPETDB_POSTGRES_PORT:-5432}" $PUPPETDB_WAITFORPOSTGRES_SECONDS
# wait for postgres DNS
wait_for_host_name_resolution $PUPPETDB_POSTGRES_HOSTNAME $PUPPETDB_WAITFORHOST_SECONDS

# wait for consul / puppetserver DNS, then healthcheck
if [ "$USE_PUPPETSERVER" = true ]; then
wait_for_host_name_resolution $PUPPETSERVER_HOSTNAME
wait_for_host_name_resolution $PUPPETSERVER_HOSTNAME $PUPPETDB_WAITFORHOST_SECONDS
HEALTH_COMMAND="curl --silent --fail --insecure 'https://${PUPPETSERVER_HOSTNAME}:"${PUPPETSERVER_PORT:-8140}"/status/v1/simple' | grep -q '^running$'"
fi

if [ "$CONSUL_ENABLED" = "true" ]; then
wait_for_host_name_resolution $CONSUL_HOSTNAME
wait_for_host_name_resolution $CONSUL_HOSTNAME $PUPPETDB_WAITFORHOST_SECONDS
# with Consul enabled, wait on Consul instead of Puppetserver
HEALTH_COMMAND="curl --silent --fail 'http://${CONSUL_HOSTNAME}:${CONSUL_PORT}/v1/health/checks/puppet' | grep -q '\\"\""Status"\\\"": \\"\""passing\\"\""'"
fi
@@ -70,3 +77,6 @@ if [ -n "$HEALTH_COMMAND" ]; then
error "Required health check failed"
fi
fi

# wait for postgres
wait_for_host_port $PUPPETDB_POSTGRES_HOSTNAME "${PUPPETDB_POSTGRES_PORT:-5432}" $PUPPETDB_WAITFORPOSTGRES_SECONDS
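
The new defaults can still be overridden at runtime via the environment
variables documented at the top of this script; a hypothetical example
(image name and values are illustrative only):

docker run -d \
  -e PUPPETDB_WAITFORHOST_SECONDS=30 \
  -e PUPPETDB_WAITFORHEALTH_SECONDS=360 \
  -e PUPPETDB_WAITFORPOSTGRES_SECONDS=60 \
  -e PUPPETDB_POSTGRES_HOSTNAME=postgres \
  -e PUPPETSERVER_HOSTNAME=puppet \
  puppet/puppetdb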
