Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add customizable liveness/readiness/startup probe endpoints #1363

Merged
merged 8 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions app/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ const _ = require('lodash');
const addRequestId = require('express-request-id')();
const {router, initialize} = require('./routes/index.js');
const log = require('./log').createLogger('razeedash-api/app/index');
const DefaultProbes = require('./utils/probes/probe-default.js');
const port = 3333;

// Set ipv4first (changed in Node 18)
Expand Down Expand Up @@ -151,6 +152,7 @@ function onListening() {
const addr = server.address();
const bind = typeof addr === 'string' ? `pipe ${addr}` : `port ${addr.port}`;
log.info(`🏄 razeedash-api listening on ${bind}/api`);
DefaultProbes.setStartupComplete(true);
}

function onError(error) {
Expand Down
48 changes: 25 additions & 23 deletions app/routes/kube/kube.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2019 IBM Corp. All Rights Reserved.
* Copyright 2019,2024 IBM Corp. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,36 +15,38 @@
*/
const express = require('express');
const asyncHandler = require('express-async-handler');
const probeUtil = require('../../utils/probes');

const router = express.Router();
const { GraphqlPubSub } = require('../../apollo/subscription');
const pubSub = GraphqlPubSub.getInstance();
const logger = require('../../log').createLogger('razeedash-api/kube/liveness');
const timeInterval = 300000; //5 mintues

// /kube/liveness
router.get('/liveness', asyncHandler(async(req, res) => {
// does a db call to make sure we didnt disconnect
router.get('/startup', asyncHandler(async (req, res) => {
try {
await require('../../apollo/models').models.Organization.findOne({});
} catch (err) {
logger.error(err, 'razeedash-api liveness probe failed due to a mongo connection issue');
return res.sendStatus(503);
const payload = await probeUtil.getStartupPayload(req);
return res.status(200).send(payload);
}
catch (e) {
return res.status(503).send('service unavailable');
}
}));

// TODO: not real pub-sub liveness test yet, will add later
if (pubSub.initRetries > 5) {
// if the remote redis is not ready after 5 initial retries, then
// it is better to restart this pod, return 500 error
logger.error('Razeedash Api is down due to Redis pubsub connection issue, please check logs.');
return res.sendStatus(503);
router.get('/readiness', asyncHandler(async (req, res) => {
try {
const payload = await probeUtil.getReadinessPayload(req);
return res.status(200).send(payload);
}
catch (e) {
return res.status(503).send('service unavailable');
}
}));

if (pubSub.lastPubSubMessage !== null && Date.now()- pubSub.lastPubSubMessage.time > timeInterval) {
// check if the most recent message received is within ${timeInterval/60000} minitue
logger.error(`Razeedash Api is down, haven't received any published messages within ${timeInterval/60000} minitue, please check logs.`);
return res.sendStatus(503);
router.get('/liveness', asyncHandler(async(req, res) => {
try {
const payload = await probeUtil.getLivenessPayload(req);
return res.status(200).send(payload);
}
catch (e) {
return res.status(503).send('service unavailable');
}
return res.sendStatus(200);
}));

module.exports = router;
55 changes: 55 additions & 0 deletions app/utils/probes/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/**
* Copyright 2024 IBM Corp. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

const PROBE_DEFAULT_IMPL = require( './probe-default.js' );
const PROBE_CUSTOM_IMPL = require( process.env.PROBE_IMPL || './probe-none.js' );

/*
  Resolve the payload for a single probe type (`method` is one of
  getStartupPayload / getReadinessPayload / getLivenessPayload):
    Get the default probe payload.
    If default probe impl throws an error, the error propagates (probe fails).
    If module specified by PROBE_IMPL implements the probe, get the custom probe payload.
    If custom probe impl throws an error, the error propagates (probe fails).
    Return the custom payload, or the default payload if there is no custom impl.
*/
async function resolveProbePayload( method, context ) {
  // The default probe always runs first and exactly once, so a custom impl
  // can only add checks, never bypass the built-in ones.
  const defaultPayload = await PROBE_DEFAULT_IMPL[method](context);
  if( Object.prototype.hasOwnProperty.call(PROBE_CUSTOM_IMPL, method) ) {
    return await PROBE_CUSTOM_IMPL[method](context);
  }
  return defaultPayload;
}

/*
  Public probe API used by the /kube routes. Each function takes the request
  context, resolves to a payload string on success and rejects when the probe
  should fail.
*/
const PROBE_IMPL = {
  getStartupPayload: async function( context ) {
    return resolveProbePayload('getStartupPayload', context);
  },
  getReadinessPayload: async function( context ) {
    return resolveProbePayload('getReadinessPayload', context);
  },
  getLivenessPayload: async function( context ) {
    return resolveProbePayload('getLivenessPayload', context);
  }
};

module.exports = PROBE_IMPL;
60 changes: 60 additions & 0 deletions app/utils/probes/probe-default.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/**
* Copyright 2024 IBM Corp. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

const Models = require('../../apollo/models');
const { GraphqlPubSub } = require('../../apollo/subscription');
const pubSub = GraphqlPubSub.getInstance();
// Maximum allowed age, in milliseconds, of the most recent pub-sub message
// before the liveness probe is failed.
const timeInterval = 300000; // 5 minutes

// Flipped via setStartupComplete(); the startup probe fails until it is truthy.
let STARTUP_COMPLETE = false;

/**
 * Default startup probe.
 *
 * @returns {Promise<string>} success payload once startup has been flagged complete
 * @throws {Error} 'startup incomplete' while the flag is still falsy
 */
async function getStartupPayload() {
  if (STARTUP_COMPLETE) {
    return 'startup probe successful';
  }
  throw new Error('startup incomplete');
}

/**
 * Default readiness probe: performs no checks and always succeeds.
 *
 * @returns {Promise<string>} fixed success payload
 */
async function getReadinessPayload() {
  const payload = 'readiness probe successful';
  return payload;
}

/**
 * Default liveness probe.
 *
 * Checks, in order:
 *   1. a database query still succeeds,
 *   2. pub-sub has not exceeded 5 initial connection retries,
 *   3. if a pub-sub message has been recorded, it is no older than `timeInterval`.
 *
 * @returns {Promise<string>} success payload when every check passes
 * @throws {Error} describing the first failed check; for a database failure
 *   the original error is attached as `cause`
 */
async function getLivenessPayload() {
  // Does a db call to make sure we didn't disconnect
  try {
    await Models.models.Organization.findOne({});
  } catch (err) {
    throw new Error(
      `Razeedash-api liveness probe failed due to a mongo connection issue: ${err.message}`,
      { cause: err }
    );
  }

  // TODO: not real pub-sub liveness test yet, will add later
  if (pubSub.initRetries > 5) {
    // If the remote redis is not ready after 5 initial retries, fail the
    // probe so that this pod can be restarted.
    throw new Error('Razeedash-api liveness probe failed due to Redis pubsub connection issue, please check logs');
  }

  // Only enforce message freshness once a message has actually been recorded;
  // treat both null and undefined as "no message yet".
  const lastMessage = pubSub.lastPubSubMessage;
  const hasMessage = lastMessage !== null && lastMessage !== undefined;
  if (hasMessage && Date.now() - lastMessage.time > timeInterval) {
    // The most recent message is older than timeInterval (timeInterval/60000 minutes)
    throw new Error(`Razeedash-api is down, haven't received any published messages within ${timeInterval/60000} minutes, please check logs`);
  }

  // Return a payload like the other default probes do
  return 'liveness probe successful';
}

/**
 * Record whether server startup has finished; read by the startup probe.
 * Called from app/index.js when server is ready to receive traffic.
 *
 * @param {boolean} b - new value of the startup-complete flag
 */
function setStartupComplete(b) {
  STARTUP_COMPLETE = b;
  return undefined;
}

module.exports = { getLivenessPayload, getReadinessPayload, getStartupPayload, setStartupComplete };
19 changes: 19 additions & 0 deletions app/utils/probes/probe-none.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/**
* Copyright 2024 IBM Corp. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// empty implementation to be used if PROBE_IMPL is not specified

module.exports = {};
65 changes: 65 additions & 0 deletions app/utils/probes/probe-sample.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/**
* Copyright 2024 IBM Corp. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


/*
This sample shows how the startup/liveness/readiness probes can be customized by providing a
module that exports three functions:
- getStartupPayload
- getReadinessPayload
- getLivenessPayload

In each case, the function should return a payload string (not used by kubernetes, but can
be informative), or throw an error that explains why the probe should fail.

In this sample:
- Return failure for startup probe for 60s, then success
- Return success for readiness probe for 5 minutes, then failure
- Always return success for liveness probe

To use this sample, `export PROBE_IMPL=./probe-sample` before starting the server.
*/

// Timestamp (ms since epoch) taken when this module is loaded; the sample
// startup and readiness probes measure elapsed time from this point.
const START_TIME = Date.now();

/**
 * SAMPLE startup probe: fails for the first 60 seconds after this module was
 * loaded, then succeeds. Logs a warning on every call because the sample
 * should only be used during dev/test.
 *
 * @param {object} req - request; `req.log.warn` and `req.id` are used for logging
 * @returns {Promise<string>} success payload once 60 seconds have elapsed
 * @throws {Error} during the first 60 seconds
 */
async function getStartupPayload(req) {
  const method = 'getStartupPayload';
  req.log.warn( {req_id: req.id}, `${method} using SAMPLE implementation, should only happen during dev/test` );

  // Date.now() (lower-case "now"): `Date.Now` does not exist and would throw a TypeError
  if( Date.now() - START_TIME < 60*1000 ) {
    throw new Error('startup probe failing for first 60 seconds');
  }
  return('startup probe passing after 60 seconds');
}

/**
 * SAMPLE readiness probe: succeeds for the first 5 minutes after this module
 * was loaded, then fails. Logs a warning on every call because the sample
 * should only be used during dev/test.
 *
 * @param {object} req - request; `req.log.warn` and `req.id` are used for logging
 * @returns {Promise<string>} success payload during the first 5 minutes
 * @throws {Error} once 5 minutes have elapsed
 */
async function getReadinessPayload(req) {
  const method = 'getReadinessPayload';
  req.log.warn( {req_id: req.id}, `${method} using SAMPLE implementation, should only happen during dev/test` );

  // Date.now() (lower-case "now"): `Date.Now` does not exist and would throw a TypeError
  if( Date.now() - START_TIME < 5*60*1000 ) {
    return('readiness probe passing for first 5 minutes');
  }
  throw new Error('readiness probe failing after 5 minutes');
}

/**
 * SAMPLE liveness probe: always succeeds. Logs a warning on every call
 * because the sample should only be used during dev/test.
 *
 * @param {object} req - request; `req.log.warn` and `req.id` are used for logging
 * @returns {Promise<string>} fixed success payload
 */
async function getLivenessPayload(req) {
  const probeName = 'getLivenessPayload';
  const logFields = {req_id: req.id};
  const warning = `${probeName} using SAMPLE implementation, should only happen during dev/test`;
  req.log.warn( logFields, warning );

  return 'liveness probe passing';
}

module.exports = { getLivenessPayload, getReadinessPayload, getStartupPayload };
11 changes: 8 additions & 3 deletions kubernetes/razeedash-api/resource.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,18 +118,23 @@ items:
ports:
- containerPort: 3333
protocol: TCP
startupProbe:
httpGet:
path: /api/kube/startup
port: 3333
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 10
livenessProbe:
httpGet:
path: /api/kube/liveness
port: 3333
initialDelaySeconds: 5
periodSeconds: 30
timeoutSeconds: 10
readinessProbe:
httpGet:
path: /.well-known/apollo/server-health
path: /api/kube/readiness
port: 3333
initialDelaySeconds: 5
periodSeconds: 30
timeoutSeconds: 10
resources:
Expand Down
2 changes: 1 addition & 1 deletion locales/de/razee-resources.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"Could not find the subscription for the subscription id {{subscription_id}}.": "Die Subskription für die Subskriptions-ID {{subscription_id}} konnte nicht gefunden werden.",
"Could not locate the cluster with cluster_id {{cluster_id}}": "Der Cluster mit der Cluster-ID {{cluster_id}} konnte nicht gefunden werden.",
"Could not locate the cluster with clusterName {{clusterName}}": "Der Cluster mit dem Clusternamen {{clusterName}} konnte nicht gefunden werden.",
"DeployableVersion is not found for {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.": "DeployableVersion konnte für {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}} nicht gefunden werden.",
"DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.": "DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.",
"Failed to Publish resource notification, please reload the page.": "Publizieren der Ressourcenbenachrichtigung fehlgeschlagen. Bitte laden Sie die Seite erneut.",
"Failed to Publish resource notification, pubsub is not ready yet, please retry later.": "Publizieren der Ressourcenbenachrichtigung fehlgeschlagen. pubsub ist noch nicht bereit. Bitte versuchen Sie es später erneut.",
"Failed to Publish subscription notification to clusters, please retry.": "Publizieren der Subskriptionsbenachrichtigung an Cluster fehlgeschlagen. Bitte versuchen Sie es erneut.",
Expand Down
2 changes: 1 addition & 1 deletion locales/en/razee-resources.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"Could not find the subscription for the subscription id {{subscription_id}}.": "Could not find the subscription for the subscription id {{subscription_id}}.",
"Could not locate the cluster with cluster_id {{cluster_id}}": "Could not locate the cluster with cluster_id {{cluster_id}}",
"Could not locate the cluster with clusterName {{clusterName}}": "Could not locate the cluster with clusterName {{clusterName}}",
"DeployableVersion is not found for {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.": "DeployableVersion is not found for {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.",
"DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.": "DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.",
"Failed to Publish resource notification, please reload the page.": "Failed to Publish resource notification, please reload the page.",
"Failed to Publish resource notification, pubsub is not ready yet, please retry later.": "Failed to Publish resource notification, pubsub is not ready yet, please retry later.",
"Failed to Publish subscription notification to clusters, please retry.": "Failed to Publish subscription notification to clusters, please retry.",
Expand Down
2 changes: 1 addition & 1 deletion locales/es/razee-resources.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"Could not find the subscription for the subscription id {{subscription_id}}.": "No se ha podido encontrar la suscripción para el id de suscripción {{subscription_id}}.",
"Could not locate the cluster with cluster_id {{cluster_id}}": "No se ha podido localizar el clúster con ID_clúster {{cluster_id}}",
"Could not locate the cluster with clusterName {{clusterName}}": "No se ha podido localizar el clúster con el nombre de clúster {{clusterName}}",
"DeployableVersion is not found for {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.": "No se ha encontrado DeployableVersion para {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.",
"DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.": "DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.",
"Failed to Publish resource notification, please reload the page.": "No se ha podido publicar la notificación del recurso, vuelva a cargar la página.",
"Failed to Publish resource notification, pubsub is not ready yet, please retry later.": "No se ha podido publicar la notificación de recurso, pubsub aún no está preparado; vuelva a intentarlo más tarde.",
"Failed to Publish subscription notification to clusters, please retry.": "No se ha podido publicar la notificación de suscripción a los clústeres; vuelva a intentarlo.",
Expand Down
2 changes: 1 addition & 1 deletion locales/fr/razee-resources.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"Could not find the subscription for the subscription id {{subscription_id}}.": "Impossible de trouver l'abonnement pour l'id d'abonnement {{subscription_id}}.",
"Could not locate the cluster with cluster_id {{cluster_id}}": "Impossible de localiser le cluster cluster_id {{cluster_id}}",
"Could not locate the cluster with clusterName {{clusterName}}": "Impossible de localiser le cluster clusterName {{clusterName}}",
"DeployableVersion is not found for {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.": "Version déployable introuvable pour {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.",
"DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.": "DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.",
"Failed to Publish resource notification, please reload the page.": "Echec de publication de la notification de ressource, veuillez recharger la page.",
"Failed to Publish resource notification, pubsub is not ready yet, please retry later.": "Echec de publication de la notification de ressource, pubsub n'est pas prêt, veuillez réessayer plus tard.",
"Failed to Publish subscription notification to clusters, please retry.": "Echec de publication de la notification d'abonnement sur les clusters, veuillez réessayer,",
Expand Down
2 changes: 1 addition & 1 deletion locales/it/razee-resources.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"Could not find the subscription for the subscription id {{subscription_id}}.": "Impossibile trovare la sottoscrizione per l'ID sottoscrizione {{subscription_id}}.",
"Could not locate the cluster with cluster_id {{cluster_id}}": "Impossibile individuare il cluster con ID cluster {{cluster_id}}",
"Could not locate the cluster with clusterName {{clusterName}}": "Impossibile individuare il cluster con nome cluster {{clusterName}}",
"DeployableVersion is not found for {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.": "DeployableVersion non trovato per {{channel.name}}:{{channel.uuid}}/{{versionObj.name}}:{{versionObj.uuid}}.",
"DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.": "DeployableVersion is not found for {{channelName}}:{{channelUuid}}/{{versionName}}:{{versionUuid}}.",
"Failed to Publish resource notification, please reload the page.": "Impossibile pubblicare la notifica della risorsa, ricaricare la pagina.",
"Failed to Publish resource notification, pubsub is not ready yet, please retry later.": "Impossibile pubblicare la notifica della risorsa, pubsub non è ancora pronto. Riprovare in seguito.",
"Failed to Publish subscription notification to clusters, please retry.": "Impossibile pubblicare la notifica della sottoscrizione ai cluster, riprovare.",
Expand Down
Loading