From 7886efdefa8e2cbdc0743aa5a06af2bbdfc5abb6 Mon Sep 17 00:00:00 2001
From: James Sumners
Date: Tue, 17 Dec 2024 13:09:25 -0500
Subject: [PATCH] feat: Added New Relic Control health check

---
 lib/health-reporter.js                | 230 +++++++++++++++++++++
 package.json                          |   3 +
 test/unit/lib/health-reporter.test.js | 301 ++++++++++++++++++++++++++
 3 files changed, 534 insertions(+)
 create mode 100644 lib/health-reporter.js
 create mode 100644 test/unit/lib/health-reporter.test.js

diff --git a/lib/health-reporter.js b/lib/health-reporter.js
new file mode 100644
index 0000000000..fcba06fea6
--- /dev/null
+++ b/lib/health-reporter.js
@@ -0,0 +1,230 @@
+/*
+ * Copyright 2024 New Relic Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+'use strict'
+
+const fs = require('node:fs')
+const crypto = require('node:crypto')
+const path = require('node:path')
+
+const defaultLogger = require('./logger').child({ component: 'HealthReporter' })
+
+// Reporting interval used when no usable frequency is configured.
+const DEFAULT_INTERVAL_MS = 5_000
+
+// Recognized status codes mapped to the description written to the status file.
+const VALID_CODES = new Map([
+  ['NR-APM-000', 'Healthy.'],
+  ['NR-APM-001', 'Invalid license key.'],
+  ['NR-APM-002', 'License key missing.'],
+  ['NR-APM-003', 'Forced disconnect received from New Relic.'],
+  ['NR-APM-004', 'HTTP error communicating with New Relic.'],
+  ['NR-APM-005', 'Missing application name in agent configuration.'],
+  ['NR-APM-006', 'The maximum number of configured app names is exceeded.'],
+  ['NR-APM-007', 'HTTP proxy is misconfigured.'],
+  ['NR-APM-008', 'Agent is disabled via configuration.'],
+  ['NR-APM-009', 'Failed to connect to the New Relic data collector.'],
+  ['NR-APM-010', 'Agent config could not be parsed.'],
+  ['NR-APM-099', 'Agent has shutdown.']
+])
+
+/**
+ * Serializes a health status document and writes it to `file`.
+ *
+ * @param {object} params Status details.
+ * @param {string} params.file Destination path of the status file.
+ * @param {boolean} [params.healthy] Value of the `healthy` field.
+ * @param {string} params.code Status code written as `last_error`.
+ * @param {string} params.msg Description written as `status`.
+ * @param {number} params.startTime Value written as `start_time_unix_nano`.
+ * @param {Function} params.callback Passed to `fs.writeFile` as its callback.
+ */
+function writeStatus({ file, healthy = true, code, msg, startTime, callback } = {}) {
+  // NOTE(review): `process.hrtime.bigint()` is not relative to the Unix epoch,
+  // yet the value lands in `*_unix_nano` fields -- confirm with the consumer.
+  const currentTime = Number(process.hrtime.bigint())
+  const yaml = [
+    `healthy: ${healthy}`,
+    `status: '${msg}'`,
+    `last_error: ${code}`,
+    `start_time_unix_nano: ${startTime}`,
+    `status_time_unix_nano: ${currentTime}`
+  ].join('\n')
+  fs.writeFile(file, yaml, { encoding: 'utf8' }, callback)
+}
+
+/**
+ * Writes the agent's current health status to a uniquely named YAML file on
+ * a fixed interval. Reporting is only enabled when both
+ * `NEW_RELIC_SUPERAGENT_FLEET_ID` and
+ * `NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION` are set in the environment.
+ */
+class HealthReporter {
+  // Set once the constructor has fully configured reporting.
+  #enabled = false
+  #status = HealthReporter.STATUS_HEALTHY
+  #interval
+  #destFile
+  #logger
+  #startTime
+
+  static STATUS_HEALTHY = 'NR-APM-000'
+  static STATUS_INVALID_LICENSE_KEY = 'NR-APM-001'
+  static STATUS_LICENSE_KEY_MISSING = 'NR-APM-002'
+  static STATUS_FORCED_DISCONNECT = 'NR-APM-003'
+  static STATUS_BACKEND_ERROR = 'NR-APM-004'
+  static STATUS_MISSING_APP_NAME = 'NR-APM-005'
+  static STATUS_MAXIMUM_APP_NAMES_EXCEEDED = 'NR-APM-006'
+  static STATUS_HTTP_PROXY_MISCONFIGURED = 'NR-APM-007'
+  static STATUS_AGENT_DISABLED = 'NR-APM-008'
+  static STATUS_CONNECT_ERROR = 'NR-APM-009'
+  static STATUS_CONFIG_PARSE_FAILURE = 'NR-APM-010'
+  static STATUS_AGENT_SHUTDOWN = 'NR-APM-099'
+
+  /**
+   * @param {object} [params] Optional dependencies.
+   * @param {object} [params.logger] Logger exposing `info`, `debug`, `warn`,
+   * and `error` methods.
+   * @param {Function} [params.setInterval] Timer factory; injectable for tests.
+   */
+  constructor({ logger = defaultLogger, setInterval = global.setInterval } = {}) {
+    const fleetId = process.env.NEW_RELIC_SUPERAGENT_FLEET_ID
+    const outDir = process.env.NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION
+    const frequency = process.env.NEW_RELIC_SUPERAGENT_HEALTH_FREQUENCY
+    let checkInterval = DEFAULT_INTERVAL_MS
+
+    this.#logger = logger
+
+    if (!fleetId) {
+      this.#logger.info('new relic control not present, skipping health reporting')
+      return
+    }
+
+    if (outDir === undefined) {
+      this.#logger.error('health check output directory not provided, skipping health reporting')
+      return
+    }
+
+    if (frequency === undefined) {
+      this.#logger.debug('health check interval not available, using default 5 seconds')
+    } else {
+      const seconds = parseInt(frequency, 10)
+      if (Number.isInteger(seconds) && seconds > 0) {
+        checkInterval = seconds * 1_000
+      } else {
+        // Timers coerce a NaN or non-positive delay to 1ms, which would
+        // rewrite the status file continuously.
+        this.#logger.warn(
+          `invalid health check interval provided: ${frequency}, using default 5 seconds`
+        )
+      }
+    }
+
+    this.#startTime = Number(process.hrtime.bigint())
+
+    const uuid = crypto.randomUUID().replaceAll('-', '')
+    this.#destFile = path.join(outDir, `health-${uuid}.yaml`)
+    this.#enabled = true
+
+    this.#logger.info(
+      `new relic control is present, writing health on interval ${checkInterval} milliseconds to ${
+        this.#destFile
+      }`
+    )
+    this.#interval = setInterval(this.#healthCheck.bind(this), checkInterval)
+    this.#interval.unref()
+
+    this.#logger.info('health reporter initialized')
+  }
+
+  /**
+   * Timer callback: writes the current status to the destination file and
+   * logs any write error.
+   */
+  #healthCheck() {
+    const healthy = this.#status === HealthReporter.STATUS_HEALTHY
+    writeStatus({
+      file: this.#destFile,
+      healthy,
+      startTime: this.#startTime,
+      code: this.#status,
+      msg: VALID_CODES.get(this.#status),
+      callback: (error) => {
+        if (error) {
+          this.#logger.error(`error when writing out health status: ${error.message}`)
+        }
+      }
+    })
+  }
+
+  /**
+   * Updates the status reported on the next write. Unknown codes are
+   * rejected with a warning, and a shutdown status never replaces an
+   * existing unhealthy status.
+   *
+   * @param {string} status One of the `HealthReporter.STATUS_*` codes.
+   */
+  setStatus(status) {
+    if (VALID_CODES.has(status) === false) {
+      // TODO: if we ever add codes in our reserved block (300-399), account for them here
+      this.#logger.warn(`invalid health reporter status provided: ${status}`)
+      return
+    }
+
+    if (
+      status === HealthReporter.STATUS_AGENT_SHUTDOWN &&
+      this.#status !== HealthReporter.STATUS_HEALTHY
+    ) {
+      this.#logger.info(
+        `not setting shutdown health status due to current status code: ${this.#status}`
+      )
+      return
+    }
+
+    this.#status = status
+  }
+
+  /**
+   * Stops the reporting timer and writes one final status. Does nothing when
+   * reporting was never enabled.
+   */
+  stop() {
+    if (this.#enabled === false) {
+      // No timer was started and no destination file exists; writing would
+      // make `fs.writeFile` throw on the undefined path.
+      return
+    }
+
+    clearInterval(this.#interval)
+
+    const healthy = this.#status === HealthReporter.STATUS_HEALTHY
+    let code = this.#status
+    let msg = VALID_CODES.get(code)
+    if (healthy === true) {
+      // We only update the status on shutdown when the last known state is
+      // the healthy state. Otherwise, we need to leave the current code in
+      // place, and just update the report time.
+      code = HealthReporter.STATUS_AGENT_SHUTDOWN
+      msg = VALID_CODES.get(code)
+    }
+
+    writeStatus({
+      file: this.#destFile,
+      startTime: this.#startTime,
+      healthy,
+      code,
+      msg,
+      callback: (error) => {
+        if (error) {
+          this.#logger.error(
+            `error when writing out health status during shutdown: ${error.message}`
+          )
+        }
+      }
+    })
+  }
+}
+
+module.exports = HealthReporter
diff --git a/package.json b/package.json
index 69fc90c850..90ddc3474b 100644
--- a/package.json
+++ b/package.json
@@ -193,6 +193,9 @@
   "bin": {
     "newrelic-naming-rules": "./bin/test-naming-rules.js"
   },
+  "imports": {
+    "#agentlib/*.js": "./lib/*.js"
+  },
   "dependencies": {
     "@grpc/grpc-js": "^1.12.2",
     "@grpc/proto-loader": "^0.7.5",
diff --git a/test/unit/lib/health-reporter.test.js b/test/unit/lib/health-reporter.test.js
new file mode 100644
index 0000000000..f4c9235c4c
--- /dev/null
+++ b/test/unit/lib/health-reporter.test.js
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2024 New Relic Corporation. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+'use strict'
+
+const test = require('node:test')
+const assert = require('node:assert')
+const os = require('node:os')
+const fs = require('node:fs')
+const tspl = require('@matteo.collina/tspl')
+
+const match = require('../../lib/custom-assertions/match')
+
+// TODO: testing this out. Current eslint config doesn't allow for it. If
+// it doesn't cause issues, then I'll investigate how to fix the suppression.
+// eslint-disable-next-line node/no-missing-require
+const HealthReporter = require('#agentlib/health-reporter.js')
+
+function simpleInterval(method) {
+  method.call()
+  return {
+    unref() {}
+  }
+}
+
+test.beforeEach((ctx) => {
+  ctx.nr = {}
+  ctx.nr.writeFileOrig = fs.writeFile
+  ctx.nr.bigintOrig = process.hrtime.bigint
+
+  let count = 0n
+  process.hrtime.bigint = () => {
+    count += 1n
+    return count
+  }
+
+  const logs = {
+    info: [],
+    debug: [],
+    error: [],
+    warn: []
+  }
+  ctx.nr.logs = logs
+  ctx.nr.logger = {
+    info(...args) {
+      logs.info.push(args)
+    },
+    debug(...args) {
+      logs.debug.push(args)
+    },
+    error(...args) {
+      logs.error.push(args)
+    },
+    warn(...args) {
+      logs.warn.push(args)
+    }
+  }
+
+  process.env.NEW_RELIC_SUPERAGENT_FLEET_ID = 42
+  process.env.NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION = os.tmpdir()
+  process.env.NEW_RELIC_SUPERAGENT_HEALTH_FREQUENCY = 1
+})
+
+test.afterEach((ctx) => {
+  fs.writeFile = ctx.nr.writeFileOrig
+  process.hrtime.bigint = ctx.nr.bigintOrig
+  delete process.env.NEW_RELIC_SUPERAGENT_FLEET_ID
+  delete process.env.NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION
+  delete process.env.NEW_RELIC_SUPERAGENT_HEALTH_FREQUENCY
+})
+
+test('requires fleet id to be set', (t) => {
+  delete process.env.NEW_RELIC_SUPERAGENT_FLEET_ID
+
+  const reporter = new HealthReporter(t.nr)
+  assert.ok(reporter)
+
+  const {
+    logs: { info }
+  } = t.nr
+  assert.deepStrictEqual(info, [['new relic control not present, skipping health reporting']])
+})
+
+test('requires output directory to be set', (t) => {
+  delete process.env.NEW_RELIC_SUPERAGENT_HEALTH_DELIVERY_LOCATION
+
+  const reporter = new HealthReporter(t.nr)
+  assert.ok(reporter)
+
+  const {
+    logs: { info, error }
+  } = t.nr
+  assert.equal(info.length, 0, 'should not log any info messages')
+  assert.deepStrictEqual(error, [
+    ['health check output directory not provided, skipping health reporting']
+  ])
+})
+
+test('sets default interval', (t) => {
+  delete process.env.NEW_RELIC_SUPERAGENT_HEALTH_FREQUENCY
+
+  const reporter = new HealthReporter(t.nr)
+  assert.ok(reporter)
+
+  const {
+    logs: { info, error, debug }
+  } = t.nr
+  match(info, [
+    [/new relic control is present, writing health on interval 5000 milliseconds to .+/],
+    ['health reporter initialized']
+  ])
+  assert.equal(error.length, 0, 'should not log any errors')
+  assert.deepStrictEqual(debug, [['health check interval not available, using default 5 seconds']])
+})
+
+test('falls back to default interval for invalid frequency', (t) => {
+  process.env.NEW_RELIC_SUPERAGENT_HEALTH_FREQUENCY = 'bogus'
+
+  const reporter = new HealthReporter(t.nr)
+  assert.ok(reporter)
+
+  const {
+    logs: { info, warn }
+  } = t.nr
+  match(info, [
+    [/new relic control is present, writing health on interval 5000 milliseconds to .+/],
+    ['health reporter initialized']
+  ])
+  assert.deepStrictEqual(warn, [
+    ['invalid health check interval provided: bogus, using default 5 seconds']
+  ])
+})
+
+test('initializes and writes to destination', async (t) => {
+  const plan = tspl(t, { plan: 8 })
+  fs.writeFile = (dest, data, options, callback) => {
+    plan.match(dest, /health-\w{32}\.yaml/)
+    plan.equal(
+      data,
+      [
+        'healthy: true',
+        `status: 'Healthy.'`,
+        'last_error: NR-APM-000',
+        'start_time_unix_nano: 1',
+        'status_time_unix_nano: 2'
+      ].join('\n')
+    )
+    plan.deepStrictEqual(options, { encoding: 'utf8' })
+    callback()
+    plan.equal(t.nr.logs.error.length, 0, 'callback should not write error log')
+  }
+
+  const reporter = new HealthReporter({ ...t.nr, setInterval: localInterval })
+  plan.ok(reporter)
+
+  await plan.completed
+
+  function localInterval(method, delay) {
+    plan.equal(delay, 1_000)
+    plan.equal(method.name, 'bound #healthCheck')
+    method.call()
+    return {
+      unref() {
+        plan.ok('invoked unref')
+      }
+    }
+  }
+})
+
+test('logs error if writing failed', async (t) => {
+  const plan = tspl(t, { plan: 3 })
+  fs.writeFile = (dest, data, options, callback) => {
+    callback(Error('boom'))
+    plan.deepStrictEqual(t.nr.logs.error, [['error when writing out health status: boom']])
+  }
+
+  const reporter = new HealthReporter({ ...t.nr, setInterval: localInterval })
+  plan.ok(reporter)
+
+  await plan.completed
+
+  function localInterval(method) {
+    method.call()
+    return {
+      unref() {
+        plan.ok('invoked unref')
+      }
+    }
+  }
+})
+
+test('setStatus warns for bad code', (t) => {
+  const reporter = new HealthReporter(t.nr)
+  reporter.setStatus('bad-code')
+  assert.deepStrictEqual(t.nr.logs.warn, [['invalid health reporter status provided: bad-code']])
+})
+
+test('setStatus logs info message if shutdown and not healthy', (t) => {
+  const reporter = new HealthReporter(t.nr)
+  reporter.setStatus(HealthReporter.STATUS_BACKEND_ERROR)
+  reporter.setStatus(HealthReporter.STATUS_AGENT_SHUTDOWN)
+  assert.deepStrictEqual(t.nr.logs.info.pop(), [
+    'not setting shutdown health status due to current status code: NR-APM-004'
+  ])
+})
+
+test('stop leaves last error code in place', async (t) => {
+  const plan = tspl(t, { plan: 3 })
+  let invocation = 0
+  fs.writeFile = (dest, data, options, callback) => {
+    if (invocation === 0) {
+      invocation += 1
+      return callback()
+    }
+
+    plan.equal(
+      data,
+      [
+        'healthy: false',
+        `status: 'HTTP error communicating with New Relic.'`,
+        'last_error: NR-APM-004',
+        'start_time_unix_nano: 1',
+        'status_time_unix_nano: 3'
+      ].join('\n')
+    )
+    callback()
+    plan.deepStrictEqual(t.nr.logs.error, [])
+  }
+
+  const reporter = new HealthReporter({ ...t.nr, setInterval: simpleInterval })
+  reporter.setStatus(HealthReporter.STATUS_BACKEND_ERROR)
+  reporter.stop()
+  plan.ok(reporter)
+
+  await plan.completed
+})
+
+test('stop sets shutdown status', async (t) => {
+  const plan = tspl(t, { plan: 3 })
+  let invocation = 0
+  fs.writeFile = (dest, data, options, callback) => {
+    if (invocation === 0) {
+      invocation += 1
+      return callback()
+    }
+
+    plan.equal(
+      data,
+      [
+        'healthy: true',
+        `status: 'Agent has shutdown.'`,
+        'last_error: NR-APM-099',
+        'start_time_unix_nano: 1',
+        'status_time_unix_nano: 3'
+      ].join('\n')
+    )
+    callback()
+    plan.deepStrictEqual(t.nr.logs.error, [])
+  }
+
+  const reporter = new HealthReporter({ ...t.nr, setInterval: simpleInterval })
+  reporter.stop()
+  plan.ok(reporter)
+
+  await plan.completed
+})
+
+test('stop logs writing error', async (t) => {
+  const plan = tspl(t, { plan: 2 })
+  let invocation = 0
+  fs.writeFile = (dest, data, options, callback) => {
+    if (invocation === 0) {
+      invocation += 1
+      return callback()
+    }
+
+    callback(Error('boom'))
+    plan.deepStrictEqual(t.nr.logs.error, [
+      ['error when writing out health status during shutdown: boom']
+    ])
+  }
+
+  const reporter = new HealthReporter({ ...t.nr, setInterval: simpleInterval })
+  reporter.stop()
+  plan.ok(reporter)
+
+  await plan.completed
+})
+
+test('stop does nothing when reporting is disabled', (t) => {
+  delete process.env.NEW_RELIC_SUPERAGENT_FLEET_ID
+  fs.writeFile = () => {
+    assert.fail('should not write a status file')
+  }
+
+  const reporter = new HealthReporter(t.nr)
+  assert.doesNotThrow(() => reporter.stop())
+  assert.equal(t.nr.logs.error.length, 0, 'should not log any errors')
+})