From 63a21985b913eb7ffc6995d22b5bc78e3d21213a Mon Sep 17 00:00:00 2001 From: Alejandro Sanchez Date: Fri, 23 Feb 2024 13:38:06 -0800 Subject: [PATCH 1/4] Update sysdig team yaml with rest of devs --- openshift/4.0/templates/sysdig/sysdig-team.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/openshift/4.0/templates/sysdig/sysdig-team.yaml b/openshift/4.0/templates/sysdig/sysdig-team.yaml index 96295c88b1..0a6f7b4ed6 100644 --- a/openshift/4.0/templates/sysdig/sysdig-team.yaml +++ b/openshift/4.0/templates/sysdig/sysdig-team.yaml @@ -9,3 +9,9 @@ spec: users: - name: emailforasr@gmail.com role: ROLE_TEAM_EDIT + - name: devinleighsmith@gmail.com + role: ROLE_TEAM_EDIT + - name: marobej@gmail.com + role: ROLE_TEAM_EDIT + - name: eddherrera.code@gmail.com + role: ROLE_TEAM_EDIT From aa608d3115a4b5f0a2872518531949e571248a02 Mon Sep 17 00:00:00 2001 From: Alejandro Sanchez Date: Fri, 23 Feb 2024 14:58:57 -0800 Subject: [PATCH 2/4] Move sysdig config to monitoring folder --- openshift/4.0/templates/{sysdig => monitoring}/sysdig-team.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename openshift/4.0/templates/{sysdig => monitoring}/sysdig-team.yaml (100%) diff --git a/openshift/4.0/templates/sysdig/sysdig-team.yaml b/openshift/4.0/templates/monitoring/sysdig-team.yaml similarity index 100% rename from openshift/4.0/templates/sysdig/sysdig-team.yaml rename to openshift/4.0/templates/monitoring/sysdig-team.yaml From 15e20d870cc92bbcb4a54f3f17ab9d85977379c9 Mon Sep 17 00:00:00 2001 From: Alejandro Sanchez Date: Wed, 28 Feb 2024 15:56:53 -0800 Subject: [PATCH 3/4] Add a health check for the PIMS database via the PIMS API and Prometheus. 
---
 .../Healthchecks/PimsMetricsHealthCheck.cs    | 73 +++++++++++++++++++
 source/backend/api/Startup.cs                 |  8 ++
 2 files changed, 81 insertions(+)
 create mode 100644 source/backend/api/Helpers/Healthchecks/PimsMetricsHealthCheck.cs

diff --git a/source/backend/api/Helpers/Healthchecks/PimsMetricsHealthCheck.cs b/source/backend/api/Helpers/Healthchecks/PimsMetricsHealthCheck.cs
new file mode 100644
index 0000000000..88f2531999
--- /dev/null
+++ b/source/backend/api/Helpers/Healthchecks/PimsMetricsHealthCheck.cs
@@ -0,0 +1,73 @@
+using System.Data.Common;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.Data.SqlClient;
+using Microsoft.Extensions.Diagnostics.HealthChecks;
+using Prometheus;
+
+namespace Pims.Api.Helpers.HealthChecks
+{
+    /// <summary>
+    /// Health check that connects to the PIMS database and publishes deployment
+    /// information (application, database and runtime versions) as a Prometheus gauge.
+    /// </summary>
+    public class PimsMetricsHealthCheck : IHealthCheck
+    {
+        private const string DbVersionQuery = "SELECT [STATIC_VARIABLE_VALUE] FROM [PIMS_STATIC_VARIABLE] WHERE [STATIC_VARIABLE_NAME] = N'DBVERSION';";
+        private const string UnknownVersion = "unknown";
+
+        private static readonly Gauge AppDeploymentInfo = Metrics.CreateGauge("api_deployment_info", "Deployment information of the running PSP application", labelNames: new[] { "app_version", "db_version", "runtime_version" });
+
+        /// <summary>
+        /// Creates a new instance of a PimsMetricsHealthCheck class.
+        /// </summary>
+        /// <param name="connectionString">Connection string for the PIMS database.</param>
+        public PimsMetricsHealthCheck(string connectionString)
+        {
+            ConnectionString = connectionString;
+        }
+
+        /// <summary>
+        /// get - The connection string used to open the PIMS database connection.
+        /// </summary>
+        public string ConnectionString { get; }
+
+        /// <summary>
+        /// Opens a database connection, reads the database version and records it, together with
+        /// the API assembly version and the runtime description, on the 'api_deployment_info' gauge.
+        /// </summary>
+        /// <param name="context">Health check context; supplies the failure status to report.</param>
+        /// <param name="cancellationToken">Token used to cancel the database calls.</param>
+        /// <returns>Healthy when the database answered; the registration's failure status on a DbException.</returns>
+        public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default)
+        {
+            using (var connection = new SqlConnection(ConnectionString))
+            {
+                try
+                {
+                    await connection.OpenAsync(cancellationToken);
+
+                    using (var command = connection.CreateCommand())
+                    {
+                        command.CommandText = DbVersionQuery;
+
+                        // A missing row yields null and a NULL column yields DBNull; neither is a usable
+                        // label value, so fall back to a placeholder instead of casting blindly.
+                        var dbVersion = (await command.ExecuteScalarAsync(cancellationToken)) as string ?? UnknownVersion;
+                        var appVersion = GetType().Assembly.GetName().Version?.ToString() ?? UnknownVersion;
+                        var runtimeVersion = System.Runtime.InteropServices.RuntimeInformation.FrameworkDescription;
+
+                        // Send various deployment metrics to prometheus as a custom metric: 'api_deployment_info'
+                        AppDeploymentInfo.WithLabels(appVersion, dbVersion, runtimeVersion).Set(1.0);
+                    }
+                }
+                catch (DbException ex)
+                {
+                    return new HealthCheckResult(status: context.Registration.FailureStatus, exception: ex);
+                }
+            }
+
+            return HealthCheckResult.Healthy();
+        }
+    }
+}
diff --git a/source/backend/api/Startup.cs b/source/backend/api/Startup.cs
index 2d53949216..01eb2ebd16 100644
--- a/source/backend/api/Startup.cs
+++ b/source/backend/api/Startup.cs
@@ -32,6 +32,7 @@
 using Pims.Api.Helpers;
 using Pims.Api.Helpers.Exceptions;
 using Pims.Api.Helpers.Healthchecks;
+using Pims.Api.Helpers.HealthChecks;
 using Pims.Api.Helpers.Logging;
 using Pims.Api.Helpers.Mapping;
 using Pims.Api.Helpers.Middleware;
@@ -244,6 +245,13 @@ public void ConfigureServices(IServiceCollection services)
                     HealthStatus.Unhealthy,
                     new string[] { "services" });
 
+            services.AddHealthChecks()
+                .AddCheck(
+                    "api-metrics",
+                    new PimsMetricsHealthCheck(csBuilder.ConnectionString),
+                    HealthStatus.Unhealthy,
+                    new string[] { "services" });
+
             services.AddApiVersioning(options =>
             {
                 options.ReportApiVersions = true;

From 34918057efea65261b982bcef2099f1745ccc39d Mon Sep 17 00:00:00 2001
From: Alejandro Sanchez
Date: Wed, 28 Feb 2024 16:53:52 -0800
Subject: [PATCH 4/4] Update local prometheus configuration to support alerting when database server goes down

---
 docker-compose.yml               |  4 ++++
 tools/prometheus/alert-rules.yml | 12 ++++++++++++
 tools/prometheus/prometheus.yml  | 12 +++++++-----
 3 files changed, 23 insertions(+), 5 deletions(-)
 create mode 100644 tools/prometheus/alert-rules.yml

diff --git a/docker-compose.yml b/docker-compose.yml
index 47e5670dd2..ee2cea399b 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -102,6 +102,10 @@ services:
       - TZ=UTC
     volumes:
       - ./tools/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./tools/prometheus/alert-rules.yml:/etc/prometheus/alert-rules.yml
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--web.enable-lifecycle"
 
   grafana:
     image:
grafana/grafana-oss diff --git a/tools/prometheus/alert-rules.yml b/tools/prometheus/alert-rules.yml new file mode 100644 index 0000000000..2a9d879904 --- /dev/null +++ b/tools/prometheus/alert-rules.yml @@ -0,0 +1,12 @@ +groups: + - name: Hardware alerts + limit: 0 + rules: + - alert: SqlServerDown + expr: aspnetcore_healthcheck_status{name="sqlserver"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: SQL Server down + description: "SQL server has been down on {{ $labels.instance }} for more than 1 minute." diff --git a/tools/prometheus/prometheus.yml b/tools/prometheus/prometheus.yml index 813f9a3d58..8d241c2191 100644 --- a/tools/prometheus/prometheus.yml +++ b/tools/prometheus/prometheus.yml @@ -1,7 +1,10 @@ global: - scrape_interval: 15s + scrape_interval: 10s scrape_timeout: 10s - evaluation_interval: 15s + evaluation_interval: 10s +# Rules and alerts are read from the specified file(s) +rule_files: + - alert-rules.yml alerting: alertmanagers: - scheme: http @@ -10,10 +13,9 @@ alerting: static_configs: - targets: [] scrape_configs: - - job_name: TrackAPIUsage + - job_name: psp_api honor_timestamps: true - scrape_interval: 15s - scrape_timeout: 10s + scrape_interval: 10s metrics_path: /metrics scheme: http static_configs: