Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PSP-7636 Prometheus/Sysdig alerts #3831

Merged
merged 9 commits into from
Mar 1, 2024
4 changes: 4 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ services:
- TZ=UTC
volumes:
- ./tools/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- ./tools/prometheus/alert-rules.yml:/etc/prometheus/alert-rules.yml
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--web.enable-lifecycle"

grafana:
image: grafana/grafana-oss
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,9 @@ spec:
users:
- name: emailforasr@gmail.com
role: ROLE_TEAM_EDIT
- name: devinleighsmith@gmail.com
role: ROLE_TEAM_EDIT
- name: marobej@gmail.com
role: ROLE_TEAM_EDIT
- name: eddherrera.code@gmail.com
role: ROLE_TEAM_EDIT
51 changes: 51 additions & 0 deletions source/backend/api/Helpers/Healthchecks/PimsMetricsHealthCheck.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
using System.Data.Common;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Data.SqlClient;
using Microsoft.Extensions.Diagnostics.HealthChecks;
using Prometheus;

namespace Pims.Api.Helpers.HealthChecks
{
/// <summary>
/// Health check that opens a SQL connection and, when successful, publishes the
/// running deployment's versions to Prometheus through the 'api_deployment_info' gauge.
/// </summary>
public class PimsMetricsHealthCheck : IHealthCheck
{
    // Label value used when a version cannot be determined; a gauge label must never be null.
    private const string UnknownVersion = "unknown";

    // Reads the database schema version recorded in the PIMS static variable table.
    private const string DbVersionQuery = "SELECT [STATIC_VARIABLE_VALUE] FROM [PIMS_STATIC_VARIABLE] WHERE [STATIC_VARIABLE_NAME] = N'DBVERSION';";

    private static readonly Gauge AppDeploymentInfo = Metrics.CreateGauge("api_deployment_info", "Deployment information of the running PSP application", labelNames: new[] { "app_version", "db_version", "runtime_version" });

    /// <summary>
    /// Creates a new instance of a PimsMetricsHealthCheck.
    /// </summary>
    /// <param name="connectionString">Connection string used to open the SQL connection on every check.</param>
    public PimsMetricsHealthCheck(string connectionString)
    {
        ConnectionString = connectionString;
    }

    /// <summary>
    /// get - The connection string used to reach the database.
    /// </summary>
    public string ConnectionString { get; }

    /// <summary>
    /// Opens a connection to the database, reads the DBVERSION static variable and sets the
    /// 'api_deployment_info' gauge to 1 with the app, database and runtime versions as labels.
    /// </summary>
    /// <param name="context">Health check context; its registration supplies the status reported on failure.</param>
    /// <param name="cancellationToken">Token used to cancel opening the connection and running the query.</param>
    /// <returns>
    /// Healthy when no <see cref="DbException"/> occurred; otherwise a result carrying the
    /// registration's failure status and the exception.
    /// </returns>
    public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default)
    {
        using (var connection = new SqlConnection(ConnectionString))
        {
            try
            {
                await connection.OpenAsync(cancellationToken);

                if (connection.Database != null)
                {
                    // Send various deployment metrics to prometheus as a custom metric: 'api_deployment_info'
                    using (var command = connection.CreateCommand())
                    {
                        command.CommandText = DbVersionQuery;

                        // Async execution keeps the health endpoint non-blocking and honours cancellation.
                        // The scalar is null when no row matches and DBNull when the column is NULL;
                        // 'as string' maps both to null so the fallback label is used instead of throwing.
                        var scalar = await command.ExecuteScalarAsync(cancellationToken);
                        var dbVersion = (scalar as string) ?? UnknownVersion;

                        // Assembly version metadata is optional, so guard against a null Version.
                        var appVersion = GetType().Assembly.GetName().Version?.ToString() ?? UnknownVersion;
                        var runtimeVersion = System.Runtime.InteropServices.RuntimeInformation.FrameworkDescription;

                        AppDeploymentInfo.WithLabels(appVersion, dbVersion, runtimeVersion).Set(1.0);
                    }
                }
            }
            catch (DbException ex)
            {
                return new HealthCheckResult(status: context.Registration.FailureStatus, exception: ex);
            }
        }

        return HealthCheckResult.Healthy();
    }
}
}
8 changes: 8 additions & 0 deletions source/backend/api/Startup.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
using Pims.Api.Helpers;
using Pims.Api.Helpers.Exceptions;
using Pims.Api.Helpers.Healthchecks;
using Pims.Api.Helpers.HealthChecks;
using Pims.Api.Helpers.Logging;
using Pims.Api.Helpers.Mapping;
using Pims.Api.Helpers.Middleware;
Expand Down Expand Up @@ -244,6 +245,13 @@ public void ConfigureServices(IServiceCollection services)
HealthStatus.Unhealthy,
new string[] { "services" });

services.AddHealthChecks()
.AddCheck(
"api-metrics",
new PimsMetricsHealthCheck(csBuilder.ConnectionString),
HealthStatus.Unhealthy,
new string[] { "services" });

services.AddApiVersioning(options =>
{
options.ReportApiVersions = true;
Expand Down
12 changes: 12 additions & 0 deletions tools/prometheus/alert-rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Prometheus alerting rules; this file is mounted into the Prometheus container
# and referenced from prometheus.yml via `rule_files`.
groups:
- name: Hardware alerts
# NOTE(review): per the Prometheus rule-group docs, `limit: 0` should mean "no limit"
# on the number of alerts this group may produce - confirm.
limit: 0
rules:
# Fires when the health-check status metric labelled name="sqlserver" equals 0
# continuously for the `for` duration below.
# NOTE(review): presumably 0 means "unhealthy" for this metric - verify against the exporter.
- alert: SqlServerDown
expr: aspnetcore_healthcheck_status{name="sqlserver"} == 0
# The expression must hold for this long before the alert moves from pending to firing.
for: 1m
labels:
severity: critical
annotations:
summary: SQL Server down
# {{ $labels.instance }} is a template expanded with the `instance` label of the alerting series.
description: "SQL server has been down on {{ $labels.instance }} for more than 1 minute."
12 changes: 7 additions & 5 deletions tools/prometheus/prometheus.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
global:
scrape_interval: 15s
scrape_interval: 10s
scrape_timeout: 10s
evaluation_interval: 15s
evaluation_interval: 10s
# Rules and alerts are read from the specified file(s)
rule_files:
- alert-rules.yml
alerting:
alertmanagers:
- scheme: http
Expand All @@ -10,10 +13,9 @@ alerting:
static_configs:
- targets: []
scrape_configs:
- job_name: TrackAPIUsage
- job_name: psp_api
honor_timestamps: true
scrape_interval: 15s
scrape_timeout: 10s
scrape_interval: 10s
metrics_path: /metrics
scheme: http
static_configs:
Expand Down
Loading