Feature/more reliable uptime calculation (#747)
* New database table holding monitor run info

* SQL interface for new table

* Updated the uptime calculation to rely on the number of monitor test runs instead (sketched below)
jstuczyn authored Aug 23, 2021
1 parent 2923d4b commit 15ae0f5
Showing 6 changed files with 260 additions and 64 deletions.
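
In rough terms, the fix changes the denominator of the uptime ratio: instead of dividing a node's "up" reports by the number of reports recorded for that node, it divides by the number of monitor runs that actually happened in the window. A minimal sketch of the idea (illustrative names, not code from this diff):

// before: a node tested by a single run and found up scored 100%
// uptime = up_reports / reports_for_this_node
//
// after: the denominator is the number of completed monitor runs,
// so a node that was up for 1 of 24 runs scores ~4%
fn uptime_percent(up_reports: usize, monitor_runs: usize) -> u8 {
    if monitor_runs == 0 {
        return 0; // no runs recorded yet (e.g. right after the migration)
    }
    // clamp: old data may contain more reports than recorded runs
    let up = up_reports.min(monitor_runs);
    ((up as f64 / monitor_runs as f64) * 100.0).round() as u8
}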
7 changes: 7 additions & 0 deletions validator-api/migrations/20210819120000_monitor_runs.sql
@@ -0,0 +1,7 @@
+-- keeping track of all monitor runs that have happened will help to
+-- solve the issue of a mixnode being online for only a single check and yet being assigned 100% uptime
+CREATE TABLE monitor_run
+(
+    id        INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
+    timestamp INTEGER NOT NULL
+)
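
The table is deliberately minimal: every consumer only ever needs to count rows inside a time window. A hypothetical query of the kind the new SQL interface wraps (the named bounds are illustrative, not part of the migration):

-- monitor runs completed within the last day
SELECT COUNT(*) as count FROM monitor_run
WHERE timestamp > :day_ago AND timestamp < :now;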
12 changes: 12 additions & 0 deletions validator-api/src/network_monitor/monitor/mod.rs
@@ -77,6 +77,18 @@ impl Monitor {
             // TODO: slightly more graceful shutdown here
             process::exit(1);
         }
+
+        // indicate our run has completed successfully and should be used in any future
+        // uptime calculations
+        if let Err(err) = self.node_status_storage.insert_monitor_run().await {
+            error!(
+                "Failed to submit monitor run information to the database - {}",
+                err
+            );
+
+            // TODO: slightly more graceful shutdown here
+            process::exit(1);
+        }
     }

     // checking it this way with a TestReport is rather suboptimal but given the fact we're only
25 changes: 21 additions & 4 deletions validator-api/src/node_status_api/models.rs
@@ -7,6 +7,7 @@ use rocket::http::{ContentType, Status};
 use rocket::response::{self, Responder, Response};
 use rocket::Request;
 use serde::{Deserialize, Serialize};
+use sqlx::types::time::OffsetDateTime;
 use std::convert::TryFrom;
 use std::fmt::{self, Display, Formatter};
 use std::io::Cursor;
@@ -90,13 +91,21 @@ pub struct MixnodeStatusReport {

 impl MixnodeStatusReport {
     pub(crate) fn construct_from_last_day_reports(
+        report_time: OffsetDateTime,
         identity: String,
         owner: String,
         last_day_ipv4: Vec<NodeStatus>,
         last_day_ipv6: Vec<NodeStatus>,
+        last_hour_test_runs: usize,
+        last_day_test_runs: usize,
     ) -> Self {
-        let node_uptimes =
-            NodeUptimes::calculate_from_last_day_reports(last_day_ipv4, last_day_ipv6);
+        let node_uptimes = NodeUptimes::calculate_from_last_day_reports(
+            report_time,
+            last_day_ipv4,
+            last_day_ipv6,
+            last_hour_test_runs,
+            last_day_test_runs,
+        );

         MixnodeStatusReport {
             identity,
@@ -128,13 +137,21 @@ pub struct GatewayStatusReport {

 impl GatewayStatusReport {
     pub(crate) fn construct_from_last_day_reports(
+        report_time: OffsetDateTime,
         identity: String,
         owner: String,
         last_day_ipv4: Vec<NodeStatus>,
         last_day_ipv6: Vec<NodeStatus>,
+        last_hour_test_runs: usize,
+        last_day_test_runs: usize,
     ) -> Self {
-        let node_uptimes =
-            NodeUptimes::calculate_from_last_day_reports(last_day_ipv4, last_day_ipv6);
+        let node_uptimes = NodeUptimes::calculate_from_last_day_reports(
+            report_time,
+            last_day_ipv4,
+            last_day_ipv6,
+            last_hour_test_runs,
+            last_day_test_runs,
+        );

         GatewayStatusReport {
             identity,
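Both report constructors now take the report time and the run counts explicitly, which keeps the calculation deterministic for a given set of inputs. A hypothetical call site (variable names assumed, not taken from this diff):

let report_time = OffsetDateTime::now_utc();
let report = MixnodeStatusReport::construct_from_last_day_reports(
    report_time,
    identity,
    owner,
    last_day_ipv4,       // Vec<NodeStatus> gathered over the last 24h
    last_day_ipv6,
    last_hour_test_runs, // e.g. storage.get_monitor_runs_count(hour_ago, now)
    last_day_test_runs,  // e.g. storage.get_monitor_runs_count(day_ago, now)
);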
69 changes: 44 additions & 25 deletions validator-api/src/node_status_api/utils.rs
@@ -4,7 +4,9 @@
 use crate::node_status_api::models::Uptime;
 use crate::node_status_api::{FIFTEEN_MINUTES, ONE_HOUR};
 use crate::storage::models::NodeStatus;
+use log::warn;
 use sqlx::types::time::OffsetDateTime;
+use std::cmp::min;

 // A temporary helper struct used to produce reports for active nodes.
 pub(crate) struct ActiveNodeDayStatuses {
@@ -30,33 +32,23 @@ pub(crate) struct NodeUptimes {

 impl NodeUptimes {
     pub(crate) fn calculate_from_last_day_reports(
+        report_time: OffsetDateTime,
         last_day_ipv4: Vec<NodeStatus>,
         last_day_ipv6: Vec<NodeStatus>,
+        last_hour_test_runs: usize,
+        last_day_test_runs: usize,
     ) -> Self {
-        let now = OffsetDateTime::now_utc();
-        let hour_ago = (now - ONE_HOUR).unix_timestamp();
-        let fifteen_minutes_ago = (now - FIFTEEN_MINUTES).unix_timestamp();
+        let hour_ago = (report_time - ONE_HOUR).unix_timestamp();
+        let fifteen_minutes_ago = (report_time - FIFTEEN_MINUTES).unix_timestamp();

-        let ipv4_day_total = last_day_ipv4.len();
-        let ipv6_day_total = last_day_ipv6.len();
-
-        let ipv4_day_up = last_day_ipv4.iter().filter(|report| report.up).count();
-        let ipv6_day_up = last_day_ipv6.iter().filter(|report| report.up).count();
-
-        let ipv4_hour_total = last_day_ipv4
-            .iter()
-            .filter(|report| report.timestamp >= hour_ago)
-            .count();
-        let ipv6_hour_total = last_day_ipv6
-            .iter()
-            .filter(|report| report.timestamp >= hour_ago)
-            .count();
+        let mut ipv4_day_up = last_day_ipv4.iter().filter(|report| report.up).count();
+        let mut ipv6_day_up = last_day_ipv6.iter().filter(|report| report.up).count();

-        let ipv4_hour_up = last_day_ipv4
+        let mut ipv4_hour_up = last_day_ipv4
             .iter()
             .filter(|report| report.up && report.timestamp >= hour_ago)
             .count();
-        let ipv6_hour_up = last_day_ipv6
+        let mut ipv6_hour_up = last_day_ipv6
             .iter()
             .filter(|report| report.up && report.timestamp >= hour_ago)
             .count();
@@ -73,15 +65,42 @@ impl NodeUptimes {
             .map(|status| status.timestamp >= fifteen_minutes_ago && status.up) // make sure it's within the last 15min
             .unwrap_or_default();

-        // the unwraps in Uptime::from_ratio are fine because it's impossible for us to have more "up" results than all results in total
-        // because both of those values originate from the same vector
+        // If we somehow have more "up" reports than actual test runs, something weird is going on
+        // (or we just started running this code on old data, so if it appears for the first 24h it's fine
+        // and actually expected, as we would not have any run information from the past).
+        // Either way, bound the number of "up" reports by the number of test runs and log a warning
+        // if that happens
+        if ipv4_hour_up > last_hour_test_runs || ipv6_hour_up > last_hour_test_runs {
+            warn!(
+                "We have more 'up' reports than the actual number of test runs in the last hour! ({} ipv4 'ups', {} ipv6 'ups' for {} test runs)",
+                ipv4_hour_up,
+                ipv6_hour_up,
+                last_hour_test_runs,
+            );
+            ipv4_hour_up = min(ipv4_hour_up, last_hour_test_runs);
+            ipv6_hour_up = min(ipv6_hour_up, last_hour_test_runs);
+        }
+
+        if ipv4_day_up > last_day_test_runs || ipv6_day_up > last_day_test_runs {
+            warn!(
+                "We have more 'up' reports than the actual number of test runs in the last day! ({} ipv4 'ups', {} ipv6 'ups' for {} test runs)",
+                ipv4_day_up,
+                ipv6_day_up,
+                last_day_test_runs,
+            );
+            ipv4_day_up = min(ipv4_day_up, last_day_test_runs);
+            ipv6_day_up = min(ipv6_day_up, last_day_test_runs);
+        }
+
+        // the unwraps in Uptime::from_ratio are fine because it's impossible to have more "up" results
+        // than total test runs, as we just bounded them
         NodeUptimes {
             most_recent_ipv4,
             most_recent_ipv6,
-            last_hour_ipv4: Uptime::from_ratio(ipv4_hour_up, ipv4_hour_total).unwrap(),
-            last_hour_ipv6: Uptime::from_ratio(ipv6_hour_up, ipv6_hour_total).unwrap(),
-            last_day_ipv4: Uptime::from_ratio(ipv4_day_up, ipv4_day_total).unwrap(),
-            last_day_ipv6: Uptime::from_ratio(ipv6_day_up, ipv6_day_total).unwrap(),
+            last_hour_ipv4: Uptime::from_ratio(ipv4_hour_up, last_hour_test_runs).unwrap(),
+            last_hour_ipv6: Uptime::from_ratio(ipv6_hour_up, last_hour_test_runs).unwrap(),
+            last_day_ipv4: Uptime::from_ratio(ipv4_day_up, last_day_test_runs).unwrap(),
+            last_day_ipv6: Uptime::from_ratio(ipv6_day_up, last_day_test_runs).unwrap(),
         }
     }
 }
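
A quick way to see the clamping at work, as a hypothetical unit test; up_status_now is an assumed helper producing an "up" NodeStatus stamped with the current time, and the assertion assumes Uptime implements PartialEq:

#[test]
fn up_reports_are_bounded_by_test_runs() {
    // 5 'up' statuses but only 3 recorded monitor runs, e.g. because the
    // monitor_run table is younger than the status data:
    // we expect 3/3 = 100%, not 5/3 = 166%
    let statuses: Vec<NodeStatus> = (0..5).map(|_| up_status_now()).collect();
    let uptimes = NodeUptimes::calculate_from_last_day_reports(
        OffsetDateTime::now_utc(),
        statuses.clone(),
        statuses,
        3, // last_hour_test_runs
        3, // last_day_test_runs
    );
    assert_eq!(uptimes.last_day_ipv4, Uptime::from_ratio(3, 3).unwrap());
}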
59 changes: 45 additions & 14 deletions validator-api/src/storage/manager.rs
@@ -4,10 +4,8 @@
 use crate::network_monitor::monitor::summary_producer::NodeResult;
 use crate::node_status_api::models::{HistoricalUptime, Uptime};
 use crate::node_status_api::utils::ActiveNodeDayStatuses;
-use crate::node_status_api::ONE_DAY;
 use crate::storage::models::{ActiveNode, NodeStatus};
 use crate::storage::UnixTimestamp;
-use sqlx::types::time::OffsetDateTime;
 use std::convert::TryFrom;

 #[derive(Clone)]
@@ -463,6 +461,43 @@ impl StorageManager {
         Ok(())
     }

+    /// Creates a database entry for a finished network monitor test run.
+    ///
+    /// # Arguments
+    ///
+    /// * `timestamp`: unix timestamp at which the monitor test run occurred
+    pub(crate) async fn insert_monitor_run(
+        &self,
+        timestamp: UnixTimestamp,
+    ) -> Result<(), sqlx::Error> {
+        sqlx::query!("INSERT INTO monitor_run(timestamp) VALUES (?)", timestamp)
+            .execute(&self.connection_pool)
+            .await?;
+        Ok(())
+    }
+
+    /// Obtains the number of network monitor test runs that have occurred within the specified interval.
+    ///
+    /// # Arguments
+    ///
+    /// * `since`: unix timestamp indicating the lower bound of the selection interval.
+    /// * `until`: unix timestamp indicating the upper bound of the selection interval.
+    pub(crate) async fn get_monitor_runs_count(
+        &self,
+        since: UnixTimestamp,
+        until: UnixTimestamp,
+    ) -> Result<i32, sqlx::Error> {
+        let count = sqlx::query!(
+            "SELECT COUNT(*) as count FROM monitor_run WHERE timestamp > ? AND timestamp < ?",
+            since,
+            until,
+        )
+        .fetch_one(&self.connection_pool)
+        .await?
+        .count;
+        Ok(count)
+    }

     pub(crate) async fn purge_old_mixnode_ipv4_statuses(
         &self,
         timestamp: UnixTimestamp,
@@ -579,19 +614,17 @@ impl StorageManager {
     // since technically it doesn't touch any SQL directly
     pub(crate) async fn get_all_active_mixnodes_statuses(
         &self,
+        since: UnixTimestamp,
     ) -> Result<Vec<ActiveNodeDayStatuses>, sqlx::Error> {
-        let now = OffsetDateTime::now_utc();
-        let day_ago = (now - ONE_DAY).unix_timestamp();
-
-        let active_nodes = self.get_all_active_mixnodes(day_ago).await?;
+        let active_nodes = self.get_all_active_mixnodes(since).await?;

         let mut active_day_statuses = Vec::with_capacity(active_nodes.len());
         for active_node in active_nodes.into_iter() {
             let ipv4_statuses = self
-                .get_mixnode_ipv4_statuses_since_by_id(active_node.id, day_ago)
+                .get_mixnode_ipv4_statuses_since_by_id(active_node.id, since)
                 .await?;
             let ipv6_statuses = self
-                .get_mixnode_ipv6_statuses_since_by_id(active_node.id, day_ago)
+                .get_mixnode_ipv6_statuses_since_by_id(active_node.id, since)
                 .await?;

             let statuses = ActiveNodeDayStatuses {
@@ -614,19 +647,17 @@ impl StorageManager {
     // since technically it doesn't touch any SQL directly
     pub(crate) async fn get_all_active_gateways_statuses(
         &self,
+        since: UnixTimestamp,
     ) -> Result<Vec<ActiveNodeDayStatuses>, sqlx::Error> {
-        let now = OffsetDateTime::now_utc();
-        let day_ago = (now - ONE_DAY).unix_timestamp();
-
-        let active_nodes = self.get_all_active_gateways(day_ago).await?;
+        let active_nodes = self.get_all_active_gateways(since).await?;

         let mut active_day_statuses = Vec::with_capacity(active_nodes.len());
         for active_node in active_nodes.into_iter() {
             let ipv4_statuses = self
-                .get_gateway_ipv4_statuses_since_by_id(active_node.id, day_ago)
+                .get_gateway_ipv4_statuses_since_by_id(active_node.id, since)
                 .await?;
             let ipv6_statuses = self
-                .get_gateway_ipv6_statuses_since_by_id(active_node.id, day_ago)
+                .get_gateway_ipv6_statuses_since_by_id(active_node.id, since)
                 .await?;

             let statuses = ActiveNodeDayStatuses {
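
Putting the storage pieces together: the report producer now computes the window bounds once, uses get_monitor_runs_count for the denominators, and passes the same `since` bound to the status queries. A hedged sketch of that caller (the actual wiring lives outside the files shown in this commit):

let now = OffsetDateTime::now_utc();
let day_ago = (now - ONE_DAY).unix_timestamp();
let hour_ago = (now - ONE_HOUR).unix_timestamp();

// denominators for the new uptime ratios
let last_day_test_runs = storage
    .get_monitor_runs_count(day_ago, now.unix_timestamp())
    .await? as usize;
let last_hour_test_runs = storage
    .get_monitor_runs_count(hour_ago, now.unix_timestamp())
    .await? as usize;

// the `since` argument replaces the day-ago bound the manager used to compute internally
let statuses = storage.get_all_active_mixnodes_statuses(day_ago).await?;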