CA-394109: Reduce number of alerts
Currently there are two ways of raising an alert when a cluster host
joins or leaves the cluster: 1. through an API call such as
cluster-host-leave; 2. through the cluster watcher, which checks for
updates from `corosync-notifyd`. As a result, every such alert is
raised twice.

This patch favours the second approach, as it also accounts for an
"unclean" leave of a cluster host. Moreover, a "clean" leave triggered
through the API causes a change in `corosync-notifyd` which the watcher
detects anyway, so dropping the API-side alerts solves the
double-alerting problem.
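For context, the surviving watcher path works roughly as follows: each time `corosync-notifyd` reports a membership update, the watcher diffs the set of live hosts against its previous snapshot and raises at most one alert. The sketch below is illustrative only, not the actual watcher code: it assumes a hypothetical blocking `poll_update` callback and plain host values, and of the names used only `maybe_generate_alert` appears in the code this commit touches.

(* Illustrative sketch: every name except maybe_generate_alert
 * (see the diff below) is a hypothetical stand-in. *)
let watch_membership ~poll_update ~maybe_generate_alert =
  let rec loop prev_hosts =
    (* Block until corosync-notifyd reports a membership change. *)
    let live_hosts = poll_update () in
    let missing_hosts =
      List.filter (fun h -> not (List.mem h live_hosts)) prev_hosts
    in
    let new_hosts =
      List.filter (fun h -> not (List.mem h prev_hosts)) live_hosts
    in
    if missing_hosts <> [] || new_hosts <> [] then
      (* Single call site: an API-triggered "clean" leave also surfaces
       * as a corosync-notifyd update, so it is not reported twice. *)
      maybe_generate_alert ~missing_hosts ~new_hosts ;
    loop live_hosts
  in
  loop []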

Signed-off-by: Vincent Liu <shuntian.liu2@cloud.com>
Vincent-lau committed Jun 18, 2024
1 parent 242e626 commit c0fe06a
Showing 1 changed file with 0 additions and 19 deletions.
ocaml/xapi/xapi_cluster_host.ml: 0 additions & 19 deletions

@@ -13,7 +13,6 @@
 *)
 
 open Xapi_clustering
-open Xapi_cluster_helpers
 open Ipaddr_rpc_type
 
 module D = Debug.Make (struct let name = "xapi_cluster_host" end)
@@ -55,20 +54,6 @@ let call_api_function_with_alert ~__context ~msg ~cls ~obj_uuid ~body
       raise err
   )
 
-let alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts =
-  let num_hosts = Db.Cluster_host.get_all ~__context |> List.length in
-  let cluster = Db.Cluster_host.get_cluster ~__context ~self:cluster_host in
-  let quorum = Db.Cluster.get_quorum ~__context ~self:cluster |> Int64.to_int in
-  maybe_generate_alert ~__context ~missing_hosts ~new_hosts ~num_hosts ~quorum
-
-let alert_for_cluster_host_leave ~__context ~cluster_host =
-  alert_for_cluster_host ~__context ~cluster_host ~missing_hosts:[cluster_host]
-    ~new_hosts:[]
-
-let alert_for_cluster_host_join ~__context ~cluster_host =
-  alert_for_cluster_host ~__context ~cluster_host ~missing_hosts:[]
-    ~new_hosts:[cluster_host]
-
 (* Create xapi db object for cluster_host, resync_host calls clusterd *)
 let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host =
   with_clustering_lock __LOC__ (fun () ->
@@ -81,7 +66,6 @@ let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host =
         ~enabled:false ~current_operations:[] ~allowed_operations:[]
         ~other_config:[] ~joined:false ~live:false
         ~last_update_live:API.Date.epoch ;
-      alert_for_cluster_host_join ~__context ~cluster_host:ref ;
       ref
   )
 
@@ -274,14 +258,12 @@ let destroy_op ~__context ~self ~force =
   let result = local_fn (rpc ~__context) dbg in
   match Idl.IdM.run @@ Cluster_client.IDL.T.get result with
   | Ok () ->
-      alert_for_cluster_host_leave ~__context ~cluster_host:self ;
      Db.Cluster_host.destroy ~__context ~self ;
      debug "Cluster_host.%s was successful" fn_str ;
      Xapi_clustering.Daemon.disable ~__context
   | Error error ->
      warn "Error occurred during Cluster_host.%s" fn_str ;
      if force then (
-        alert_for_cluster_host_leave ~__context ~cluster_host:self ;
        let ref_str = Ref.string_of self in
        Db.Cluster_host.destroy ~__context ~self ;
        debug "Cluster_host %s force destroyed." ref_str
@@ -329,7 +311,6 @@ let forget ~__context ~self =
      Db.Cluster.set_pending_forget ~__context ~self:cluster ~value:[] ;
      (* must not disable the daemon here, because we declared another unreachable node dead,
       * not the current one *)
-      alert_for_cluster_host_leave ~__context ~cluster_host:self ;
      Db.Cluster_host.destroy ~__context ~self ;
      debug "Cluster_host.forget was successful"
   | Error error ->
