Skip to content

Commit

Permalink
Merge pull request #5871 from Vincent-lau/private/shul2/health-poll
Browse files Browse the repository at this point in the history
CA-395789: Add polling to cluster health state update
  • Loading branch information
Vincent-lau authored Aug 2, 2024
2 parents 3c6d064 + 58a1420 commit 195f932
Showing 1 changed file with 24 additions and 13 deletions.
37 changes: 24 additions & 13 deletions ocaml/xapi/xapi_clustering.ml
Original file line number Diff line number Diff line change
Expand Up @@ -427,11 +427,16 @@ let compute_corosync_max_host_failures ~__context =
corosync_ha_max_hosts

module Watcher = struct
(* Sentinel update name. The watcher loop passes [[routine_updates]] to
   [on_corosync_update] when UPDATES.get times out, and
   [on_corosync_update] compares its argument against that list so it can
   log a routine poll instead of a count of corosync_notifyd updates. *)
let routine_updates = "routine updates"

let on_corosync_update ~__context ~cluster updates =
debug
"%s: Received %d updates from corosync_notifyd, run diagnostics to get \
new state"
__FUNCTION__ (List.length updates) ;
if updates = [routine_updates] then
debug "%s: Perform routine updates" __FUNCTION__
else
debug
"%s: Received %d updates from corosync_notifyd, run diagnostics to get \
new state"
__FUNCTION__ (List.length updates) ;
let m =
Cluster_client.LocalClient.diagnostics (rpc ~__context)
"update quorum api fields with diagnostics"
Expand Down Expand Up @@ -535,10 +540,10 @@ module Watcher = struct

(* Atomic boolean flag, initially [false].
   NOTE(review): presumably records whether the cluster change watcher is
   already running; the code that reads and sets it is not visible here,
   so confirm before relying on this. *)
let cluster_change_watcher : bool Atomic.t = Atomic.make false

(* this is the time it takes for the update request to time out. It is ok to set
(* This is the time it takes for the update request to time out. It is ok to set
it to a relatively long value since the call will return immediately if there
is an update *)
let cluster_change_interval = Mtime.Span.min
is an update. *)
let cluster_change_interval = Mtime.Span.(5 * min)

(* Atomic boolean flag, initially [false].
   NOTE(review): presumably records whether the cluster stack watcher is
   already running; the code that reads and sets it is not visible here,
   so confirm before relying on this. *)
let cluster_stack_watcher : bool Atomic.t = Atomic.make false

Expand All @@ -550,21 +555,27 @@ module Watcher = struct
while !Daemon.enabled do
let m =
Cluster_client.LocalClient.UPDATES.get (rpc ~__context)
"call cluster watcher"
"cluster change watcher call"
(Clock.Timer.span_to_s cluster_change_interval)
in
match Idl.IdM.run @@ Cluster_client.IDL.T.get m with
| Ok updates -> (
let find_cluster_and_update updates =
match find_cluster_host ~__context ~host with
| Some ch ->
let cluster = Db.Cluster_host.get_cluster ~__context ~self:ch in
on_corosync_update ~__context ~cluster updates
| None ->
()
)
in
match Idl.IdM.run @@ Cluster_client.IDL.T.get m with
| Ok updates ->
(* Received updates from corosync-notifyd *)
find_cluster_and_update updates
| Error (InternalError "UPDATES.Timeout") ->
(* UPDATES.get timed out, this is normal, now retry *)
()
(* UPDATES.get timed out, this is normal. *)
(* CA-395789: Query xapi-clusterd for the latest state anyway, in case a
race meant that the previous update did not carry the most up-to-date
information. *)
find_cluster_and_update [routine_updates]
| Error (InternalError message) | Error (Unix_error message) ->
warn "%s: Cannot query cluster host updates with error %s"
__FUNCTION__ message
Expand Down

0 comments on commit 195f932

Please sign in to comment.