CA-395789: Add polling to cluster health state update

It was observed that corosync-notifyd might send a notification before the data shown by corosync-quorumtool has been updated. This causes xapi-clusterd to return out-of-date cluster information to xapi. Xapi then has no chance to update this state again until there is a further change in the cluster (which may not happen for a long time). Add polling in xapi to refresh the cluster info every 5 minutes, so that in the worst case we still get up-to-date information after a 5-minute delay.

Signed-off-by: Vincent Liu <shuntian.liu2@cloud.com>
xapi-project · Jul 22, 2024 · d9b18bd · d9b18bd
1 parent 8337fa9
commit d9b18bd
Showing 1 changed file with 16 additions and 7 deletions.
diff --git a/ocaml/xapi/xapi_clustering.ml b/ocaml/xapi/xapi_clustering.ml
@@ -535,10 +535,10 @@ module Watcher = struct
 
  let cluster_change_watcher : bool Atomic.t = Atomic.make false
 
- (* this is the time it takes for the update request to time out. It is ok to set
+ (* This is the time it takes for the update request to time out. It is ok to set
  it to a relatively long value since the call will return immediately if there
- is an update *)
- let cluster_change_interval = Mtime.Span.min
+ is an update. *)
+ let cluster_change_interval = Mtime.Span.(5 * min)
 
  let cluster_stack_watcher : bool Atomic.t = Atomic.make false
 
@@ -550,7 +550,7 @@ module Watcher = struct
  while !Daemon.enabled do
  let m =
  Cluster_client.LocalClient.UPDATES.get (rpc ~__context)
- "call cluster watcher"
+ "cluster change watcher call"
  (Clock.Timer.span_to_s cluster_change_interval)
  in
  match Idl.IdM.run @@ Cluster_client.IDL.T.get m with
@@ -562,9 +562,18 @@ module Watcher = struct
  | None ->
  ()
  )
- | Error (InternalError "UPDATES.Timeout") ->
- (* UPDATES.get timed out, this is normal, now retry *)
- ()
+ | Error (InternalError "UPDATES.Timeout") -> (
+ (* UPDATES.get timed out, this is normal. *)
+ match find_cluster_host ~__context ~host with
+ | Some ch ->
+ let cluster = Db.Cluster_host.get_cluster ~__context ~self:ch in
+ (* CA-395789: We send a query to xapi-clusterd to fetch the latest state
+ anyway in case there is a race and the previous update did not give the
+ most up-to-date information *)
+ on_corosync_update ~__context ~cluster ["routine updates"]
+ | None ->
+ ()
+ )
  | Error (InternalError message) | Error (Unix_error message) ->
  warn "%s: Cannot query cluster host updates with error %s"
  __FUNCTION__ message