Skip to content

Commit

Permalink
CA-395789: Add polling to cluster health state update
Browse files Browse the repository at this point in the history
It was observed that corosync-notifyd might send a notification before
the data shown by corosync-quorumtool has been updated. This causes
xapi-clusterd to return out-of-date cluster information to xapi.
Xapi then has no chance to update this state again until there
is a further change in the cluster (which may not happen for a long
time).

Add polling in xapi to refresh the cluster info every 5 minutes,
so that even in the worst case we still get up-to-date information
after a 5-minute delay.

Signed-off-by: Vincent Liu <shuntian.liu2@cloud.com>
  • Loading branch information
Vincent-lau committed Jul 22, 2024
1 parent 8337fa9 commit d9b18bd
Showing 1 changed file with 16 additions and 7 deletions.
23 changes: 16 additions & 7 deletions ocaml/xapi/xapi_clustering.ml
Original file line number Diff line number Diff line change
Expand Up @@ -535,10 +535,10 @@ module Watcher = struct

let cluster_change_watcher : bool Atomic.t = Atomic.make false

(* this is the time it takes for the update request to time out. It is ok to set
(* This is the time it takes for the update request to time out. It is ok to set
it to a relatively long value since the call will return immediately if there
is an update *)
let cluster_change_interval = Mtime.Span.min
is an update. *)
let cluster_change_interval = Mtime.Span.(5 * min)

let cluster_stack_watcher : bool Atomic.t = Atomic.make false

Expand All @@ -550,7 +550,7 @@ module Watcher = struct
while !Daemon.enabled do
let m =
Cluster_client.LocalClient.UPDATES.get (rpc ~__context)
"call cluster watcher"
"cluster change watcher call"
(Clock.Timer.span_to_s cluster_change_interval)
in
match Idl.IdM.run @@ Cluster_client.IDL.T.get m with
Expand All @@ -562,9 +562,18 @@ module Watcher = struct
| None ->
()
)
| Error (InternalError "UPDATES.Timeout") ->
(* UPDATES.get timed out, this is normal, now retry *)
()
| Error (InternalError "UPDATES.Timeout") -> (
(* UPDATES.get timed out, this is normal. *)
match find_cluster_host ~__context ~host with
| Some ch ->
let cluster = Db.Cluster_host.get_cluster ~__context ~self:ch in
(* CA-395789: We send a query to xapi-clusterd to fetch the latest state
anyway in case there is a race and the previous update did not give the
most up-to-date information *)
on_corosync_update ~__context ~cluster ["routine updates"]
| None ->
()
)
| Error (InternalError message) | Error (Unix_error message) ->
warn "%s: Cannot query cluster host updates with error %s"
__FUNCTION__ message
Expand Down

0 comments on commit d9b18bd

Please sign in to comment.