Skip to content

Commit 91e949e

Browse files
committed
xapi_ha: avoid raising Not_found when joining a liveset
Joining the liveset has been found to raise `Not_found` in some cases. Transform these exceptions to others that are more readable and show the exact cause. Signed-off-by: Pau Ruiz Safont <pau.safont@vates.tech>
1 parent a23bf25 commit 91e949e

File tree

2 files changed

+61
-51
lines changed

2 files changed

+61
-51
lines changed

ocaml/xapi/xapi_ha.ml

Lines changed: 60 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -100,33 +100,32 @@ let get_uuid_to_ip_mapping () =
100100
| None ->
101101
[]
102102

103+
let to_result ~none = function Some v -> Ok v | None -> Error none
104+
103105
(** Without using the Pool's database, returns the IP address of a particular host
104106
named by UUID. *)
105107
let address_of_host_uuid uuid =
106108
let table = get_uuid_to_ip_mapping () in
107-
if not (List.mem_assoc uuid table) then (
108-
error "Failed to find the IP address of host UUID %s" uuid ;
109-
raise Not_found
110-
) else
111-
List.assoc uuid table
109+
List.assoc_opt uuid table |> to_result ~none:Not_found
112110

113111
(** Without using the Pool's database, returns the UUID of a particular host named by
114112
heartbeat IP address. This is only necesary because the liveset info doesn't include
115113
the host IP address *)
116114
let uuid_of_host_address address =
117115
let table = List.map (fun (k, v) -> (v, k)) (get_uuid_to_ip_mapping ()) in
118-
match List.assoc_opt address table with
119-
| None ->
120-
error "Failed to find the UUID address of host with address %s" address ;
121-
raise Not_found
122-
| Some uuid_str -> (
123-
match Uuidx.of_string uuid_str with
124-
| None ->
125-
error "Failed parse UUID of host with address %s" address ;
126-
raise (Invalid_argument "Invalid UUID")
127-
| Some uuid ->
128-
uuid
129-
)
116+
let invalid_uuid = Invalid_argument "Invalid UUID" in
117+
let to_uuid str = Uuidx.of_string str |> to_result ~none:invalid_uuid in
118+
List.assoc_opt address table
119+
|> to_result ~none:Not_found
120+
|> Fun.flip Result.bind to_uuid
121+
122+
let ok_or_raise map_error = function Ok v -> v | Error exn -> map_error exn
123+
124+
let master_address_exn __FUN e =
125+
let exn = Printexc.to_string e in
126+
let msg = Printf.sprintf "unable to gather the coordinator's IP: %s" exn in
127+
error "%s: %s" __FUN msg ;
128+
raise Api_errors.(Server_error (internal_error, [msg]))
130129

131130
(** Called in two circumstances:
132131
1. When I started up I thought I was the master but my proposal was rejected by the
@@ -145,7 +144,9 @@ let on_master_failure () =
145144
done
146145
in
147146
let become_slave_of uuid =
148-
let address = address_of_host_uuid uuid in
147+
let address =
148+
address_of_host_uuid uuid |> ok_or_raise (master_address_exn __FUNCTION__)
149+
in
149150
info "This node will become the slave of host %s (%s)" uuid address ;
150151
Xapi_pool_transition.become_another_masters_slave address ;
151152
(* XXX CA-16388: prevent blocking *)
@@ -170,19 +171,17 @@ let on_master_failure () =
170171
"ha_can_not_be_master_on_next_boot set: I cannot be master; looking \
171172
for another master" ;
172173
let liveset = query_liveset () in
174+
let open Xha_interface.LiveSetInformation in
173175
match
174176
Hashtbl.fold
175177
(fun uuid host acc ->
176-
if
177-
host.Xha_interface.LiveSetInformation.Host.master
178-
&& host.Xha_interface.LiveSetInformation.Host.liveness
179-
(* CP-25481: a dead host may still have the master lock *)
180-
then
178+
(* CP-25481: a dead host may still have the master lock *)
179+
if host.Host.master && host.Host.liveness then
181180
uuid :: acc
182181
else
183182
acc
184183
)
185-
liveset.Xha_interface.LiveSetInformation.hosts []
184+
liveset.hosts []
186185
with
187186
| [] ->
188187
info "no other master exists yet; waiting 5 seconds and retrying" ;
@@ -197,6 +196,18 @@ let on_master_failure () =
197196
)
198197
done
199198

199+
let master_uuid_exn __FUN e =
200+
let exn = Printexc.to_string e in
201+
let msg = Printf.sprintf "unable to gather the coordinator's UUID: %s" exn in
202+
error "%s: %s" __FUN msg ;
203+
raise Api_errors.(Server_error (internal_error, [msg]))
204+
205+
let master_not_in_liveset_exn __FUN e =
206+
let exn = Printexc.to_string e in
207+
let msg = Printf.sprintf "unable to gather the coordinator's info: %s" exn in
208+
error "%s: %s" __FUN msg ;
209+
raise Api_errors.(Server_error (internal_error, [msg]))
210+
200211
module Timeouts = struct
201212
type t = {
202213
heart_beat_interval: int
@@ -463,16 +474,17 @@ module Monitor = struct
463474
(* WARNING: must not touch the database or perform blocking I/O *)
464475
let process_liveset_on_slave liveset =
465476
let address = Pool_role.get_master_address () in
466-
let master_uuid = uuid_of_host_address address in
477+
let master_uuid =
478+
uuid_of_host_address address
479+
|> ok_or_raise (master_uuid_exn __FUNCTION__)
480+
in
481+
let open Xha_interface.LiveSetInformation in
467482
let master_info =
468-
Hashtbl.find liveset.Xha_interface.LiveSetInformation.hosts
469-
master_uuid
483+
Hashtbl.find_opt liveset.hosts master_uuid
484+
|> to_result ~none:Not_found
485+
|> ok_or_raise (master_not_in_liveset_exn __FUNCTION__)
470486
in
471-
if
472-
true
473-
&& master_info.Xha_interface.LiveSetInformation.Host.liveness
474-
&& master_info.Xha_interface.LiveSetInformation.Host.master
475-
then
487+
if master_info.Host.liveness && master_info.Host.master then
476488
debug
477489
"The node we think is the master is still alive and marked \
478490
as master; this is OK"
@@ -1389,6 +1401,7 @@ let preconfigure_host __context localhost statevdis metadata_vdi generation =
13891401
Localdb.put Constants.ha_base_t (string_of_int base_t)
13901402

13911403
let join_liveset __context host =
1404+
let __FUN = __FUNCTION__ in
13921405
info "Host.ha_join_liveset host = %s" (Ref.string_of host) ;
13931406
ha_start_daemon () ;
13941407
Localdb.put Constants.ha_disable_failover_decisions "false" ;
@@ -1406,38 +1419,35 @@ let join_liveset __context host =
14061419
(* If this host is a slave then we must wait to confirm that the master manages to
14071420
assert itself, otherwise our monitoring thread might attempt a hostile takeover *)
14081421
let master_address = Pool_role.get_master_address () in
1409-
let master_uuid = uuid_of_host_address master_address in
1422+
let master_uuid =
1423+
uuid_of_host_address master_address
1424+
|> ok_or_raise (master_uuid_exn __FUN)
1425+
in
14101426
let master_found = ref false in
14111427
while not !master_found do
14121428
(* It takes a non-trivial amount of time for the master to assert itself: we might
14131429
as well wait here rather than enumerating all the if/then/else branches where we
14141430
should wait. *)
14151431
Thread.delay 5. ;
14161432
let liveset = query_liveset () in
1417-
debug "Liveset: %s"
1418-
(Xha_interface.LiveSetInformation.to_summary_string liveset) ;
1419-
if
1420-
liveset.Xha_interface.LiveSetInformation.status
1421-
= Xha_interface.LiveSetInformation.Status.Online
1422-
then
1433+
let open Xha_interface.LiveSetInformation in
1434+
debug "Liveset: %s" (to_summary_string liveset) ;
1435+
if liveset.status = Status.Online then
14231436
(* 'master' is the node we believe should become the xHA-level master initially *)
14241437
let master =
1425-
Hashtbl.find liveset.Xha_interface.LiveSetInformation.hosts
1426-
master_uuid
1438+
Hashtbl.find_opt liveset.hosts master_uuid
1439+
|> to_result ~none:Not_found
1440+
|> ok_or_raise (master_not_in_liveset_exn __FUN)
14271441
in
1428-
if master.Xha_interface.LiveSetInformation.Host.master then (
1442+
if master.Host.master then (
14291443
info "existing master has successfully asserted itself" ;
14301444
master_found := true (* loop will terminate *)
14311445
) else if
14321446
false
1433-
|| (not master.Xha_interface.LiveSetInformation.Host.liveness)
1434-
|| master
1435-
.Xha_interface.LiveSetInformation.Host.state_file_corrupted
1436-
|| (not
1437-
master
1438-
.Xha_interface.LiveSetInformation.Host.state_file_access
1439-
)
1440-
|| master.Xha_interface.LiveSetInformation.Host.excluded
1447+
|| (not master.Host.liveness)
1448+
|| master.Host.state_file_corrupted
1449+
|| (not master.Host.state_file_access)
1450+
|| master.Host.excluded
14411451
then (
14421452
error "Existing master has failed during HA enable process" ;
14431453
failwith "Existing master failed during HA enable process"

quality-gate.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ unixgetenv () {
110110
}
111111

112112
hashtblfind () {
113-
N=35
113+
N=33
114114
# Looks for all .ml files except the ones using Core.Hashtbl.find,
115115
# which already returns Option
116116
HASHTBLFIND=$(git grep -P -r --count 'Hashtbl.find(?!_opt)' -- '**/*.ml' ':!ocaml/xapi-storage-script/main.ml' | cut -d ':' -f 2 | paste -sd+ - | bc)

0 commit comments

Comments
 (0)