Skip to content

Commit 87a1c59

Browse files
committed
xapi_ha: avoid raising Not_found when joining a liveset
Joining the liveset has been found to raise `Not_found` in some cases. Transform these exceptions to others that are more readable and show the exact cause. Signed-off-by: Pau Ruiz Safont <pau.safont@vates.tech>
1 parent 6ff5090 commit 87a1c59

File tree

2 files changed

+61
-51
lines changed

2 files changed

+61
-51
lines changed

ocaml/xapi/xapi_ha.ml

Lines changed: 60 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -100,33 +100,32 @@ let get_uuid_to_ip_mapping () =
100100
| None ->
101101
[]
102102

103+
let to_result ~none = function Some v -> Ok v | None -> Error none
104+
103105
(** Without using the Pool's database, returns the IP address of a particular host
104106
named by UUID. *)
105107
let address_of_host_uuid uuid =
106108
let table = get_uuid_to_ip_mapping () in
107-
if not (List.mem_assoc uuid table) then (
108-
error "Failed to find the IP address of host UUID %s" uuid ;
109-
raise Not_found
110-
) else
111-
List.assoc uuid table
109+
List.assoc_opt uuid table |> to_result ~none:Not_found
112110

113111
(** Without using the Pool's database, returns the UUID of a particular host named by
114112
heartbeat IP address. This is only necesary because the liveset info doesn't include
115113
the host IP address *)
116114
let uuid_of_host_address address =
117115
let table = List.map (fun (k, v) -> (v, k)) (get_uuid_to_ip_mapping ()) in
118-
match List.assoc_opt address table with
119-
| None ->
120-
error "Failed to find the UUID address of host with address %s" address ;
121-
raise Not_found
122-
| Some uuid_str -> (
123-
match Uuidx.of_string uuid_str with
124-
| None ->
125-
error "Failed parse UUID of host with address %s" address ;
126-
raise (Invalid_argument "Invalid UUID")
127-
| Some uuid ->
128-
uuid
129-
)
116+
let invalid_uuid = Invalid_argument "Invalid UUID" in
117+
let to_uuid str = Uuidx.of_string str |> to_result ~none:invalid_uuid in
118+
List.assoc_opt address table
119+
|> to_result ~none:Not_found
120+
|> Fun.flip Result.bind to_uuid
121+
122+
let ok_or_raise map_error = function Ok v -> v | Error exn -> map_error exn
123+
124+
let master_address_exn __FUN e =
125+
let exn = Printexc.to_string e in
126+
let msg = Printf.sprintf "unable to gather the coordinator's IP: %s" exn in
127+
error "%s: %s" __FUN msg ;
128+
raise Api_errors.(Server_error (internal_error, [msg]))
130129

131130
(** Called in two circumstances:
132131
1. When I started up I thought I was the master but my proposal was rejected by the
@@ -145,7 +144,9 @@ let on_master_failure () =
145144
done
146145
in
147146
let become_slave_of uuid =
148-
let address = address_of_host_uuid uuid in
147+
let address =
148+
address_of_host_uuid uuid |> ok_or_raise (master_address_exn __FUNCTION__)
149+
in
149150
info "This node will become the slave of host %s (%s)" uuid address ;
150151
Xapi_pool_transition.become_another_masters_slave address ;
151152
(* XXX CA-16388: prevent blocking *)
@@ -170,19 +171,17 @@ let on_master_failure () =
170171
"ha_can_not_be_master_on_next_boot set: I cannot be master; looking \
171172
for another master" ;
172173
let liveset = query_liveset () in
174+
let open Xha_interface.LiveSetInformation in
173175
match
174176
Hashtbl.fold
175177
(fun uuid host acc ->
176-
if
177-
host.Xha_interface.LiveSetInformation.Host.master
178-
&& host.Xha_interface.LiveSetInformation.Host.liveness
179-
(* CP-25481: a dead host may still have the master lock *)
180-
then
178+
(* CP-25481: a dead host may still have the master lock *)
179+
if host.Host.master && host.Host.liveness then
181180
uuid :: acc
182181
else
183182
acc
184183
)
185-
liveset.Xha_interface.LiveSetInformation.hosts []
184+
liveset.hosts []
186185
with
187186
| [] ->
188187
info "no other master exists yet; waiting 5 seconds and retrying" ;
@@ -197,6 +196,18 @@ let on_master_failure () =
197196
)
198197
done
199198

199+
let master_uuid_exn __FUN e =
200+
let exn = Printexc.to_string e in
201+
let msg = Printf.sprintf "unable to gather the coordinator's UUID: %s" exn in
202+
error "%s: %s" __FUN msg ;
203+
raise Api_errors.(Server_error (internal_error, [msg]))
204+
205+
let master_not_in_liveset_exn __FUN e =
206+
let exn = Printexc.to_string e in
207+
let msg = Printf.sprintf "unable to gather the coordinator's info: %s" exn in
208+
error "%s: %s" __FUN msg ;
209+
raise Api_errors.(Server_error (internal_error, [msg]))
210+
200211
module Timeouts = struct
201212
type t = {
202213
heart_beat_interval: int
@@ -463,16 +474,17 @@ module Monitor = struct
463474
(* WARNING: must not touch the database or perform blocking I/O *)
464475
let process_liveset_on_slave liveset =
465476
let address = Pool_role.get_master_address () in
466-
let master_uuid = uuid_of_host_address address in
477+
let master_uuid =
478+
uuid_of_host_address address
479+
|> ok_or_raise (master_uuid_exn __FUNCTION__)
480+
in
481+
let open Xha_interface.LiveSetInformation in
467482
let master_info =
468-
Hashtbl.find liveset.Xha_interface.LiveSetInformation.hosts
469-
master_uuid
483+
Hashtbl.find_opt liveset.hosts master_uuid
484+
|> to_result ~none:Not_found
485+
|> ok_or_raise (master_not_in_liveset_exn __FUNCTION__)
470486
in
471-
if
472-
true
473-
&& master_info.Xha_interface.LiveSetInformation.Host.liveness
474-
&& master_info.Xha_interface.LiveSetInformation.Host.master
475-
then
487+
if master_info.Host.liveness && master_info.Host.master then
476488
debug
477489
"The node we think is the master is still alive and marked \
478490
as master; this is OK"
@@ -1386,6 +1398,7 @@ let preconfigure_host __context localhost statevdis metadata_vdi generation =
13861398
Localdb.put Constants.ha_base_t (string_of_int base_t)
13871399

13881400
let join_liveset __context host =
1401+
let __FUN = __FUNCTION__ in
13891402
info "Host.ha_join_liveset host = %s" (Ref.string_of host) ;
13901403
ha_start_daemon () ;
13911404
Localdb.put Constants.ha_disable_failover_decisions "false" ;
@@ -1403,38 +1416,35 @@ let join_liveset __context host =
14031416
(* If this host is a slave then we must wait to confirm that the master manages to
14041417
assert itself, otherwise our monitoring thread might attempt a hostile takeover *)
14051418
let master_address = Pool_role.get_master_address () in
1406-
let master_uuid = uuid_of_host_address master_address in
1419+
let master_uuid =
1420+
uuid_of_host_address master_address
1421+
|> ok_or_raise (master_uuid_exn __FUN)
1422+
in
14071423
let master_found = ref false in
14081424
while not !master_found do
14091425
(* It takes a non-trivial amount of time for the master to assert itself: we might
14101426
as well wait here rather than enumerating all the if/then/else branches where we
14111427
should wait. *)
14121428
Thread.delay 5. ;
14131429
let liveset = query_liveset () in
1414-
debug "Liveset: %s"
1415-
(Xha_interface.LiveSetInformation.to_summary_string liveset) ;
1416-
if
1417-
liveset.Xha_interface.LiveSetInformation.status
1418-
= Xha_interface.LiveSetInformation.Status.Online
1419-
then
1430+
let open Xha_interface.LiveSetInformation in
1431+
debug "Liveset: %s" (to_summary_string liveset) ;
1432+
if liveset.status = Status.Online then
14201433
(* 'master' is the node we believe should become the xHA-level master initially *)
14211434
let master =
1422-
Hashtbl.find liveset.Xha_interface.LiveSetInformation.hosts
1423-
master_uuid
1435+
Hashtbl.find_opt liveset.hosts master_uuid
1436+
|> to_result ~none:Not_found
1437+
|> ok_or_raise (master_not_in_liveset_exn __FUN)
14241438
in
1425-
if master.Xha_interface.LiveSetInformation.Host.master then (
1439+
if master.Host.master then (
14261440
info "existing master has successfully asserted itself" ;
14271441
master_found := true (* loop will terminate *)
14281442
) else if
14291443
false
1430-
|| (not master.Xha_interface.LiveSetInformation.Host.liveness)
1431-
|| master
1432-
.Xha_interface.LiveSetInformation.Host.state_file_corrupted
1433-
|| (not
1434-
master
1435-
.Xha_interface.LiveSetInformation.Host.state_file_access
1436-
)
1437-
|| master.Xha_interface.LiveSetInformation.Host.excluded
1444+
|| (not master.Host.liveness)
1445+
|| master.Host.state_file_corrupted
1446+
|| (not master.Host.state_file_access)
1447+
|| master.Host.excluded
14381448
then (
14391449
error "Existing master has failed during HA enable process" ;
14401450
failwith "Existing master failed during HA enable process"

quality-gate.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ unixgetenv () {
110110
}
111111

112112
hashtblfind () {
113-
N=35
113+
N=33
114114
# Looks for all .ml files except the ones using Core.Hashtbl.find,
115115
# which already returns Option
116116
HASHTBLFIND=$(git grep -P -r --count 'Hashtbl.find(?!_opt)' -- '**/*.ml' ':!ocaml/xapi-storage-script/main.ml' | cut -d ':' -f 2 | paste -sd+ - | bc)

0 commit comments

Comments
 (0)