@@ -100,33 +100,32 @@ let get_uuid_to_ip_mapping () =
100100 | None ->
101101 []
102102
103+ let to_result ~none = function Some v -> Ok v | None -> Error none
104+
103105(* * Without using the Pool's database, returns the IP address of a particular host
104106 named by UUID. *)
105107let address_of_host_uuid uuid =
106108 let table = get_uuid_to_ip_mapping () in
107- if not (List. mem_assoc uuid table) then (
108- error " Failed to find the IP address of host UUID %s" uuid ;
109- raise Not_found
110- ) else
111- List. assoc uuid table
109+ List. assoc_opt uuid table |> to_result ~none: Not_found
112110
113111(* * Without using the Pool's database, returns the UUID of a particular host named by
114112 heartbeat IP address. This is only necesary because the liveset info doesn't include
115113 the host IP address *)
116114let uuid_of_host_address address =
117115 let table = List. map (fun (k , v ) -> (v, k)) (get_uuid_to_ip_mapping () ) in
118- match List. assoc_opt address table with
119- | None ->
120- error " Failed to find the UUID address of host with address %s" address ;
121- raise Not_found
122- | Some uuid_str -> (
123- match Uuidx. of_string uuid_str with
124- | None ->
125- error " Failed parse UUID of host with address %s" address ;
126- raise (Invalid_argument " Invalid UUID" )
127- | Some uuid ->
128- uuid
129- )
116+ let invalid_uuid = Invalid_argument " Invalid UUID" in
117+ let to_uuid str = Uuidx. of_string str |> to_result ~none: invalid_uuid in
118+ List. assoc_opt address table
119+ |> to_result ~none: Not_found
120+ |> Fun. flip Result. bind to_uuid
121+
122+ let ok_or_raise map_error = function Ok v -> v | Error exn -> map_error exn
123+
124+ let master_address_exn __FUN e =
125+ let exn = Printexc. to_string e in
126+ let msg = Printf. sprintf " unable to gather the coordinator's IP: %s" exn in
127+ error " %s: %s" __FUN msg ;
128+ raise Api_errors. (Server_error (internal_error, [msg]))
130129
131130(* * Called in two circumstances:
132131 1. When I started up I thought I was the master but my proposal was rejected by the
@@ -145,7 +144,9 @@ let on_master_failure () =
145144 done
146145 in
147146 let become_slave_of uuid =
148- let address = address_of_host_uuid uuid in
147+ let address =
148+ address_of_host_uuid uuid |> ok_or_raise (master_address_exn __FUNCTION__)
149+ in
149150 info " This node will become the slave of host %s (%s)" uuid address ;
150151 Xapi_pool_transition. become_another_masters_slave address ;
151152 (* XXX CA-16388: prevent blocking *)
@@ -170,19 +171,17 @@ let on_master_failure () =
170171 " ha_can_not_be_master_on_next_boot set: I cannot be master; looking \
171172 for another master" ;
172173 let liveset = query_liveset () in
174+ let open Xha_interface.LiveSetInformation in
173175 match
174176 Hashtbl. fold
175177 (fun uuid host acc ->
176- if
177- host.Xha_interface.LiveSetInformation.Host. master
178- && host.Xha_interface.LiveSetInformation.Host. liveness
179- (* CP-25481: a dead host may still have the master lock *)
180- then
178+ (* CP-25481: a dead host may still have the master lock *)
179+ if host.Host. master && host.Host. liveness then
181180 uuid :: acc
182181 else
183182 acc
184183 )
185- liveset.Xha_interface.LiveSetInformation. hosts []
184+ liveset.hosts []
186185 with
187186 | [] ->
188187 info " no other master exists yet; waiting 5 seconds and retrying" ;
@@ -197,6 +196,18 @@ let on_master_failure () =
197196 )
198197 done
199198
199+ let master_uuid_exn __FUN e =
200+ let exn = Printexc. to_string e in
201+ let msg = Printf. sprintf " unable to gather the coordinator's UUID: %s" exn in
202+ error " %s: %s" __FUN msg ;
203+ raise Api_errors. (Server_error (internal_error, [msg]))
204+
205+ let master_not_in_liveset_exn __FUN e =
206+ let exn = Printexc. to_string e in
207+ let msg = Printf. sprintf " unable to gather the coordinator's info: %s" exn in
208+ error " %s: %s" __FUN msg ;
209+ raise Api_errors. (Server_error (internal_error, [msg]))
210+
200211module Timeouts = struct
201212 type t = {
202213 heart_beat_interval : int
@@ -463,16 +474,17 @@ module Monitor = struct
463474 (* WARNING: must not touch the database or perform blocking I/O *)
464475 let process_liveset_on_slave liveset =
465476 let address = Pool_role. get_master_address () in
466- let master_uuid = uuid_of_host_address address in
477+ let master_uuid =
478+ uuid_of_host_address address
479+ |> ok_or_raise (master_uuid_exn __FUNCTION__)
480+ in
481+ let open Xha_interface.LiveSetInformation in
467482 let master_info =
468- Hashtbl. find liveset.Xha_interface.LiveSetInformation. hosts
469- master_uuid
483+ Hashtbl. find_opt liveset.hosts master_uuid
484+ |> to_result ~none: Not_found
485+ |> ok_or_raise (master_not_in_liveset_exn __FUNCTION__)
470486 in
471- if
472- true
473- && master_info.Xha_interface.LiveSetInformation.Host. liveness
474- && master_info.Xha_interface.LiveSetInformation.Host. master
475- then
487+ if master_info.Host. liveness && master_info.Host. master then
476488 debug
477489 " The node we think is the master is still alive and marked \
478490 as master; this is OK"
@@ -1386,6 +1398,7 @@ let preconfigure_host __context localhost statevdis metadata_vdi generation =
13861398 Localdb. put Constants. ha_base_t (string_of_int base_t)
13871399
13881400let join_liveset __context host =
1401+ let __FUN = __FUNCTION__ in
13891402 info " Host.ha_join_liveset host = %s" (Ref. string_of host) ;
13901403 ha_start_daemon () ;
13911404 Localdb. put Constants. ha_disable_failover_decisions " false" ;
@@ -1403,38 +1416,35 @@ let join_liveset __context host =
14031416 (* If this host is a slave then we must wait to confirm that the master manages to
14041417 assert itself, otherwise our monitoring thread might attempt a hostile takeover *)
14051418 let master_address = Pool_role. get_master_address () in
1406- let master_uuid = uuid_of_host_address master_address in
1419+ let master_uuid =
1420+ uuid_of_host_address master_address
1421+ |> ok_or_raise (master_uuid_exn __FUN)
1422+ in
14071423 let master_found = ref false in
14081424 while not ! master_found do
14091425 (* It takes a non-trivial amount of time for the master to assert itself: we might
14101426 as well wait here rather than enumerating all the if/then/else branches where we
14111427 should wait. *)
14121428 Thread. delay 5. ;
14131429 let liveset = query_liveset () in
1414- debug " Liveset: %s"
1415- (Xha_interface.LiveSetInformation. to_summary_string liveset) ;
1416- if
1417- liveset.Xha_interface.LiveSetInformation. status
1418- = Xha_interface.LiveSetInformation.Status. Online
1419- then
1430+ let open Xha_interface.LiveSetInformation in
1431+ debug " Liveset: %s" (to_summary_string liveset) ;
1432+ if liveset.status = Status. Online then
14201433 (* 'master' is the node we believe should become the xHA-level master initially *)
14211434 let master =
1422- Hashtbl. find liveset.Xha_interface.LiveSetInformation. hosts
1423- master_uuid
1435+ Hashtbl. find_opt liveset.hosts master_uuid
1436+ |> to_result ~none: Not_found
1437+ |> ok_or_raise (master_not_in_liveset_exn __FUN)
14241438 in
1425- if master.Xha_interface.LiveSetInformation. Host. master then (
1439+ if master.Host. master then (
14261440 info " existing master has successfully asserted itself" ;
14271441 master_found := true (* loop will terminate *)
14281442 ) else if
14291443 false
1430- || (not master.Xha_interface.LiveSetInformation.Host. liveness)
1431- || master
1432- .Xha_interface.LiveSetInformation.Host. state_file_corrupted
1433- || (not
1434- master
1435- .Xha_interface.LiveSetInformation.Host. state_file_access
1436- )
1437- || master.Xha_interface.LiveSetInformation.Host. excluded
1444+ || (not master.Host. liveness)
1445+ || master.Host. state_file_corrupted
1446+ || (not master.Host. state_file_access)
1447+ || master.Host. excluded
14381448 then (
14391449 error " Existing master has failed during HA enable process" ;
14401450 failwith " Existing master failed during HA enable process"
0 commit comments