 from tests.integration.ha_tests.helpers import (
     METADATA,
     app_name,
+    change_master_start_timeout,
     count_writes,
+    fetch_cluster_members,
+    get_master_start_timeout,
     get_primary,
-    kill_process,
+    is_replica,
     postgresql_ready,
     secondary_up_to_date,
+    send_signal_to_process,
     start_continuous_writes,
     stop_continuous_writes,
)
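The import hunk shows kill_process being renamed to send_signal_to_process, so one helper can deliver SIGKILL, SIGSTOP, or SIGCONT alike. The helper body is outside this diff; below is only a sketch of what it might look like, assuming it shells into the unit through pytest-operator's OpsTest.juju and that pkill is available on the unit. The signature mirrors the call sites in the hunks below; everything else is an assumption, not the repository's actual code.

```python
# Hypothetical sketch, not the repository's actual helper (which lives in
# tests/integration/ha_tests/helpers.py and may differ).
from pytest_operator.plugin import OpsTest


async def send_signal_to_process(
    ops_test: OpsTest, unit_name: str, process: str, kill_code: str
) -> None:
    """Send a signal (e.g. SIGKILL, SIGSTOP, SIGCONT) to a process on a unit."""
    # juju ssh runs the command on the unit; pkill -x matches the exact process name.
    return_code, _, _ = await ops_test.juju(
        "ssh", unit_name, "sudo", "pkill", "--signal", kill_code, "-x", process
    )
    assert return_code == 0, f"failed to send {kill_code} to {process} on {unit_name}"
```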
@@ -52,8 +56,12 @@ async def test_kill_db_process(
     # Start an application that continuously writes data to the database.
     await start_continuous_writes(ops_test, app)
 
+    # Change the "master_start_timeout" parameter to speed up the fail-over.
+    original_master_start_timeout = await get_master_start_timeout(ops_test)
+    await change_master_start_timeout(ops_test, 0)
+
     # Kill the database process.
-    await kill_process(ops_test, primary_name, process, kill_code="SIGKILL")
+    await send_signal_to_process(ops_test, primary_name, process, kill_code="SIGKILL")
 
     async with ops_test.fast_forward():
         # Verify new writes are continuing by counting the number of writes before and after a
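The get_master_start_timeout and change_master_start_timeout helpers used in the hunk above are also new in this PR, with bodies outside the diff. A minimal sketch, under the assumption that they talk to Patroni's REST API (GET and PATCH on the /config endpoint, which Patroni exposes) and reuse app_name from the same helpers module:

```python
# Hypothetical sketches; assumes Patroni's REST API listens on port 8008 on
# each unit. app_name is the helper imported above from the same module.
import requests
from pytest_operator.plugin import OpsTest

from tests.integration.ha_tests.helpers import app_name


async def get_master_start_timeout(ops_test: OpsTest) -> int:
    """Read master_start_timeout from Patroni's /config endpoint."""
    app = await app_name(ops_test)
    unit_ip = ops_test.model.applications[app].units[0].public_address
    config = requests.get(f"http://{unit_ip}:8008/config").json()
    # Patroni falls back to 300 seconds when the key is unset.
    return config.get("master_start_timeout", 300)


async def change_master_start_timeout(ops_test: OpsTest, seconds: int) -> None:
    """Set master_start_timeout through Patroni's /config endpoint."""
    app = await app_name(ops_test)
    unit_ip = ops_test.model.applications[app].units[0].public_address
    requests.patch(f"http://{unit_ip}:8008/config", json={"master_start_timeout": seconds})
```

Setting the timeout to 0 makes Patroni fail over immediately instead of waiting for the primary to recover, which is what these tests rely on.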
@@ -72,6 +80,83 @@ async def test_kill_db_process(
     new_primary_name = await get_primary(ops_test, app)
     assert new_primary_name != primary_name
 
+    # Revert the "master_start_timeout" parameter to avoid another fail-over.
+    await change_master_start_timeout(ops_test, original_master_start_timeout)
+
+    # Verify that the old primary is now a replica.
+    assert await is_replica(
+        ops_test, primary_name
+    ), "there is more than one primary in the cluster."
+
+    # Verify that all units are part of the same cluster.
+    member_ips = await fetch_cluster_members(ops_test)
+    ip_addresses = [unit.public_address for unit in ops_test.model.applications[app].units]
+    assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."
+
+    # Verify that no writes to the database were missed after stopping the writes.
+    total_expected_writes = await stop_continuous_writes(ops_test)
+    for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
+        with attempt:
+            actual_writes = await count_writes(ops_test)
+            assert total_expected_writes == actual_writes, "writes to the db were missed."
+
+    # Verify that the old primary is up to date.
+    assert await secondary_up_to_date(
+        ops_test, primary_name, total_expected_writes
+    ), "secondary not up to date with the cluster after restarting."
+
+
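test_freeze_db_process below requests a master_start_timeout fixture that is not defined in this hunk. Presumably it guarantees the Patroni setting gets restored even when a test fails midway; a hypothetical sketch of what it could look like (depending on the pytest-asyncio mode the project uses, it may need pytest_asyncio.fixture instead):

```python
# Hypothetical fixture sketch; the real one is outside this diff and may differ.
import pytest

from tests.integration.ha_tests.helpers import (
    change_master_start_timeout,
    get_master_start_timeout,
)


@pytest.fixture()
async def master_start_timeout(ops_test):
    """Restore Patroni's master_start_timeout even if a test fails midway."""
    original = await get_master_start_timeout(ops_test)
    yield
    await change_master_start_timeout(ops_test, original)
```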
+@pytest.mark.ha_self_healing_tests
+@pytest.mark.parametrize("process", DB_PROCESSES)
+async def test_freeze_db_process(
+    ops_test: OpsTest, process: str, continuous_writes, master_start_timeout
+) -> None:
+    # Locate primary unit.
+    app = await app_name(ops_test)
+    primary_name = await get_primary(ops_test, app)
+
+    # Start an application that continuously writes data to the database.
+    await start_continuous_writes(ops_test, app)
+
+    # Change the "master_start_timeout" parameter to speed up the fail-over.
+    original_master_start_timeout = await get_master_start_timeout(ops_test)
+    await change_master_start_timeout(ops_test, 0)
+
+    # Freeze the database process.
+    await send_signal_to_process(ops_test, primary_name, process, "SIGSTOP")
+
+    async with ops_test.fast_forward():
+        # Verify new writes are continuing by counting the number of writes before and after a
+        # 3 minute wait (a little longer than the loop wait configuration needed to trigger a
+        # fail-over once master_start_timeout is changed; freezing the DB process also takes
+        # some extra time to trigger the fail-over).
+        writes = await count_writes(ops_test, primary_name)
+        for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
+            with attempt:
+                more_writes = await count_writes(ops_test, primary_name)
+                assert more_writes > writes, "writes not continuing to DB"
+
+        # Verify that a new primary gets elected (i.e. the old primary becomes a secondary).
+        for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
+            with attempt:
+                new_primary_name = await get_primary(ops_test, app)
+                assert new_primary_name != primary_name
+
+        # Revert the "master_start_timeout" parameter to avoid another fail-over.
+        await change_master_start_timeout(ops_test, original_master_start_timeout)
+
+        # Un-freeze the old primary.
+        await send_signal_to_process(ops_test, primary_name, process, "SIGCONT")
+
+        # Verify that the database service got restarted and is ready on the old primary.
+        assert await postgresql_ready(ops_test, primary_name)
+
+    # Verify that the old primary is now a replica.
+    assert await is_replica(
+        ops_test, primary_name
+    ), "there is more than one primary in the cluster."
+
+    # Verify that all units are part of the same cluster.
+    member_ips = await fetch_cluster_members(ops_test)
+    ip_addresses = [unit.public_address for unit in ops_test.model.applications[app].units]
+    assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."
+
     # Verify that no writes to the database were missed after stopping the writes.
     total_expected_writes = await stop_continuous_writes(ops_test)
     for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
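For completeness: the two remaining new helpers, fetch_cluster_members and is_replica, are also defined outside this diff. Both plausibly read Patroni's /cluster endpoint (a real Patroni REST API route that lists members with their host and role); the sketch below is an assumption, including the unit-name-to-member-name mapping.

```python
# Hypothetical sketches; assumes Patroni's /cluster endpoint on port 8008 and
# that Patroni member names are Juju unit names with "/" replaced by "-".
import requests
from pytest_operator.plugin import OpsTest

from tests.integration.ha_tests.helpers import app_name


async def fetch_cluster_members(ops_test: OpsTest) -> list[str]:
    """Return the host IPs Patroni reports for all cluster members."""
    app = await app_name(ops_test)
    unit_ip = ops_test.model.applications[app].units[0].public_address
    members = requests.get(f"http://{unit_ip}:8008/cluster").json()["members"]
    return [member["host"] for member in members]


async def is_replica(ops_test: OpsTest, unit_name: str) -> bool:
    """Return True if Patroni reports the unit's role as replica."""
    app = await app_name(ops_test)
    unit_ip = ops_test.model.applications[app].units[0].public_address
    members = requests.get(f"http://{unit_ip}:8008/cluster").json()["members"]
    member_name = unit_name.replace("/", "-")
    return any(m["name"] == member_name and m["role"] == "replica" for m in members)
```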