From 112adfacfdd79d3cbc60bb4953f8969fcd2eed0c Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Wed, 3 Mar 2021 00:11:41 -0500 Subject: [PATCH 01/37] DAOS-6923 test: Offline Reintegration - More tests Test-tag-hw-medium: pr,hw,medium,ib2 offline_reintegration Summary: - Moved some more common files to osa_utils.py - Added the 200 pool test method - Test with different object class - More ranks excluded and reintegrated Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- .../ftest/osa/osa_offline_reintegration.py | 128 +++++++++--------- .../ftest/osa/osa_offline_reintegration.yaml | 8 +- src/tests/ftest/util/osa_utils.py | 43 ++++++ 3 files changed, 112 insertions(+), 67 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 8201d35a8ca..f456a369ea1 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -11,12 +11,6 @@ from write_host_file import write_host_file from apricot import skipForTicket -try: - # python 3.x - import queue as queue -except ImportError: - # python 2.7 - import Queue as queue class OSAOfflineReintegration(OSAUtils): # pylint: disable=too-many-ancestors @@ -30,51 +24,18 @@ def setUp(self): """Set up for test case.""" super(OSAOfflineReintegration, self).setUp() self.dmg_command = self.get_dmg_command() - self.ior_w_flags = self.params.get("write_flags", '/run/ior/iorflags/*') - self.ior_r_flags = self.params.get("read_flags", '/run/ior/iorflags/*') self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*') self.ior_test_sequence = self.params.get( "ior_test_sequence", '/run/ior/iorflags/*') self.ior_dfs_oclass = self.params.get( "obj_class", '/run/ior/iorflags/*') + self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*') # Recreate the client hostfile without slots defined self.hostfile_clients = write_host_file( self.hostlist_clients, self.workdir, None) - self.out_queue = queue.Queue() - - def run_ior_thread(self, action, oclass, api, test): - """Start the IOR thread for either writing or - reading data to/from a container. - Args: - action (str): Start the IOR thread with Read or - Write - oclass (str): IOR object class - API (str): IOR API - test (list): IOR test sequence - flags (str): IOR flags - """ - if action == "Write": - flags = self.ior_w_flags - else: - flags = self.ior_r_flags - - # Add a thread for these IOR arguments - process = threading.Thread(target=self.ior_thread, - kwargs={"pool": self.pool, - "oclass": oclass, - "api": api, - "test": test, - "flags": flags, - "results": - self.out_queue}) - # Launch the IOR thread - process.start() - # Wait for the thread to finish - process.join() - def run_offline_reintegration_test(self, num_pool, data=False, - server_boot=False): + server_boot=False, oclass=None): """Run the offline reintegration without data. Args: num_pool (int) : total pools to create for testing purposes. @@ -82,45 +43,44 @@ def run_offline_reintegration_test(self, num_pool, data=False, some data in pool. Defaults to False. server_boot (bool) : Perform system stop/start on a rank. Defults to False. + oclass (str) : daos object class string (eg: "RP_2G8") """ # Create a pool pool = {} pool_uuid = [] - exclude_servers = (len(self.hostlist_servers) * 2) - 1 - # Exclude rank : two ranks other than rank 0. 
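        # Note on the rank arithmetic above and the change below: with
        # engines_per_host set to 2 in the server config, ranks span
        # 0..(2 * num_hosts - 1), so the removed lines drew one random
        # non-zero rank from that range; the replacement pins ranks
        # [0, 3, 4] instead, presumably so failures reproduce
        # deterministically.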
- rank = random.randint(1, exclude_servers) + # Exclude ranks [0, 3, 4] + rank = [0, 3, 4] + if oclass is None: + oclass = self.ior_dfs_oclass[0] for val in range(0, num_pool): pool[val] = TestPool(self.context, dmg_command=self.get_dmg_command()) pool[val].get_params(self) - # Split total SCM and NVME size for creating multiple pools. - pool[val].scm_size.value = int(pool[val].scm_size.value / - num_pool) - pool[val].nvme_size.value = int(pool[val].nvme_size.value / - num_pool) pool[val].create() pool_uuid.append(pool[val].uuid) self.pool = pool[val] if data: - self.run_ior_thread("Write", self.ior_dfs_oclass[0], - self.ior_apis[0], self.ior_test_sequence[0]) - - # Exclude and reintegrate the pool_uuid, rank and targets - for val in range(0, num_pool): - self.pool = pool[val] + self.run_ior_thread("Write", oclass, + self.ior_apis[0], + self.ior_test_sequence[0]) + + # Exclude all the ranks + random_pool = random.randint(0, (num_pool-1)) + for val in range(len(rank)): + self.pool = pool[random_pool] self.pool.display_pool_daos_space("Pool space: Beginning") pver_begin = self.get_pool_version() self.log.info("Pool Version at the beginning %s", pver_begin) if server_boot is False: output = self.dmg_command.pool_exclude(self.pool.uuid, - rank) + rank[val]) else: - output = self.dmg_command.system_stop(ranks=rank) + output = self.dmg_command.system_stop(ranks=rank[val]) self.pool.wait_for_rebuild(True) self.log.info(output) - output = self.dmg_command.system_start(ranks=rank) + output = self.dmg_command.system_start(ranks=rank[val]) self.log.info(output) self.is_rebuild_done(3) @@ -133,8 +93,15 @@ def run_offline_reintegration_test(self, num_pool, data=False, # pver_begin + 8 targets. self.assertTrue(pver_exclude > (pver_begin + 8), "Pool Version Error: After exclude") - output = self.dmg_command.pool_reintegrate(self.pool.uuid, - rank) + + # Reintegrate the ranks which was excluded + for val in range(0, len(rank)): + if val == 2: + output = self.dmg_command.pool_reintegrate(self.pool.uuid, + rank[val], "0,2") + else: + output = self.dmg_command.pool_reintegrate(self.pool.uuid, + rank[val]) self.log.info(output) self.is_rebuild_done(3) self.assert_on_rebuild_failure() @@ -145,13 +112,12 @@ def run_offline_reintegration_test(self, num_pool, data=False, self.assertTrue(pver_reint > (pver_exclude + 1), "Pool Version Error: After reintegrate") - for val in range(0, num_pool): - display_string = "Pool{} space at the End".format(val) - self.pool = pool[val] - self.pool.display_pool_daos_space(display_string) + display_string = "Pool{} space at the End".format(random_pool) + self.pool = pool[random_pool] + self.pool.display_pool_daos_space(display_string) if data: - self.run_ior_thread("Read", self.ior_dfs_oclass[0], + self.run_ior_thread("Read", oclass, self.ior_apis[0], self.ior_test_sequence[0]) def test_osa_offline_reintegration(self): @@ -160,6 +126,7 @@ def test_osa_offline_reintegration(self): :avocado: tags=all,daily_regression,hw,medium,ib2 :avocado: tags=osa,offline_reintegration + :avocado: tags=offline_reintegration_exclude """ # Perform reintegration testing with a pool self.run_offline_reintegration_test(1, True) @@ -168,7 +135,36 @@ def test_osa_offline_reintegration(self): def test_osa_offline_reintegration_server_stop(self): """Test ID: DAOS-6748. 
Test Description: Validate Offline Reintegration with server stop - :avocado: tags=all,pr,daily_regression,hw,medium,ib2,osa + :avocado: tags=all,pr,daily_regression,hw,medium,ib2 + :avocado: tags=osa,offline_reintegration :avocado: tags=offline_reintegration_srv_stop """ self.run_offline_reintegration_test(1, data=True, server_boot=True) + + @skipForTicket("DAOS-6505") + def test_osa_offline_reintegration_200_pools(self): + """Test ID: DAOS-6923 + Test Description: Validate Offline Reintegration + with 200 pools + + :avocado: tags=all,full_regression,hw,medium,ib2 + :avocado: tags=osa,offline_reintegration + :avocado: tags=offline_reintegration_200 + """ + # Perform reintegration testing with a pool + self.run_offline_reintegration_test(200, True) + + def test_osa_offline_reintegration_oclass(self): + """Test ID: DAOS-6923 + Test Description: Validate Offline Reintegration + with different object class + + :avocado: tags=all,full_regression,hw,medium,ib2 + :avocado: tags=osa,offline_reintegration + :avocado: tags=offline_reintegration_oclass + """ + # Perform reintegration testing with a pool + for oclass in self.test_oclass: + self.run_offline_reintegration_test(1, data=True, + server_boot=False, + oclass=oclass) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.yaml b/src/tests/ftest/osa/osa_offline_reintegration.yaml index d426d879a65..49773c16732 100644 --- a/src/tests/ftest/osa/osa_offline_reintegration.yaml +++ b/src/tests/ftest/osa/osa_offline_reintegration.yaml @@ -77,4 +77,10 @@ ior: # - [scmsize, nvmesize, transfersize, blocksize] # The values are set to be in the multiples of 10. # Values are appx GB. - - [6000000000, 54000000000, 500000, 500000000] \ No newline at end of file + - [6000000000, 54000000000, 500000, 500000000] +test_obj_class: + oclass: + - RP_2G8 + - RP_3G6 + - EC_2P2G4 + - RP_4G1 \ No newline at end of file diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 9ee0b78ddcc..906bf719cc0 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -6,6 +6,7 @@ """ import ctypes import time +import threading from avocado import fail_on from ior_test_base import IorTestBase @@ -16,6 +17,14 @@ from pydaos.raw import (DaosContainer, IORequest, DaosObj, DaosApiError) +try: + # python 3.x + import queue as queue +except ImportError: + # python 2.7 + import Queue as queue + + class OSAUtils(IorTestBase): # pylint: disable=too-many-ancestors """ @@ -37,6 +46,10 @@ def setUp(self): default=[0])[0] self.record_length = self.params.get("length", '/run/record/*', default=[0])[0] + self.ior_w_flags = self.params.get("write_flags", '/run/ior/iorflags/*', + default="") + self.ior_r_flags = self.params.get("read_flags", '/run/ior/iorflags/*') + self.out_queue = queue.Queue() @fail_on(CommandFailure) def get_pool_leader(self): @@ -163,6 +176,36 @@ def verify_single_object(self): self.obj.close() self.container.close() + def run_ior_thread(self, action, oclass, api, test): + """Start the IOR thread for either writing or + reading data to/from a container. 
+ Args: + action (str): Start the IOR thread with Read or + Write + oclass (str): IOR object class + API (str): IOR API + test (list): IOR test sequence + flags (str): IOR flags + """ + if action == "Write": + flags = self.ior_w_flags + else: + flags = self.ior_r_flags + + # Add a thread for these IOR arguments + process = threading.Thread(target=self.ior_thread, + kwargs={"pool": self.pool, + "oclass": oclass, + "api": api, + "test": test, + "flags": flags, + "results": + self.out_queue}) + # Launch the IOR thread + process.start() + # Wait for the thread to finish + process.join() + def ior_thread(self, pool, oclass, api, test, flags, results): """Start threads and wait until all threads are finished. From 9bc230256d1bef2792bd4bb2100a758169a22bb5 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Fri, 5 Mar 2021 17:33:55 -0500 Subject: [PATCH 02/37] DAOS-6923 test: Added mdtest feature Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_drain.py | 46 ++++++--- src/tests/ftest/osa/osa_offline_drain.yaml | 47 ++++++++- .../ftest/osa/osa_offline_reintegration.py | 96 ++++++++++++++----- .../ftest/osa/osa_offline_reintegration.yaml | 85 +++++++++------- src/tests/ftest/util/mdtest_test_base.py | 4 +- src/tests/ftest/util/osa_utils.py | 71 +++++++------- 6 files changed, 235 insertions(+), 114 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_drain.py b/src/tests/ftest/osa/osa_offline_drain.py index 25c9fb72827..746200e9595 100644 --- a/src/tests/ftest/osa/osa_offline_drain.py +++ b/src/tests/ftest/osa/osa_offline_drain.py @@ -5,9 +5,10 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ import random +import time from osa_utils import OSAUtils from test_utils_pool import TestPool -from apricot import skipForTicket +from write_host_file import write_host_file class OSAOfflineDrain(OSAUtils): @@ -22,19 +23,29 @@ def setUp(self): """Set up for test case.""" super(OSAOfflineDrain, self).setUp() self.dmg_command = self.get_dmg_command() + self.ior_test_sequence = self.params.get( + "ior_test_sequence", '/run/ior/iorflags/*') + # Recreate the client hostfile without slots defined + self.hostfile_clients = write_host_file( + self.hostlist_clients, self.workdir, None) - def run_offline_drain_test(self, num_pool, data=False): + def run_offline_drain_test(self, num_pool, data=False, + oclass=None, drain_during_aggregation=False): """Run the offline drain without data. Args: num_pool (int) : total pools to create for testing purposes. data (bool) : whether pool has no data or to create some data in pool. Defaults to False. 
+ oclass (str): DAOS object class (eg: RP_2G1,etc) + drain_during_aggregation (bool) : Perform drain and aggregation + in parallel """ # Create a pool pool = {} - pool_uuid = [] target_list = [] - drain_servers = (len(self.hostlist_servers) * 2) - 1 + + if oclass is None: + oclass = self.ior_cmd.dfs_oclass.value # Exclude target : random two targets (target idx : 0-7) n = random.randint(0, 6) @@ -42,8 +53,8 @@ def run_offline_drain_test(self, num_pool, data=False): target_list.append(n+1) t_string = "{},{}".format(target_list[0], target_list[1]) - # Drain a rank (or server) - rank = random.randint(1, drain_servers) + # Drain a rank 1 (or server) + rank = 1 for val in range(0, num_pool): pool[val] = TestPool(self.context, dmg_command=self.dmg_command) @@ -54,17 +65,27 @@ def run_offline_drain_test(self, num_pool, data=False): pool[val].nvme_size.value = int(pool[val].nvme_size.value / num_pool) pool[val].create() - pool_uuid.append(pool[val].uuid) self.pool = pool[val] + if drain_during_aggregation is True: + test_seq = self.ior_test_sequence[1] + self.pool.set_property("reclaim", "disabled") + else: + test_seq = self.ior_test_sequence[0] + if data: - self.write_single_object() + self.run_ior_thread("Write", oclass, test_seq) + self.run_mdtest_thread() - # Drain the pool_uuid, rank and targets + # Drain rank and targets for val in range(0, num_pool): self.pool = pool[val] + rank = rank + val self.pool.display_pool_daos_space("Pool space: Beginning") pver_begin = self.get_pool_version() self.log.info("Pool Version at the beginning %s", pver_begin) + if drain_during_aggregation is True: + self.pool.set_property("reclaim", "time") + time.sleep(90) output = self.dmg_command.pool_drain(self.pool.uuid, rank, t_string) self.log.info(output) @@ -82,9 +103,9 @@ def run_offline_drain_test(self, num_pool, data=False): pool[val].display_pool_daos_space(display_string) if data: - self.verify_single_object() + self.run_ior_thread("Read", oclass, test_seq) + self.run_mdtest_thread() - @skipForTicket("DAOS-6668") def test_osa_offline_drain(self): """ JIRA ID: DAOS-4750 @@ -94,5 +115,4 @@ def test_osa_offline_drain(self): :avocado: tags=all,daily_regression,hw,medium,ib2 :avocado: tags=osa,osa_drain,offline_drain """ - for pool_num in range(1, 3): - self.run_offline_drain_test(pool_num, True) + self.run_offline_drain_test(1, True) diff --git a/src/tests/ftest/osa/osa_offline_drain.yaml b/src/tests/ftest/osa/osa_offline_drain.yaml index c1ecf210a30..d8c6a1a52bd 100644 --- a/src/tests/ftest/osa/osa_offline_drain.yaml +++ b/src/tests/ftest/osa/osa_offline_drain.yaml @@ -48,8 +48,10 @@ pool: svcn: 4 control_method: dmg container: - properties: - enable_checksum: True + type: POSIX + control_method: daos + oclass: RP_2G1 + properties: cksum:crc64,cksum_size:16384,srv_cksum:on dkeys: single: no_of_dkeys: @@ -62,3 +64,44 @@ record: 1KB: length: - 1024 +ior: + clientslots: + slots: 48 + test_file: /testFile + repetitions: 1 + dfs_destroy: False + iorflags: + write_flags: "-w -F -k -G 1" + read_flags: "-F -r -R -k -G 1" + api: DFS + dfs_oclass: RP_2G1 + dfs_dir_oclass: RP_2G1 + ior_test_sequence: + # - [scmsize, nvmesize, transfersize, blocksize] + # The values are set to be in the multiples of 10. + # Values are appx GB. 
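    # The two sequences below are intentionally different (a reading
    # inferred from the drain_during_aggregation path): the 500000-byte
    # transfer lays down bulk data, while the 1000-byte transfer creates
    # many small extents so aggregation has real work to do once the
    # "reclaim" property is re-enabled.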
+ - [6000000000, 54000000000, 500000, 500000000] + - [6000000000, 54000000000, 1000, 500000000] +mdtest: + api: DFS + client_processes: + np: 30 + num_of_files_dirs: 4067 # creating total of 120K files + test_dir: "/" + iteration: 1 + dfs_destroy: False + dfs_oclass: RP_2G1 + dfs_dir_oclass: RP_2G1 + manager: "MPICH" + flags: "-u" + wr_size: + 32K: + write_bytes: 32768 + read_bytes: 32768 + verbosity_value: 1 + depth: 0 +test_obj_class: + oclass: + - RP_2G8 + - RP_3G6 + - RP_4G1 diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index f456a369ea1..2fa108cddfb 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -5,7 +5,7 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ import random -import threading +import time from osa_utils import OSAUtils from test_utils_pool import TestPool from write_host_file import write_host_file @@ -24,18 +24,17 @@ def setUp(self): """Set up for test case.""" super(OSAOfflineReintegration, self).setUp() self.dmg_command = self.get_dmg_command() - self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*') self.ior_test_sequence = self.params.get( "ior_test_sequence", '/run/ior/iorflags/*') - self.ior_dfs_oclass = self.params.get( - "obj_class", '/run/ior/iorflags/*') self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*') # Recreate the client hostfile without slots defined self.hostfile_clients = write_host_file( self.hostlist_clients, self.workdir, None) def run_offline_reintegration_test(self, num_pool, data=False, - server_boot=False, oclass=None): + server_boot=False, oclass=None, + reint_during_rebuild=False, + reint_during_aggregation=False): """Run the offline reintegration without data. Args: num_pool (int) : total pools to create for testing purposes. @@ -44,15 +43,20 @@ def run_offline_reintegration_test(self, num_pool, data=False, server_boot (bool) : Perform system stop/start on a rank. Defults to False. oclass (str) : daos object class string (eg: "RP_2G8") + reint_during_rebuild (bool) : Perform reintegration during + rebuild (Defaults to False). + reint_during_aggregation (bool) : Perform reintegration + during aggregation + (Defaults to False). 
""" # Create a pool pool = {} pool_uuid = [] + if oclass is None: + oclass = self.ior_cmd.dfs_oclass.value # Exclude ranks [0, 3, 4] rank = [0, 3, 4] - if oclass is None: - oclass = self.ior_dfs_oclass[0] for val in range(0, num_pool): pool[val] = TestPool(self.context, @@ -61,19 +65,33 @@ def run_offline_reintegration_test(self, num_pool, data=False, pool[val].create() pool_uuid.append(pool[val].uuid) self.pool = pool[val] + if reint_during_aggregation is True: + test_seq = self.ior_test_sequence[1] + self.pool.set_property("reclaim", "disabled") + else: + test_seq = self.ior_test_sequence[0] if data: - self.run_ior_thread("Write", oclass, - self.ior_apis[0], - self.ior_test_sequence[0]) + self.run_ior_thread("Write", oclass, test_seq) + self.run_mdtest_thread() # Exclude all the ranks random_pool = random.randint(0, (num_pool-1)) - for val in range(len(rank)): + for val in range(0, len(rank)): self.pool = pool[random_pool] self.pool.display_pool_daos_space("Pool space: Beginning") pver_begin = self.get_pool_version() self.log.info("Pool Version at the beginning %s", pver_begin) if server_boot is False: + if (reint_during_rebuild is True and val == 0): + # Exclude rank 5 + output = self.dmg_command.pool_exclude(self.pool.uuid, + "5") + self.log.info(output) + self.is_rebuild_done(3) + self.assert_on_rebuild_failure() + if reint_during_aggregation is True: + self.pool.set_property("reclaim", "time") + time.sleep(90) output = self.dmg_command.pool_exclude(self.pool.uuid, rank[val]) else: @@ -81,7 +99,12 @@ def run_offline_reintegration_test(self, num_pool, data=False, self.pool.wait_for_rebuild(True) self.log.info(output) output = self.dmg_command.system_start(ranks=rank[val]) - + # Just try to reintegrate rank 5 + if (reint_during_rebuild is True and val == 2): + # Exclude rank 5 + time.sleep(3) + output = self.dmg_command.pool_reintegrate(self.pool.uuid, + "5") self.log.info(output) self.is_rebuild_done(3) self.assert_on_rebuild_failure() @@ -96,7 +119,7 @@ def run_offline_reintegration_test(self, num_pool, data=False, # Reintegrate the ranks which was excluded for val in range(0, len(rank)): - if val == 2: + if (val == 2 and "RP_2G" in oclass): output = self.dmg_command.pool_reintegrate(self.pool.uuid, rank[val], "0,2") else: @@ -117,8 +140,8 @@ def run_offline_reintegration_test(self, num_pool, data=False, self.pool.display_pool_daos_space(display_string) if data: - self.run_ior_thread("Read", oclass, - self.ior_apis[0], self.ior_test_sequence[0]) + self.run_ior_thread("Read", oclass, test_seq) + self.run_mdtest_thread() def test_osa_offline_reintegration(self): """Test ID: DAOS-4749 @@ -128,8 +151,7 @@ def test_osa_offline_reintegration(self): :avocado: tags=osa,offline_reintegration :avocado: tags=offline_reintegration_exclude """ - # Perform reintegration testing with a pool - self.run_offline_reintegration_test(1, True) + self.run_offline_reintegration_test(1, data=True) @skipForTicket("DAOS-6766, DAOS-6783") def test_osa_offline_reintegration_server_stop(self): @@ -141,19 +163,19 @@ def test_osa_offline_reintegration_server_stop(self): """ self.run_offline_reintegration_test(1, data=True, server_boot=True) - @skipForTicket("DAOS-6505") - def test_osa_offline_reintegration_200_pools(self): + def test_osa_offline_reintegrate_during_rebuild(self): """Test ID: DAOS-6923 - Test Description: Validate Offline Reintegration - with 200 pools + Test Description: Reintegrate rank while rebuild + is happening in parallel - :avocado: tags=all,full_regression,hw,medium,ib2 + :avocado: 
tags=all,daily_regression,hw,medium,ib2 :avocado: tags=osa,offline_reintegration - :avocado: tags=offline_reintegration_200 + :avocado: tags=offline_reintegrate_during_rebuild """ - # Perform reintegration testing with a pool - self.run_offline_reintegration_test(200, True) + self.run_offline_reintegration_test(1, data=True, + reint_during_rebuild=True) + @skipForTicket("DAOS-6905") def test_osa_offline_reintegration_oclass(self): """Test ID: DAOS-6923 Test Description: Validate Offline Reintegration @@ -163,8 +185,30 @@ def test_osa_offline_reintegration_oclass(self): :avocado: tags=osa,offline_reintegration :avocado: tags=offline_reintegration_oclass """ - # Perform reintegration testing with a pool for oclass in self.test_oclass: self.run_offline_reintegration_test(1, data=True, server_boot=False, oclass=oclass) + + def test_osa_offline_reintegrate_during_aggregation(self): + """Test ID: DAOS-6923 + Test Description: Reintegrate rank while aggregation + is happening in parallel + + :avocado: tags=all,full_regression,hw,medium,ib2 + :avocado: tags=osa,offline_reintegration + :avocado: tags=offline_reintegrate_during_aggregation + """ + self.run_offline_reintegration_test(1, data=True, + reint_during_aggregation=True) + + @skipForTicket("DAOS-6505") + def test_osa_offline_reintegration_multiple_pools(self): + """Test ID: DAOS-6923 + Test Description: Validate Offline Reintegration + with multiple pools + + :avocado: tags=all,hw,medium,ib2,osa,offline_reintegration + :avocado: tags=offline_reintegration_multiple_pools + """ + self.run_offline_reintegration_test(200, data=True) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.yaml b/src/tests/ftest/osa/osa_offline_reintegration.yaml index 49773c16732..b29145ef6ad 100644 --- a/src/tests/ftest/osa/osa_offline_reintegration.yaml +++ b/src/tests/ftest/osa/osa_offline_reintegration.yaml @@ -47,40 +47,57 @@ dmg: transport_config: allow_insecure: True pool: - mode: 146 - name: daos_server - scm_size: 6000000000 - nvme_size: 54000000000 - svcn: 4 - control_method: dmg - rebuild_timeout: 120 - pool_query_timeout: 30 + mode: 146 + name: daos_server + scm_size: 6000000000 + nvme_size: 54000000000 + svcn: 4 + control_method: dmg + rebuild_timeout: 120 + pool_query_timeout: 30 container: - type: POSIX - control_method: daos - oclass: RP_2G1 - properties: cksum:crc64,cksum_size:16384,srv_cksum:on + type: POSIX + control_method: daos + oclass: RP_2G1 + properties: cksum:crc64,cksum_size:16384,srv_cksum:on,rf:1 ior: - clientslots: - slots: 48 - test_file: /testFile - repetitions: 1 - dfs_destroy: False - iorflags: - write_flags: "-w -F -k -G 1" - read_flags: "-F -r -R -k -G 1" - ior_api: - - DFS - obj_class: - - RP_2G1 - ior_test_sequence: - # - [scmsize, nvmesize, transfersize, blocksize] - # The values are set to be in the multiples of 10. - # Values are appx GB. - - [6000000000, 54000000000, 500000, 500000000] + clientslots: + slots: 48 + test_file: /testFile + repetitions: 2 + dfs_destroy: False + iorflags: + write_flags: "-w -F -k -G 1" + read_flags: "-F -r -R -k -G 1" + api: DFS + dfs_oclass: RP_2G1 + dfs_dir_oclass: RP_2G1 + ior_test_sequence: + # - [scmsize, nvmesize, transfersize, blocksize] + # The values are set to be in the multiples of 10. + # Values are appx GB. 
+ - [6000000000, 54000000000, 500000, 500000000] + - [6000000000, 54000000000, 1000, 500000000] +mdtest: + api: DFS + client_processes: + np: 30 + num_of_files_dirs: 4067 # creating total of 120K files + test_dir: "/" + iteration: 1 + dfs_destroy: False + dfs_oclass: RP_2G1 + dfs_dir_oclass: RP_2G1 + manager: "MPICH" + flags: "-u" + wr_size: + 32K: + write_bytes: 32768 + read_bytes: 32768 + verbosity_value: 1 + depth: 0 test_obj_class: - oclass: - - RP_2G8 - - RP_3G6 - - EC_2P2G4 - - RP_4G1 \ No newline at end of file + oclass: + - RP_2G8 + - RP_3G6 + - RP_4G1 diff --git a/src/tests/ftest/util/mdtest_test_base.py b/src/tests/ftest/util/mdtest_test_base.py index a60ab73b27a..a0c6570c76c 100755 --- a/src/tests/ftest/util/mdtest_test_base.py +++ b/src/tests/ftest/util/mdtest_test_base.py @@ -28,7 +28,7 @@ def __init__(self, *args, **kwargs): def setUp(self): """Set up each test case.""" # obtain separate logs - self.update_log_file_names() + self.update_log_file_names()g # Start the servers and agents super(MdtestBase, self).setUp() @@ -62,7 +62,7 @@ def execute_mdtest(self): self.run_mdtest(self.get_mdtest_job_manager_command(self.manager), self.processes) # reset self.container if dfs_destroy is True - if self.mdtest_cmd.dfs_destroy: + if self.mdtest_cmd.dfs_destroy is True: self.container = None self.stop_dfuse() diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 906bf719cc0..c4897951412 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -10,6 +10,7 @@ from avocado import fail_on from ior_test_base import IorTestBase +from mdtest_test_base import MdtestBase from command_utils import CommandFailure from ior_utils import IorCommand from job_manager_utils import Mpirun @@ -19,13 +20,13 @@ try: # python 3.x - import queue as queue + import queue as test_queue except ImportError: # python 2.7 - import Queue as queue + import Queue as test_queue -class OSAUtils(IorTestBase): +class OSAUtils(IorTestBase, MdtestBase): # pylint: disable=too-many-ancestors """ Test Class Description: This test runs @@ -49,7 +50,8 @@ def setUp(self): self.ior_w_flags = self.params.get("write_flags", '/run/ior/iorflags/*', default="") self.ior_r_flags = self.params.get("read_flags", '/run/ior/iorflags/*') - self.out_queue = queue.Queue() + self.out_queue = test_queue.Queue() + self.dmg_command.exit_status_exception = False @fail_on(CommandFailure) def get_pool_leader(self): @@ -176,14 +178,13 @@ def verify_single_object(self): self.obj.close() self.container.close() - def run_ior_thread(self, action, oclass, api, test): + def run_ior_thread(self, action, oclass, test): """Start the IOR thread for either writing or reading data to/from a container. Args: action (str): Start the IOR thread with Read or Write oclass (str): IOR object class - API (str): IOR API test (list): IOR test sequence flags (str): IOR flags """ @@ -196,7 +197,6 @@ def run_ior_thread(self, action, oclass, api, test): process = threading.Thread(target=self.ior_thread, kwargs={"pool": self.pool, "oclass": oclass, - "api": api, "test": test, "flags": flags, "results": @@ -206,47 +206,44 @@ def run_ior_thread(self, action, oclass, api, test): # Wait for the thread to finish process.join() - def ior_thread(self, pool, oclass, api, test, flags, results): + def ior_thread(self, pool, oclass, test, flags, results): """Start threads and wait until all threads are finished. 
Args: pool (object): pool handle oclass (str): IOR object class - api (str): IOR api test (list): IOR test sequence flags (str): IOR flags results (queue): queue for returning thread results """ - mpio_util = MpioUtils() - if mpio_util.mpich_installed(self.hostlist_clients) is False: - self.fail("Exiting Test : Mpich not installed on :" - " {}".format(self.hostfile_clients[0])) self.pool = pool - # Define the arguments for the ior_runner_thread method - ior_cmd = IorCommand() - ior_cmd.get_params(self) - ior_cmd.set_daos_params(self.server_group, self.pool) - ior_cmd.dfs_oclass.update(oclass) - ior_cmd.dfs_dir_oclass.update(oclass) - ior_cmd.api.update(api) - ior_cmd.transfer_size.update(test[2]) - ior_cmd.block_size.update(test[3]) - ior_cmd.flags.update(flags) - - # Define the job manager for the IOR command - self.job_manager = Mpirun(ior_cmd, mpitype="mpich") + self.ior_cmd.get_params(self) + self.ior_cmd.set_daos_params(self.server_group, self.pool) + self.ior_cmd.dfs_oclass.update(oclass) + self.ior_cmd.dfs_dir_oclass.update(oclass) # Create container only if self.container is None: self.add_container(self.pool) - self.job_manager.job.dfs_cont.update(self.container.uuid) - env = ior_cmd.get_default_env(str(self.job_manager)) - self.job_manager.assign_hosts(self.hostlist_clients, self.workdir, None) - self.job_manager.assign_processes(self.processes) - self.job_manager.assign_environment(env, True) - - # run IOR Command - try: - self.job_manager.run() - except CommandFailure as _error: - results.put("FAIL") + job_manager = self.get_ior_job_manager_command() + job_manager.job.dfs_cont.update(self.container.uuid) + self.ior_cmd.transfer_size.update(test[2]) + self.ior_cmd.block_size.update(test[3]) + self.ior_cmd.flags.update(flags) + self.run_ior_with_pool(create_pool=False, create_cont=False) + + def run_mdtest_thread(self): + """Start mdtest thread and wait until thread completes. + """ + # Create container only + self.mdtest_cmd.dfs_destroy = False + if self.container is None: + self.add_container(self.pool) + job_manager = self.get_mdtest_job_manager_command(self.manager) + job_manager.job.dfs_cont.update(self.container.uuid) + # Add a thread for these IOR arguments + process = threading.Thread(target=self.execute_mdtest) + # Launch the MDtest thread + process.start() + # Wait for the thread to finish + process.join() From 7225a5b0b775b7bd189983142d9f1778a3fa1322 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Fri, 5 Mar 2021 17:58:44 -0500 Subject: [PATCH 03/37] DAOS-6923 test: Fix checkpatch issues. 
Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 4 ++-- src/tests/ftest/util/mdtest_test_base.py | 2 +- src/tests/ftest/util/osa_utils.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 2fa108cddfb..6bd23596f6f 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -76,7 +76,7 @@ def run_offline_reintegration_test(self, num_pool, data=False, # Exclude all the ranks random_pool = random.randint(0, (num_pool-1)) - for val in range(0, len(rank)): + for val, _ in enumerate(rank):: self.pool = pool[random_pool] self.pool.display_pool_daos_space("Pool space: Beginning") pver_begin = self.get_pool_version() @@ -118,7 +118,7 @@ def run_offline_reintegration_test(self, num_pool, data=False, "Pool Version Error: After exclude") # Reintegrate the ranks which was excluded - for val in range(0, len(rank)): + for val, _ in enumerate(rank): if (val == 2 and "RP_2G" in oclass): output = self.dmg_command.pool_reintegrate(self.pool.uuid, rank[val], "0,2") diff --git a/src/tests/ftest/util/mdtest_test_base.py b/src/tests/ftest/util/mdtest_test_base.py index a0c6570c76c..54e84d74106 100755 --- a/src/tests/ftest/util/mdtest_test_base.py +++ b/src/tests/ftest/util/mdtest_test_base.py @@ -28,7 +28,7 @@ def __init__(self, *args, **kwargs): def setUp(self): """Set up each test case.""" # obtain separate logs - self.update_log_file_names()g + self.update_log_file_names() # Start the servers and agents super(MdtestBase, self).setUp() diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index c4897951412..06166850a2f 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -26,7 +26,7 @@ import Queue as test_queue -class OSAUtils(IorTestBase, MdtestBase): +class OSAUtils(MdtestBase, IorTestBase): # pylint: disable=too-many-ancestors """ Test Class Description: This test runs @@ -206,7 +206,7 @@ def run_ior_thread(self, action, oclass, test): # Wait for the thread to finish process.join() - def ior_thread(self, pool, oclass, test, flags, results): + def ior_thread(self, pool, oclass, test, flags): """Start threads and wait until all threads are finished. Args: @@ -231,7 +231,7 @@ def ior_thread(self, pool, oclass, test, flags, results): self.ior_cmd.block_size.update(test[3]) self.ior_cmd.flags.update(flags) self.run_ior_with_pool(create_pool=False, create_cont=False) - + def run_mdtest_thread(self): """Start mdtest thread and wait until thread completes. 
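        Forces mdtest_cmd.dfs_destroy to False and reuses
        self.container when one already exists, so the tree written
        before an exclude survives for the read-back pass after
        reintegration (summarizing the body added earlier in this
        series).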
""" From 77f8c65bea7d76f448fa5a495bcc24eeee8d8c11 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Fri, 5 Mar 2021 18:03:29 -0500 Subject: [PATCH 04/37] DAOS-6923 test: Fix the typo Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 6bd23596f6f..115868c2dd3 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -76,7 +76,7 @@ def run_offline_reintegration_test(self, num_pool, data=False, # Exclude all the ranks random_pool = random.randint(0, (num_pool-1)) - for val, _ in enumerate(rank):: + for val, _ in enumerate(rank): self.pool = pool[random_pool] self.pool.display_pool_daos_space("Pool space: Beginning") pver_begin = self.get_pool_version() From b2cb8c6e4363aa8635ce81d47eb3e38efa762df8 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Sun, 7 Mar 2021 15:00:21 -0500 Subject: [PATCH 05/37] DAOS-6923 test: Removed unwanted results parameter Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/util/osa_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 06166850a2f..d6ac08c719b 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -198,9 +198,7 @@ def run_ior_thread(self, action, oclass, test): kwargs={"pool": self.pool, "oclass": oclass, "test": test, - "flags": flags, - "results": - self.out_queue}) + "flags": flags) # Launch the IOR thread process.start() # Wait for the thread to finish @@ -214,7 +212,6 @@ def ior_thread(self, pool, oclass, test, flags): oclass (str): IOR object class test (list): IOR test sequence flags (str): IOR flags - results (queue): queue for returning thread results """ self.pool = pool From 078065e4d028c0f47b560b81aae594432e166be3 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Sun, 7 Mar 2021 20:48:58 -0500 Subject: [PATCH 06/37] DAOS-6923 test: Address checkpatch issues. 
Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 4 +--- src/tests/ftest/util/osa_utils.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 115868c2dd3..2041dbbe2bc 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -51,19 +51,17 @@ def run_offline_reintegration_test(self, num_pool, data=False, """ # Create a pool pool = {} - pool_uuid = [] + random_pool = 0 if oclass is None: oclass = self.ior_cmd.dfs_oclass.value # Exclude ranks [0, 3, 4] rank = [0, 3, 4] - for val in range(0, num_pool): pool[val] = TestPool(self.context, dmg_command=self.get_dmg_command()) pool[val].get_params(self) pool[val].create() - pool_uuid.append(pool[val].uuid) self.pool = pool[val] if reint_during_aggregation is True: test_seq = self.ior_test_sequence[1] diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index d6ac08c719b..1f31a089be7 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -198,7 +198,7 @@ def run_ior_thread(self, action, oclass, test): kwargs={"pool": self.pool, "oclass": oclass, "test": test, - "flags": flags) + "flags": flags}) # Launch the IOR thread process.start() # Wait for the thread to finish From c072ccb813a43bb8ba7d4d1db43679fa7b1b53b0 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Tue, 9 Mar 2021 00:25:06 -0500 Subject: [PATCH 07/37] DAOS-6923 test: Run all the tests once now. Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- .../ftest/osa/osa_offline_reintegration.py | 134 ++++++++++-------- .../ftest/osa/osa_offline_reintegration.yaml | 4 +- 2 files changed, 77 insertions(+), 61 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 2041dbbe2bc..a3c78c7a115 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -9,7 +9,7 @@ from osa_utils import OSAUtils from test_utils_pool import TestPool from write_host_file import write_host_file -from apricot import skipForTicket +# from apricot import skipForTicket class OSAOfflineReintegration(OSAUtils): @@ -27,9 +27,12 @@ def setUp(self): self.ior_test_sequence = self.params.get( "ior_test_sequence", '/run/ior/iorflags/*') self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*') + self.loop_test_cnt = self.params.get("iterations", + '/run/loop_test/*') # Recreate the client hostfile without slots defined self.hostfile_clients = write_host_file( self.hostlist_clients, self.workdir, None) + self.dmg_command.exit_status_exception = False def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False, oclass=None, @@ -74,68 +77,71 @@ def run_offline_reintegration_test(self, num_pool, data=False, # Exclude all the ranks random_pool = random.randint(0, (num_pool-1)) - for val, _ in enumerate(rank): - self.pool = pool[random_pool] - self.pool.display_pool_daos_space("Pool space: Beginning") - pver_begin = self.get_pool_version() - self.log.info("Pool Version at the beginning %s", pver_begin) - if server_boot is False: - if (reint_during_rebuild is True and val == 0): - # Exclude rank 5 + for _ in range(0, self.loop_test_cnt): + for val, 
_ in enumerate(rank): + self.pool = pool[random_pool] + self.pool.display_pool_daos_space("Pool space: Beginning") + pver_begin = self.get_pool_version() + self.log.info("Pool Version at the beginning %s", pver_begin) + if server_boot is False: + if (reint_during_rebuild is True and val == 0): + # Exclude rank 5 + output = self.dmg_command.pool_exclude(self.pool.uuid, + "5") + self.log.info(output) + self.is_rebuild_done(3) + self.assert_on_rebuild_failure() + if reint_during_aggregation is True: + self.pool.set_property("reclaim", "time") + time.sleep(90) output = self.dmg_command.pool_exclude(self.pool.uuid, - "5") + rank[val]) + else: + output = self.dmg_command.system_stop(ranks=rank[val]) + self.pool.wait_for_rebuild(True) self.log.info(output) - self.is_rebuild_done(3) - self.assert_on_rebuild_failure() - if reint_during_aggregation is True: - self.pool.set_property("reclaim", "time") - time.sleep(90) - output = self.dmg_command.pool_exclude(self.pool.uuid, - rank[val]) - else: - output = self.dmg_command.system_stop(ranks=rank[val]) - self.pool.wait_for_rebuild(True) + output = self.dmg_command.system_start(ranks=rank[val]) + # Just try to reintegrate rank 5 + if (reint_during_rebuild is True and val == 2): + # Exclude rank 5 + time.sleep(3) + output = self.dmg_command.pool_reintegrate(self.pool.uuid, + "5") self.log.info(output) - output = self.dmg_command.system_start(ranks=rank[val]) - # Just try to reintegrate rank 5 - if (reint_during_rebuild is True and val == 2): - # Exclude rank 5 - time.sleep(3) - output = self.dmg_command.pool_reintegrate(self.pool.uuid, - "5") - self.log.info(output) - self.is_rebuild_done(3) - self.assert_on_rebuild_failure() - - pver_exclude = self.get_pool_version() - self.log.info("Pool Version after exclude %s", pver_exclude) - # Check pool version incremented after pool exclude - # pver_exclude should be greater than - # pver_begin + 8 targets. - self.assertTrue(pver_exclude > (pver_begin + 8), - "Pool Version Error: After exclude") - - # Reintegrate the ranks which was excluded - for val, _ in enumerate(rank): - if (val == 2 and "RP_2G" in oclass): - output = self.dmg_command.pool_reintegrate(self.pool.uuid, - rank[val], "0,2") - else: - output = self.dmg_command.pool_reintegrate(self.pool.uuid, - rank[val]) - self.log.info(output) - self.is_rebuild_done(3) - self.assert_on_rebuild_failure() + self.is_rebuild_done(3) + self.assert_on_rebuild_failure() + + pver_exclude = self.get_pool_version() + self.log.info("Pool Version after exclude %s", pver_exclude) + # Check pool version incremented after pool exclude + # pver_exclude should be greater than + # pver_begin + 8 targets. 
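                # (How the "+ 8" is read here, an inference from the
                # test config rather than a documented guarantee: each
                # engine serves 8 targets and every target move bumps
                # the pool map version, so excluding a whole rank
                # should add more than 8 to the version.)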
+ self.assertTrue(pver_exclude > (pver_begin + 8), + "Pool Version Error: After exclude") + + # Reintegrate the ranks which was excluded + for val, _ in enumerate(rank): + time.sleep(5) + if (val == 2 and "RP_2G" in oclass): + output = self.dmg_command.pool_reintegrate(self.pool.uuid, + rank[val], + "0,2") + else: + output = self.dmg_command.pool_reintegrate(self.pool.uuid, + rank[val]) + self.log.info(output) + self.is_rebuild_done(3) + self.assert_on_rebuild_failure() - pver_reint = self.get_pool_version() - self.log.info("Pool Version after reintegrate %d", pver_reint) - # Check pool version incremented after pool reintegrate - self.assertTrue(pver_reint > (pver_exclude + 1), - "Pool Version Error: After reintegrate") + pver_reint = self.get_pool_version() + self.log.info("Pool Version after reintegrate %d", pver_reint) + # Check pool version incremented after pool reintegrate + self.assertTrue(pver_reint > (pver_exclude + 1), + "Pool Version Error: After reintegrate") - display_string = "Pool{} space at the End".format(random_pool) - self.pool = pool[random_pool] - self.pool.display_pool_daos_space(display_string) + display_string = "Pool{} space at the End".format(random_pool) + self.pool = pool[random_pool] + self.pool.display_pool_daos_space(display_string) if data: self.run_ior_thread("Read", oclass, test_seq) @@ -151,7 +157,6 @@ def test_osa_offline_reintegration(self): """ self.run_offline_reintegration_test(1, data=True) - @skipForTicket("DAOS-6766, DAOS-6783") def test_osa_offline_reintegration_server_stop(self): """Test ID: DAOS-6748. Test Description: Validate Offline Reintegration with server stop @@ -173,7 +178,6 @@ def test_osa_offline_reintegrate_during_rebuild(self): self.run_offline_reintegration_test(1, data=True, reint_during_rebuild=True) - @skipForTicket("DAOS-6905") def test_osa_offline_reintegration_oclass(self): """Test ID: DAOS-6923 Test Description: Validate Offline Reintegration @@ -210,3 +214,13 @@ def test_osa_offline_reintegration_multiple_pools(self): :avocado: tags=offline_reintegration_multiple_pools """ self.run_offline_reintegration_test(200, data=True) + + def test_osa_offline_reintegration_loop_test(self): + """Test ID: DAOS-6923 + Test Description: Validate Offline Reintegration + with multiple pools + + :avocado: tags=all,hw,medium,ib2,osa,offline_reintegration + :avocado: tags=offline_reintegration_loop_test + """ + self.run_offline_reintegration_test(1, data=True) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.yaml b/src/tests/ftest/osa/osa_offline_reintegration.yaml index b29145ef6ad..0882e7da7d2 100644 --- a/src/tests/ftest/osa/osa_offline_reintegration.yaml +++ b/src/tests/ftest/osa/osa_offline_reintegration.yaml @@ -5,7 +5,7 @@ hosts: - server-C test_clients: - client-D -timeout: 1000 +timeout: 3600 server_config: name: daos_server engines_per_host: 2 @@ -101,3 +101,5 @@ test_obj_class: - RP_2G8 - RP_3G6 - RP_4G1 +loop_test: + iterations: 10 From 1ed510eb3bc7a083e88a71c236e5052765ce3072 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Tue, 9 Mar 2021 00:36:23 -0500 Subject: [PATCH 08/37] DAOS-6923 test: Fix checkpatch issues. 
Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 9ba638f3f83..22068863a26 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -24,22 +24,15 @@ def setUp(self): """Set up for test case.""" super(OSAOfflineReintegration, self).setUp() self.dmg_command = self.get_dmg_command() -<<<<<<< HEAD -======= - self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*') ->>>>>>> master - self.ior_test_sequence = self.params.get( - "ior_test_sequence", '/run/ior/iorflags/*') + self.ior_test_sequence = self.params.get("ior_test_sequence", + '/run/ior/iorflags/*') self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*') self.loop_test_cnt = self.params.get("iterations", '/run/loop_test/*') # Recreate the client hostfile without slots defined self.hostfile_clients = write_host_file( self.hostlist_clients, self.workdir, None) -<<<<<<< HEAD self.dmg_command.exit_status_exception = False -======= ->>>>>>> master def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False, oclass=None, From f2cedae16335e25a73be1f6d5dd2d92b479e3ba0 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Tue, 9 Mar 2021 00:52:56 -0500 Subject: [PATCH 09/37] DAOS-6923 test: Added skipForTicket Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 22068863a26..43825a03a5c 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -9,7 +9,7 @@ from osa_utils import OSAUtils from test_utils_pool import TestPool from write_host_file import write_host_file -# from apricot import skipForTicket +from apricot import skipForTicket class OSAOfflineReintegration(OSAUtils): From 3b2b7b7cb48225cd893b757fde872e15515a4110 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Tue, 9 Mar 2021 19:09:37 -0500 Subject: [PATCH 10/37] DAOS-6923 test: Just run offline (server stop) Test-tag-hw-medium: pr,hw,medium,ib2 osa Skip-unit-tests: true Skip-nlt: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-hw-test-large: true Skip-func-hw-test-small: true Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 4 ++++ src/tests/ftest/osa/osa_offline_reintegration.yaml | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 43825a03a5c..d4949714ab6 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -147,6 +147,7 @@ def run_offline_reintegration_test(self, num_pool, data=False, self.run_ior_thread("Read", oclass, test_seq) self.run_mdtest_thread() + @skipForTicket("DAOS-6505") def test_osa_offline_reintegration(self): """Test ID: DAOS-4749 Test Description: Validate Offline Reintegration @@ -166,6 +167,7 @@ def 
test_osa_offline_reintegration_server_stop(self): """ self.run_offline_reintegration_test(1, data=True, server_boot=True) + @skipForTicket("DAOS-6505") def test_osa_offline_reintegrate_during_rebuild(self): """Test ID: DAOS-6923 Test Description: Reintegrate rank while rebuild @@ -178,6 +180,7 @@ def test_osa_offline_reintegrate_during_rebuild(self): self.run_offline_reintegration_test(1, data=True, reint_during_rebuild=True) + @skipForTicket("DAOS-6505") def test_osa_offline_reintegration_oclass(self): """Test ID: DAOS-6923 Test Description: Validate Offline Reintegration @@ -192,6 +195,7 @@ def test_osa_offline_reintegration_oclass(self): server_boot=False, oclass=oclass) + @skipForTicket("DAOS-6505") def test_osa_offline_reintegrate_during_aggregation(self): """Test ID: DAOS-6923 Test Description: Reintegrate rank while aggregation diff --git a/src/tests/ftest/osa/osa_offline_reintegration.yaml b/src/tests/ftest/osa/osa_offline_reintegration.yaml index 0882e7da7d2..a31e002168e 100644 --- a/src/tests/ftest/osa/osa_offline_reintegration.yaml +++ b/src/tests/ftest/osa/osa_offline_reintegration.yaml @@ -81,8 +81,8 @@ ior: mdtest: api: DFS client_processes: - np: 30 - num_of_files_dirs: 4067 # creating total of 120K files + np: 2 + num_of_files_dirs: 100 # creating total of 120K files test_dir: "/" iteration: 1 dfs_destroy: False From 7b83df8b9517d5aa32e6b7ce31078cb5088e680f Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Wed, 10 Mar 2021 23:37:47 -0500 Subject: [PATCH 11/37] DAOS-6923 test: Add the loop testing methods. Test-tag-hw-medium: pr,hw,medium,ib2 osa Skip-unit-tests: true Skip-nlt: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-coverity-test: true Skip-func-hw-test-small: true Skip-func-hw-test-large: true Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- .../ftest/osa/osa_offline_reintegration.py | 60 +++++++------------ .../ftest/osa/osa_offline_reintegration.yaml | 9 +-- src/tests/ftest/util/osa_utils.py | 34 +++++------ 3 files changed, 42 insertions(+), 61 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index d4949714ab6..bad272f8bee 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -27,12 +27,11 @@ def setUp(self): self.ior_test_sequence = self.params.get("ior_test_sequence", '/run/ior/iorflags/*') self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*') - self.loop_test_cnt = self.params.get("iterations", - '/run/loop_test/*') + self.loop_test_cnt = 1 # Recreate the client hostfile without slots defined self.hostfile_clients = write_host_file( self.hostlist_clients, self.workdir, None) - self.dmg_command.exit_status_exception = False + self.dmg_command.exit_status_exception = True def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False, oclass=None, @@ -98,8 +97,9 @@ def run_offline_reintegration_test(self, num_pool, data=False, rank[val]) else: output = self.dmg_command.system_stop(ranks=rank[val]) - self.pool.wait_for_rebuild(True) self.log.info(output) + self.is_rebuild_done(3) + self.assert_on_rebuild_failure() output = self.dmg_command.system_start(ranks=rank[val]) # Just try to reintegrate rank 5 if (reint_during_rebuild is True and val == 2): @@ -115,9 +115,9 @@ def run_offline_reintegration_test(self, num_pool, data=False, self.log.info("Pool Version after exclude %s", pver_exclude) # Check pool version incremented after pool 
exclude # pver_exclude should be greater than - # pver_begin + 8 targets. - self.assertTrue(pver_exclude > (pver_begin + 8), - "Pool Version Error: After exclude") + # pver_begin + 3 (2 targets + exclude) + self.assertTrue(pver_exclude > (pver_begin + 3), + "Pool Version Error: After exclude") # Reintegrate the ranks which was excluded for val, _ in enumerate(rank): @@ -143,20 +143,22 @@ def run_offline_reintegration_test(self, num_pool, data=False, self.pool = pool[random_pool] self.pool.display_pool_daos_space(display_string) - if data: - self.run_ior_thread("Read", oclass, test_seq) - self.run_mdtest_thread() + for val in range(0, num_pool): + self.pool = pool[val] + if data: + self.run_ior_thread("Read", oclass, test_seq) + self.run_mdtest_thread() - @skipForTicket("DAOS-6505") - def test_osa_offline_reintegration(self): - """Test ID: DAOS-4749 + def test_osa_offline_reintegration_multiple_pools(self): + """Test ID: DAOS-6923 Test Description: Validate Offline Reintegration + with multiple pools :avocado: tags=all,daily_regression,hw,medium,ib2 :avocado: tags=osa,offline_reintegration - :avocado: tags=offline_reintegration_exclude + :avocado: tags=offline_reintegration_multiple_pools """ - self.run_offline_reintegration_test(1, data=True) + self.run_offline_reintegration_test(5, data=True) def test_osa_offline_reintegration_server_stop(self): """Test ID: DAOS-6748. @@ -167,20 +169,20 @@ def test_osa_offline_reintegration_server_stop(self): """ self.run_offline_reintegration_test(1, data=True, server_boot=True) - @skipForTicket("DAOS-6505") def test_osa_offline_reintegrate_during_rebuild(self): """Test ID: DAOS-6923 Test Description: Reintegrate rank while rebuild is happening in parallel - :avocado: tags=all,daily_regression,hw,medium,ib2 + :avocado: tags=all,full_regression,hw,medium,ib2 :avocado: tags=osa,offline_reintegration :avocado: tags=offline_reintegrate_during_rebuild """ + self.loop_test_cnt = self.params.get("iterations", + '/run/loop_test/*') self.run_offline_reintegration_test(1, data=True, reint_during_rebuild=True) - @skipForTicket("DAOS-6505") def test_osa_offline_reintegration_oclass(self): """Test ID: DAOS-6923 Test Description: Validate Offline Reintegration @@ -195,7 +197,6 @@ def test_osa_offline_reintegration_oclass(self): server_boot=False, oclass=oclass) - @skipForTicket("DAOS-6505") def test_osa_offline_reintegrate_during_aggregation(self): """Test ID: DAOS-6923 Test Description: Reintegrate rank while aggregation @@ -207,24 +208,3 @@ def test_osa_offline_reintegrate_during_aggregation(self): """ self.run_offline_reintegration_test(1, data=True, reint_during_aggregation=True) - - @skipForTicket("DAOS-6505") - def test_osa_offline_reintegration_multiple_pools(self): - """Test ID: DAOS-6923 - Test Description: Validate Offline Reintegration - with multiple pools - - :avocado: tags=all,hw,medium,ib2,osa,offline_reintegration - :avocado: tags=offline_reintegration_multiple_pools - """ - self.run_offline_reintegration_test(200, data=True) - - def test_osa_offline_reintegration_loop_test(self): - """Test ID: DAOS-6923 - Test Description: Validate Offline Reintegration - with multiple pools - - :avocado: tags=all,hw,medium,ib2,osa,offline_reintegration - :avocado: tags=offline_reintegration_loop_test - """ - self.run_offline_reintegration_test(1, data=True) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.yaml b/src/tests/ftest/osa/osa_offline_reintegration.yaml index a31e002168e..1e16e258983 100644 --- a/src/tests/ftest/osa/osa_offline_reintegration.yaml 
+++ b/src/tests/ftest/osa/osa_offline_reintegration.yaml @@ -5,7 +5,7 @@ hosts: - server-C test_clients: - client-D -timeout: 3600 +timeout: 800 server_config: name: daos_server engines_per_host: 2 @@ -77,12 +77,12 @@ ior: # The values are set to be in the multiples of 10. # Values are appx GB. - [6000000000, 54000000000, 500000, 500000000] - - [6000000000, 54000000000, 1000, 500000000] + - [6000000000, 54000000000, 1000, 5000000] mdtest: api: DFS client_processes: np: 2 - num_of_files_dirs: 100 # creating total of 120K files + num_of_files_dirs: 100 test_dir: "/" iteration: 1 dfs_destroy: False @@ -101,5 +101,6 @@ test_obj_class: - RP_2G8 - RP_3G6 - RP_4G1 + - S1 loop_test: - iterations: 10 + iterations: 3 diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 1f31a089be7..2e12d7aef60 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -37,6 +37,7 @@ class OSAUtils(MdtestBase, IorTestBase): def setUp(self): """Set up for test case.""" super(OSAUtils, self).setUp() + self.pool_cont_dict = {} self.container = None self.obj = None self.ioreq = None @@ -76,25 +77,16 @@ def get_rebuild_status(self): return data["rebuild"]["status"] @fail_on(CommandFailure) - def is_rebuild_done(self, time_interval): + def is_rebuild_done(self, time_interval, + wait_for_rebuild_not_to_complete=False): """Rebuild is completed/done. Args: time_interval: Wait interval between checks - Returns: - False: If rebuild_status not "done" or "completed". - True: If rebuild status is "done" or "completed". + wait_for_rebuild_not_to_complete: Rebuild completed + (Default: False) """ - status = False - fail_count = 0 - completion_flag = ["done", "completed"] - while fail_count <= 20: - rebuild_status = self.get_rebuild_status() - time.sleep(time_interval) - fail_count += 1 - if rebuild_status in completion_flag: - status = True - break - return status + self.pool.wait_for_rebuild(wait_for_rebuild_not_to_complete, + interval=time_interval) @fail_on(CommandFailure) def assert_on_rebuild_failure(self): @@ -219,9 +211,17 @@ def ior_thread(self, pool, oclass, test, flags): self.ior_cmd.set_daos_params(self.server_group, self.pool) self.ior_cmd.dfs_oclass.update(oclass) self.ior_cmd.dfs_dir_oclass.update(oclass) - # Create container only - if self.container is None: + # If pool is not in the dictionary, + # initialize its container as None. + if self.pool not in self.pool_cont_dict: + self.pool_cont_dict[self.pool] = None + # Create container if the pool doesn't have one. + # Otherwise, use the existing container in the pool. 
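        # (Why a per-pool container cache, inferred from the callers:
        # the Read pass after exclude/reintegrate must land on the
        # same container the Write pass populated, otherwise there is
        # nothing left to verify.)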
+ if self.pool_cont_dict[self.pool] is None: self.add_container(self.pool) + self.pool_cont_dict[self.pool] = self.container + else: + self.container = self.pool_cont_dict[self.pool] job_manager = self.get_ior_job_manager_command() job_manager.job.dfs_cont.update(self.container.uuid) self.ior_cmd.transfer_size.update(test[2]) From 68f975321f8dfe2363d63ea2e08e99c4b05746b5 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Thu, 11 Mar 2021 16:06:10 -0500 Subject: [PATCH 12/37] DAOS-6923 test: Added daos cont check support Test-tag-hw-medium: pr,hw,medium,ib2 osa Skip-unit-tests: true Skip-nlt: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-coverity-test: true Skip-func-hw-test-small: true Skip-func-hw-test-medium: true Skip-func-hw-test-large: true Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- .../ftest/osa/osa_offline_reintegration.py | 9 ++++++++ src/tests/ftest/util/daos_utils.py | 22 +++++++++++++++++++ src/tests/ftest/util/daos_utils_base.py | 11 ++++++++++ src/tests/ftest/util/osa_utils.py | 2 +- 4 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index bad272f8bee..ab040efcba0 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -7,6 +7,7 @@ import random import time from osa_utils import OSAUtils +from daos_utils import DaosCommand from test_utils_pool import TestPool from write_host_file import write_host_file from apricot import skipForTicket @@ -24,6 +25,7 @@ def setUp(self): """Set up for test case.""" super(OSAOfflineReintegration, self).setUp() self.dmg_command = self.get_dmg_command() + self.daos_command = DaosCommand(self.bin) self.ior_test_sequence = self.params.get("ior_test_sequence", '/run/ior/iorflags/*') self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*') @@ -143,11 +145,18 @@ def run_offline_reintegration_test(self, num_pool, data=False, self.pool = pool[random_pool] self.pool.display_pool_daos_space(display_string) + # Finally, check whether the written data can be accessed. + # Also, run the daos cont check (for object integrity) for val in range(0, num_pool): self.pool = pool[val] if data: self.run_ior_thread("Read", oclass, test_seq) self.run_mdtest_thread() + self.container = self.pool_cont_dict[self.pool] + #kwargs = {"pool": self.pool.uuid, + # "cont": self.container.uuid} + #output = self.daos_command.container_check(**kwargs) + #self.log.info(output) def test_osa_offline_reintegration_multiple_pools(self): """Test ID: DAOS-6923 diff --git a/src/tests/ftest/util/daos_utils.py b/src/tests/ftest/util/daos_utils.py index 7d575170112..94166846313 100644 --- a/src/tests/ftest/util/daos_utils.py +++ b/src/tests/ftest/util/daos_utils.py @@ -131,6 +131,28 @@ def container_destroy(self, pool, cont, force=None, sys_name=None): ("container", "destroy"), pool=pool, sys_name=sys_name, cont=cont, force=force) + def container_check(self, pool, cont, sys_name=None, path=None): + """Check the integrity of container objects. + + Args: + pool (str): UUID of the pool that contains the container. + cont (str): container UUID. + sys_name (str, optional): DAOS system name context for servers. + Defaults to None. + path (str, optional): Container namespace path. Defaults to None. + + Returns: + CmdResult: Object that contains exit status, stdout, and other + information. + + Raises: + CommandFailure: if the daos container check command fails.
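For reference, the call pattern this helper supports, and which the commented-out block in the reintegration hunk above sketches, would look roughly like the following once enabled. This is a hedged sketch, not part of the patch: it assumes the self.daos_command = DaosCommand(self.bin) wiring from setUp() and a container tracked per pool, and the generated command line is approximately `daos container check --pool=<uuid> --cont=<uuid>`:

    # Sketch: mirrors the commented-out block above (later enabled in
    # this series). Assumes self.daos_command from setUp() and a
    # TestContainer tracked in self.pool_cont_dict.
    kwargs = {"pool": self.pool.uuid,       # pool UUID string
              "cont": self.container.uuid}  # container UUID string
    output = self.daos_command.container_check(**kwargs)
    self.log.info(output)  # CmdResult with exit status and stdout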
+ + """ + return self._get_result( + ("container", "check"), pool=pool, cont=cont, + sys_name=sys_name, path=path) + def container_get_acl(self, pool, cont, verbose=False, outfile=None): """Get the ACL for a given container. diff --git a/src/tests/ftest/util/daos_utils_base.py b/src/tests/ftest/util/daos_utils_base.py index bc010cb12f3..2bccaa05e9c 100644 --- a/src/tests/ftest/util/daos_utils_base.py +++ b/src/tests/ftest/util/daos_utils_base.py @@ -148,6 +148,8 @@ def get_sub_command_class(self): self.sub_command_class = self.CreateSubCommand() elif self.sub_command.value == "destroy": self.sub_command_class = self.DestroySubCommand() + elif self.sub_command.value == "check": + self.sub_command_class = self.CheckSubCommand() elif self.sub_command.value == "list-objects": self.sub_command_class = self.ListObjectsSubCommand() elif self.sub_command.value == "query": @@ -273,6 +275,15 @@ def __init__(self): DaosCommandBase.ContainerSubCommand.QuerySubCommand, self).__init__("query") + class CheckSubCommand(CommonContainerSubCommand): + """Defines an object for the daos container check command.""" + + def __init__(self): + """Create a daos container check command object.""" + super( + DaosCommandBase.ContainerSubCommand.CheckSubCommand, + self).__init__("check") + class GetAclSubCommand(CommonContainerSubCommand): """Defines an object for the daos container get-acl command.""" diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 2e12d7aef60..e5c8bd9b7a1 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -211,7 +211,7 @@ def ior_thread(self, pool, oclass, test, flags): self.ior_cmd.set_daos_params(self.server_group, self.pool) self.ior_cmd.dfs_oclass.update(oclass) self.ior_cmd.dfs_dir_oclass.update(oclass) - # If pool is not in the dictionary, + # If pool is not in the dictionary, # initialize its container as None. if self.pool not in self.pool_cont_dict: self.pool_cont_dict[self.pool] = None From 85434c393fbce0ad18843fd859bd1428310c1a87 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Thu, 11 Mar 2021 16:17:29 -0500 Subject: [PATCH 13/37] DAOS-6923 test: Merge with master, minor change. Test-tag-hw-medium: pr,hw,medium,ib2 osa Skip-unit-tests: true Skip-nlt: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-coverity-test: true Skip-func-hw-test-small: true Skip-func-hw-test-medium: true Skip-func-hw-test-large: true Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/util/osa_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index e5c8bd9b7a1..5c808224500 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -78,14 +78,14 @@ def get_rebuild_status(self): @fail_on(CommandFailure) def is_rebuild_done(self, time_interval, - wait_for_rebuild_not_to_complete=False): + wait_for_rebuild_to_complete=False): """Rebuild is completed/done. 
Args: time_interval: Wait interval between checks - wait_for_rebuild_not_to_complete: Rebuild completed - (Default: False) + wait_for_rebuild_to_complete: Rebuild completed + (Default: False) """ - self.pool.wait_for_rebuild(wait_for_rebuild_not_to_complete, + self.pool.wait_for_rebuild(wait_for_rebuild_to_complete, interval=time_interval) @fail_on(CommandFailure) From b07b5819522b26e303279e44f721285f373dad1d Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Sun, 14 Mar 2021 19:28:31 -0400 Subject: [PATCH 14/37] DAOS-6923 test: Code review script changes. Test-tag-hw-medium: pr,hw,medium,ib2 osa Skip-unit-tests: true Skip-nlt: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-coverity-test: true Skip-func-hw-test-small: true Skip-func-hw-test-medium: true Skip-func-hw-test-large: true Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_drain.py | 20 ++---- src/tests/ftest/osa/osa_offline_drain.yaml | 3 + .../ftest/osa/osa_offline_reintegration.py | 70 ++++++++----------- .../ftest/osa/osa_offline_reintegration.yaml | 5 +- src/tests/ftest/util/osa_utils.py | 62 ++++++++++++++-- 5 files changed, 99 insertions(+), 61 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_drain.py b/src/tests/ftest/osa/osa_offline_drain.py index 746200e9595..0e2aca0e1d2 100644 --- a/src/tests/ftest/osa/osa_offline_drain.py +++ b/src/tests/ftest/osa/osa_offline_drain.py @@ -30,15 +30,13 @@ def setUp(self): self.hostlist_clients, self.workdir, None) def run_offline_drain_test(self, num_pool, data=False, - oclass=None, drain_during_aggregation=False): + oclass=None): """Run the offline drain without data. Args: num_pool (int) : total pools to create for testing purposes. data (bool) : whether pool has no data or to create some data in pool. Defaults to False. 
oclass (str): DAOS object class (eg: RP_2G1,etc) - drain_during_aggregation (bool) : Perform drain and aggregation - in parallel """ # Create a pool pool = {} pool_uuid = [] @@ -66,11 +64,8 @@ def run_offline_drain_test(self, num_pool, data=False, num_pool) pool[val].create() self.pool = pool[val] - if drain_during_aggregation is True: - test_seq = self.ior_test_sequence[1] - self.pool.set_property("reclaim", "disabled") - else: - test_seq = self.ior_test_sequence[0] + self.pool.set_property("reclaim", "disabled") + test_seq = self.ior_test_sequence[0] if data: self.run_ior_thread("Write", oclass, test_seq) @@ -83,14 +78,13 @@ def run_offline_drain_test(self, num_pool, data=False, self.pool.display_pool_daos_space("Pool space: Beginning") pver_begin = self.get_pool_version() self.log.info("Pool Version at the beginning %s", pver_begin) - if drain_during_aggregation is True: + if self.test_during_aggregation is True: self.pool.set_property("reclaim", "time") - time.sleep(90) + self.delete_extra_container(self.pool) + self.simple_exclude_reintegrate_loop(rank) output = self.dmg_command.pool_drain(self.pool.uuid, rank, t_string) - self.log.info(output) - self.is_rebuild_done(3) - self.assert_on_rebuild_failure() + self.print_and_assert_on_rebuild_failure(output) pver_drain = self.get_pool_version() self.log.info("Pool Version after drain %d", pver_drain) diff --git a/src/tests/ftest/osa/osa_offline_drain.yaml index d8c6a1a52bd..1acf88f6c19 100644 --- a/src/tests/ftest/osa/osa_offline_drain.yaml +++ b/src/tests/ftest/osa/osa_offline_drain.yaml @@ -105,3 +105,6 @@ test_obj_class: - RP_2G8 - RP_3G6 - RP_4G1 +aggregation: + test_with_aggregation: True + diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index ab040efcba0..04f2fc939d4 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -36,9 +36,7 @@ def setUp(self): self.dmg_command.exit_status_exception = True def run_offline_reintegration_test(self, num_pool, data=False, - server_boot=False, oclass=None, - reint_during_rebuild=False, - reint_during_aggregation=False): + server_boot=False, oclass=None): """Run the offline reintegration without data. Args: num_pool (int) : total pools to create for testing purposes. data (bool) : whether pool has no data or to create some data in pool. Defaults to False. server_boot (bool) : Perform system stop/start on a rank. Defaults to False. oclass (str) : daos object class string (eg: "RP_2G8") - reint_during_rebuild (bool) : Perform reintegration during - rebuild (Defaults to False). - reint_during_aggregation (bool) : Perform reintegration - during aggregation - (Defaults to False).
""" # Create a pool pool = {} @@ -67,14 +60,13 @@ def run_offline_reintegration_test(self, num_pool, data=False, pool[val].get_params(self) pool[val].create() self.pool = pool[val] - if reint_during_aggregation is True: - test_seq = self.ior_test_sequence[1] - self.pool.set_property("reclaim", "disabled") - else: - test_seq = self.ior_test_sequence[0] + self.pool.set_property("reclaim", "disabled") + test_seq = self.ior_test_sequence[0] if data: self.run_ior_thread("Write", oclass, test_seq) self.run_mdtest_thread() + if self.test_during_aggregation is True: + self.run_ior_thread("Write", oclass, test_seq) # Exclude all the ranks random_pool = random.randint(0, (num_pool-1)) @@ -85,33 +77,26 @@ def run_offline_reintegration_test(self, num_pool, data=False, pver_begin = self.get_pool_version() self.log.info("Pool Version at the beginning %s", pver_begin) if server_boot is False: - if (reint_during_rebuild is True and val == 0): + if (self.test_during_rebuild is True and val == 0): # Exclude rank 5 output = self.dmg_command.pool_exclude(self.pool.uuid, "5") - self.log.info(output) - self.is_rebuild_done(3) - self.assert_on_rebuild_failure() - if reint_during_aggregation is True: - self.pool.set_property("reclaim", "time") - time.sleep(90) + self.print_and_assert_on_rebuild_failure(output) + if self.test_during_aggregation is True: + self.delete_extra_container(self.pool) + self.simple_exclude_reintegrate_loop(rank[val]) output = self.dmg_command.pool_exclude(self.pool.uuid, rank[val]) else: output = self.dmg_command.system_stop(ranks=rank[val]) - self.log.info(output) - self.is_rebuild_done(3) - self.assert_on_rebuild_failure() + self.print_and_assert_on_rebuild_failure(output) output = self.dmg_command.system_start(ranks=rank[val]) # Just try to reintegrate rank 5 - if (reint_during_rebuild is True and val == 2): - # Exclude rank 5 - time.sleep(3) + if (self.test_during_rebuild is True and val == 2): + # Reintegrate rank 5 output = self.dmg_command.pool_reintegrate(self.pool.uuid, "5") - self.log.info(output) - self.is_rebuild_done(3) - self.assert_on_rebuild_failure() + self.print_and_assert_on_rebuild_failure(output) pver_exclude = self.get_pool_version() self.log.info("Pool Version after exclude %s", pver_exclude) @@ -123,7 +108,6 @@ def run_offline_reintegration_test(self, num_pool, data=False, # Reintegrate the ranks which was excluded for val, _ in enumerate(rank): - time.sleep(5) if (val == 2 and "RP_2G" in oclass): output = self.dmg_command.pool_reintegrate(self.pool.uuid, rank[val], @@ -131,9 +115,7 @@ def run_offline_reintegration_test(self, num_pool, data=False, else: output = self.dmg_command.pool_reintegrate(self.pool.uuid, rank[val]) - self.log.info(output) - self.is_rebuild_done(3) - self.assert_on_rebuild_failure() + self.print_and_assert_on_rebuild_failure(output) pver_reint = self.get_pool_version() self.log.info("Pool Version after reintegrate %d", pver_reint) @@ -152,11 +134,12 @@ def run_offline_reintegration_test(self, num_pool, data=False, if data: self.run_ior_thread("Read", oclass, test_seq) self.run_mdtest_thread() - self.container = self.pool_cont_dict[self.pool] - #kwargs = {"pool": self.pool.uuid, - # "cont": self.container.uuid} - #output = self.daos_command.container_check(**kwargs) - #self.log.info(output) + if self.test_during_rebuild is True: + self.container = self.pool_cont_dict[self.pool] + kwargs = {"pool": self.pool.uuid, + "cont": self.container.uuid} + output = self.daos_command.container_check(**kwargs) + self.log.info(output) def 
test_osa_offline_reintegration_multiple_pools(self): """Test ID: DAOS-6923 @@ -178,6 +161,7 @@ def test_osa_offline_reintegration_server_stop(self): """ self.run_offline_reintegration_test(1, data=True, server_boot=True) + @skipForTicket("DAOS-7013") def test_osa_offline_reintegrate_during_rebuild(self): """Test ID: DAOS-6923 Test Description: Reintegrate rank while rebuild @@ -189,8 +173,9 @@ def test_osa_offline_reintegrate_during_rebuild(self): """ self.loop_test_cnt = self.params.get("iterations", '/run/loop_test/*') - self.run_offline_reintegration_test(1, data=True, - reint_during_rebuild=True) + self.test_during_rebuild = self.params.get("test_with_rebuild", + '/run/rebuild/*') + self.run_offline_reintegration_test(1, data=True) def test_osa_offline_reintegration_oclass(self): """Test ID: DAOS-6923 @@ -215,5 +200,6 @@ def test_osa_offline_reintegrate_during_aggregation(self): :avocado: tags=osa,offline_reintegration :avocado: tags=offline_reintegrate_during_aggregation """ - self.run_offline_reintegration_test(1, data=True, - reint_during_aggregation=True) + self.test_during_aggregation = self.params.get("test_with_aggregation", + '/run/aggregation/*') + self.run_offline_reintegration_test(1, data=True) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.yaml b/src/tests/ftest/osa/osa_offline_reintegration.yaml index 1e16e258983..fe52612e1b7 100644 --- a/src/tests/ftest/osa/osa_offline_reintegration.yaml +++ b/src/tests/ftest/osa/osa_offline_reintegration.yaml @@ -77,7 +77,6 @@ ior: # The values are set to be in the multiples of 10. # Values are appx GB. - [6000000000, 54000000000, 500000, 500000000] - - [6000000000, 54000000000, 1000, 5000000] mdtest: api: DFS client_processes: @@ -104,3 +103,7 @@ test_obj_class: - S1 loop_test: iterations: 3 +aggregation: + test_with_aggregation: True +rebuild: + test_with_rebuild: True diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 5c808224500..ff3f31c19eb 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -53,6 +53,8 @@ def setUp(self): self.ior_r_flags = self.params.get("read_flags", '/run/ior/iorflags/*') self.out_queue = test_queue.Queue() self.dmg_command.exit_status_exception = False + self.test_during_aggregation = False + self.test_during_rebuild = False @fail_on(CommandFailure) def get_pool_leader(self): @@ -99,6 +101,15 @@ def assert_on_rebuild_failure(self): self.assertTrue(rebuild_status not in rebuild_failed_string, "Rebuild failed") + @fail_on(CommandFailure) + def print_and_assert_on_rebuild_failure(self, out, timeout=3): + """Print the out value (daos, dmg, etc) and check for rebuild + completion. If not, raise assert. + """ + self.log.info(out) + self.is_rebuild_done(timeout) + self.assert_on_rebuild_failure() + @fail_on(CommandFailure) def get_pool_version(self): """Get the pool version. @@ -110,6 +121,21 @@ def get_pool_version(self): data = self.dmg_command.pool_query(self.pool.uuid) return int(data["version"]) + def simple_exclude_reintegrate_loop(self, rank, loop_time=100): + """This method performs exclude and reintegration on a rank, + for a certain amount of time. 
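One caveat about the loop body that follows in the next hunk: start_time and finish_time are both initialised to 0, so the check `while (int(finish_time - start_time) > loop_time):` is false on its first evaluation and the exclude/reintegrate cycle never executes (the later checkpatch cleanup in this series keeps the same comparison). Presumably the comparison was meant to be `<`, with finish_time refreshed after each pass. A corrected sketch under that assumption, using only helpers already defined in osa_utils.py (time is already imported there):

    def simple_exclude_reintegrate_loop(self, rank, loop_time=100):
        """Exclude and reintegrate a rank repeatedly until loop_time elapses."""
        start_time = time.time()
        finish_time = start_time
        while int(finish_time - start_time) < loop_time:
            # Cycle the rank out and wait for rebuild to settle.
            output = self.dmg_command.pool_exclude(self.pool.uuid, rank)
            self.print_and_assert_on_rebuild_failure(output)
            # Bring the rank back in and wait again.
            output = self.dmg_command.pool_reintegrate(self.pool.uuid, rank)
            self.print_and_assert_on_rebuild_failure(output)
            finish_time = time.time()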
+ """ + start_time = 0 + finish_time = 0 + while (int(finish_time - start_time) > loop_time): + start_time = time.time() + output = self.dmg_command.pool_exclude(self.pool.uuid, + rank) + self.print_and_assert_on_rebuild_failure(output) + output = self.dmg_command.pool_reintegrate(self.pool.uuid, + rank) + self.print_and_assert_on_rebuild_failure(output) + @fail_on(DaosApiError) def write_single_object(self): """Write some data to the existing pool.""" @@ -170,6 +196,16 @@ def verify_single_object(self): self.obj.close() self.container.close() + def delete_extra_container(self, pool): + """Delete the extra container in the pool. + Args: + pool (object): pool handle + """ + self.pool.set_property("reclaim", "time") + extra_container = self.pool_cont_dict[pool][2] + extra_container.destroy() + self.pool_cont_dict[pool][3] = None + def run_ior_thread(self, action, oclass, test): """Start the IOR thread for either writing or reading data to/from a container. @@ -211,17 +247,33 @@ def ior_thread(self, pool, oclass, test, flags): self.ior_cmd.set_daos_params(self.server_group, self.pool) self.ior_cmd.dfs_oclass.update(oclass) self.ior_cmd.dfs_dir_oclass.update(oclass) + self.log.info(self.pool_cont_dict) # If pool is not in the dictionary, - # initialize its container as None. + # initialize its container list to None + # {poolA : [None, None], [None, None]} if self.pool not in self.pool_cont_dict: - self.pool_cont_dict[self.pool] = None + self.pool_cont_dict[self.pool] = [None] * 4 # Create container if the pool doesn't have one. # Otherwise, use the existing container in the pool. - if self.pool_cont_dict[self.pool] is None: + # pool_cont_dict {pool A: [containerA, Updated, + # containerB, Updated], + # pool B : containerA, Updated, + # containerB, None]} + if self.pool_cont_dict[self.pool][0] is None: self.add_container(self.pool) - self.pool_cont_dict[self.pool] = self.container + self.pool_cont_dict[self.pool][0] = self.container + self.pool_cont_dict[self.pool][1] = "Updated" else: - self.container = self.pool_cont_dict[self.pool] + if ((self.test_during_aggregation is True) and + (self.pool_cont_dict[self.pool][1] == "Updated") and + (self.pool_cont_dict[self.pool][3] is None) and + ("-w" in flags)): + # Write to the second container + self.add_container(self.pool) + self.pool_cont_dict[self.pool][2] = self.container + self.pool_cont_dict[self.pool][3] = "Updated" + else: + self.container = self.pool_cont_dict[self.pool][0] job_manager = self.get_ior_job_manager_command() job_manager.job.dfs_cont.update(self.container.uuid) self.ior_cmd.transfer_size.update(test[2]) From 82980768216783c6a43eff1f614136c6c4cef6b3 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Sun, 14 Mar 2021 19:37:23 -0400 Subject: [PATCH 15/37] DAOS-6923 test: Fix minor checkpatch issues. 
Test-tag-hw-medium: pr,hw,medium,ib2 osa Skip-unit-tests: true Skip-nlt: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-coverity-test: true Skip-func-hw-test-small: true Skip-func-hw-test-medium: true Skip-func-hw-test-large: true Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_drain.py | 1 - src/tests/ftest/osa/osa_offline_drain.yaml | 1 - src/tests/ftest/osa/osa_offline_reintegration.py | 1 - src/tests/ftest/util/osa_utils.py | 2 +- 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_drain.py b/src/tests/ftest/osa/osa_offline_drain.py index 0e2aca0e1d2..ec4bca6eb5e 100644 --- a/src/tests/ftest/osa/osa_offline_drain.py +++ b/src/tests/ftest/osa/osa_offline_drain.py @@ -5,7 +5,6 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ import random -import time from osa_utils import OSAUtils from test_utils_pool import TestPool from write_host_file import write_host_file diff --git a/src/tests/ftest/osa/osa_offline_drain.yaml b/src/tests/ftest/osa/osa_offline_drain.yaml index 1acf88f6c19..2d8cde44a73 100644 --- a/src/tests/ftest/osa/osa_offline_drain.yaml +++ b/src/tests/ftest/osa/osa_offline_drain.yaml @@ -81,7 +81,6 @@ ior: # The values are set to be in the multiples of 10. # Values are appx GB. - [6000000000, 54000000000, 500000, 500000000] - - [6000000000, 54000000000, 1000, 500000000] mdtest: api: DFS client_processes: diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 04f2fc939d4..f410bcbceb9 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -5,7 +5,6 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ import random -import time from osa_utils import OSAUtils from daos_utils import DaosCommand from test_utils_pool import TestPool diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index ff3f31c19eb..cdc8c57db7a 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -127,7 +127,7 @@ def simple_exclude_reintegrate_loop(self, rank, loop_time=100): """ start_time = 0 finish_time = 0 - while (int(finish_time - start_time) > loop_time): + while int(finish_time - start_time) > loop_time: start_time = time.time() output = self.dmg_command.pool_exclude(self.pool.uuid, rank) From 5ac7f7571805bb5537b2041fcb488352307fdc03 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Tue, 16 Mar 2021 17:36:20 -0400 Subject: [PATCH 16/37] DAOS-6923 test: Update the container class Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- .../ftest/osa/osa_offline_reintegration.py | 5 ++- src/tests/ftest/util/osa_utils.py | 36 ++++++++++++++++--- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index f410bcbceb9..b54b268c9a0 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -9,7 +9,6 @@ from daos_utils import DaosCommand from test_utils_pool import TestPool from write_host_file import write_host_file -from apricot import skipForTicket class OSAOfflineReintegration(OSAUtils): @@ -87,7 +86,8 @@ def run_offline_reintegration_test(self, num_pool, data=False, output = self.dmg_command.pool_exclude(self.pool.uuid, rank[val]) else: - output = 
self.dmg_command.system_stop(ranks=rank[val]) + output = self.dmg_command.system_stop(ranks=rank[val], + force=True) self.print_and_assert_on_rebuild_failure(output) output = self.dmg_command.system_start(ranks=rank[val]) # Just try to reintegrate rank 5 @@ -160,7 +160,6 @@ def test_osa_offline_reintegration_server_stop(self): """ self.run_offline_reintegration_test(1, data=True, server_boot=True) - @skipForTicket("DAOS-7013") def test_osa_offline_reintegrate_during_rebuild(self): """Test ID: DAOS-6923 Test Description: Reintegrate rank while rebuild diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index cdc8c57db7a..75d2ef116e2 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -7,6 +7,7 @@ import ctypes import time import threading +import re from avocado import fail_on from ior_test_base import IorTestBase @@ -206,6 +207,25 @@ def delete_extra_container(self, pool): extra_container.destroy() self.pool_cont_dict[pool][3] = None + def set_cont_class_properties(self, cont, oclass="S1"): + """Update the container class to match the IOR object + class. Also, remove the redundancy factor for S type + object class. + Args: + cont (object): TestContainer object + oclass (str, optional): Container object class to be set. + Defaults to "S1". + """ + self.container.oclass.value = oclass + # Set the container properties properly for S!, S2 class. + # rf should not be set to 1 for S type object class. + + x = re.search("^S\\d$", oclass) + if x is not None: + prop = self.container.properties.value + prop = prop.replace("rf:1", "rf:0") + self.container.properties.value = prop + def run_ior_thread(self, action, oclass, test): """Start the IOR thread for either writing or reading data to/from a container. @@ -237,7 +257,7 @@ def ior_thread(self, pool, oclass, test, flags): Args: pool (object): pool handle - oclass (str): IOR object class + oclass (str): IOR object class, container class. 
test (list): IOR test sequence flags (str): IOR flags @@ -247,6 +267,7 @@ def ior_thread(self, pool, oclass, test, flags): self.ior_cmd.set_daos_params(self.server_group, self.pool) self.ior_cmd.dfs_oclass.update(oclass) self.ior_cmd.dfs_dir_oclass.update(oclass) + self.log.info(self.pool_cont_dict) # If pool is not in the dictionary, # initialize its container list to None @@ -260,7 +281,9 @@ def ior_thread(self, pool, oclass, test, flags): # pool B : containerA, Updated, # containerB, None]} if self.pool_cont_dict[self.pool][0] is None: - self.add_container(self.pool) + self.add_container(self.pool, create=False) + self.set_cont_class_properties(self.container, oclass) + self.container.create() self.pool_cont_dict[self.pool][0] = self.container self.pool_cont_dict[self.pool][1] = "Updated" else: @@ -269,7 +292,9 @@ def ior_thread(self, pool, oclass, test, flags): (self.pool_cont_dict[self.pool][3] is None) and ("-w" in flags)): # Write to the second container - self.add_container(self.pool) + self.add_container(self.pool, create=False) + self.set_cont_class_properties(self.container, oclass) + self.container.create() self.pool_cont_dict[self.pool][2] = self.container self.pool_cont_dict[self.pool][3] = "Updated" else: @@ -287,7 +312,10 @@ def run_mdtest_thread(self): # Create container only self.mdtest_cmd.dfs_destroy = False if self.container is None: - self.add_container(self.pool) + self.add_container(self.pool, create=False) + self.set_cont_class_properties(self.container, + self.mdtest_cmd.dfs_oclass) + self.container.create() job_manager = self.get_mdtest_job_manager_command(self.manager) job_manager.job.dfs_cont.update(self.container.uuid) # Add a thread for these IOR arguments From fecfde79c15757b23f0479c4a76d981c051e651b Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Tue, 16 Mar 2021 17:48:19 -0400 Subject: [PATCH 17/37] DAOS-6923 test: Fix checkpatch issues. Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/util/osa_utils.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 75d2ef116e2..8bc6a6a01db 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -13,9 +13,6 @@ from ior_test_base import IorTestBase from mdtest_test_base import MdtestBase from command_utils import CommandFailure -from ior_utils import IorCommand -from job_manager_utils import Mpirun -from mpio_utils import MpioUtils from pydaos.raw import (DaosContainer, IORequest, DaosObj, DaosApiError) @@ -207,19 +204,17 @@ def delete_extra_container(self, pool): extra_container.destroy() self.pool_cont_dict[pool][3] = None - def set_cont_class_properties(self, cont, oclass="S1"): + def set_cont_class_properties(self, oclass="S1"): """Update the container class to match the IOR object class. Also, remove the redundancy factor for S type object class. Args: - cont (object): TestContainer object oclass (str, optional): Container object class to be set. Defaults to "S1". """ self.container.oclass.value = oclass # Set the container properties properly for S!, S2 class. # rf should not be set to 1 for S type object class. 
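To make the regular expression just below concrete: `^S\d$` matches only the single-digit S object classes (S1, S2, ...), and for those the redundancy-factor token in the container property string is downgraded, since single-shard objects carry no replication. A standalone sketch; the property string here is hypothetical, chosen only to show the substitution:

    import re

    def drop_rf_for_s_class(properties, oclass):
        # S-class objects have no redundancy, so a redundancy factor of 1
        # in the container properties must be lowered to 0.
        if re.search(r"^S\d$", oclass) is not None:
            return properties.replace("rf:1", "rf:0")
        return properties

    print(drop_rf_for_s_class("cksum:crc16,rf:1", "S1"))      # cksum:crc16,rf:0
    print(drop_rf_for_s_class("cksum:crc16,rf:1", "RP_2G1"))  # unchanged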
- x = re.search("^S\\d$", oclass) if x is not None: prop = self.container.properties.value @@ -267,7 +262,7 @@ def ior_thread(self, pool, oclass, test, flags): self.ior_cmd.set_daos_params(self.server_group, self.pool) self.ior_cmd.dfs_oclass.update(oclass) self.ior_cmd.dfs_dir_oclass.update(oclass) - + self.log.info(self.pool_cont_dict) # If pool is not in the dictionary, # initialize its container list to None @@ -282,7 +277,7 @@ def ior_thread(self, pool, oclass, test, flags): # containerB, None]} if self.pool_cont_dict[self.pool][0] is None: self.add_container(self.pool, create=False) - self.set_cont_class_properties(self.container, oclass) + self.set_cont_class_properties(oclass) self.container.create() self.pool_cont_dict[self.pool][0] = self.container self.pool_cont_dict[self.pool][1] = "Updated" @@ -293,7 +288,7 @@ def ior_thread(self, pool, oclass, test, flags): ("-w" in flags)): # Write to the second container self.add_container(self.pool, create=False) - self.set_cont_class_properties(self.container, oclass) + self.set_cont_class_properties(oclass) self.container.create() self.pool_cont_dict[self.pool][2] = self.container self.pool_cont_dict[self.pool][3] = "Updated" @@ -313,8 +308,7 @@ def run_mdtest_thread(self): self.mdtest_cmd.dfs_destroy = False if self.container is None: self.add_container(self.pool, create=False) - self.set_cont_class_properties(self.container, - self.mdtest_cmd.dfs_oclass) + self.set_cont_class_properties(self.mdtest_cmd.dfs_oclass) self.container.create() job_manager = self.get_mdtest_job_manager_command(self.manager) job_manager.job.dfs_cont.update(self.container.uuid) From b507f47467b612d27835324a570360bed8b01173 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Wed, 17 Mar 2021 14:04:54 -0400 Subject: [PATCH 18/37] DAOS-6923 test: Support single/multiple containers Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/util/osa_utils.py | 90 ++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 32 deletions(-) diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 8bc6a6a01db..37e7860aaf6 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -194,8 +194,56 @@ def verify_single_object(self): self.obj.close() self.container.close() + def prepare_cont_ior_write_read(self, oclass, flags): + """This method prepares the containers for + IOR write and read invocations. + To enable aggregation: + - Create two containers and read always from + first container + Normal usage (use only a single container): + - Create a single container and use the same. + Args: + oclass (str): IOR object class + flags (str): IOR flags + """ + self.log.info(self.pool_cont_dict) + # If pool is not in the dictionary, + # initialize its container list to None + # {poolA : [None, None], [None, None]} + if self.pool not in self.pool_cont_dict: + self.pool_cont_dict[self.pool] = [None] * 4 + # Create container if the pool doesn't have one. + # Otherwise, use the existing container in the pool. 
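The comment block that opens the next hunk compresses this bookkeeping; spelled out, the four slots kept per pool by prepare_cont_ior_write_read() evolve roughly as follows (a sketch only; the container names are hypothetical):

    # pool_cont_dict value layout:
    #   [first_cont, first_state, second_cont, second_state]
    state = {}
    state["poolA"] = [None] * 4                 # fresh pool, no containers yet
    state["poolA"][0:2] = ["contA", "Updated"]  # after the first write
    # Aggregation tests only: a second write lands in a second container,
    # while reads keep coming from slot 0 (contA).
    state["poolA"][2:4] = ["contB", "Updated"]
    # delete_extra_container(pool) later destroys slot 2 and clears slot 3,
    # which is what lets aggregation reclaim the freed space.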
+ # pool_cont_dict {pool A: [containerA, Updated, + # containerB, Updated], + # pool B : containerA, Updated, + # containerB, None]} + if self.pool_cont_dict[self.pool][0] is None: + self.add_container(self.pool, create=False) + self.set_cont_class_properties(oclass) + self.container.create() + self.pool_cont_dict[self.pool][0] = self.container + self.pool_cont_dict[self.pool][1] = "Updated" + else: + if ((self.test_during_aggregation is True) and + (self.pool_cont_dict[self.pool][1] == "Updated") and + (self.pool_cont_dict[self.pool][3] is None) and + ("-w" in flags)): + # Write to the second container + self.add_container(self.pool, create=False) + self.set_cont_class_properties(oclass) + self.container.create() + self.pool_cont_dict[self.pool][2] = self.container + self.pool_cont_dict[self.pool][3] = "Updated" + else: + self.container = self.pool_cont_dict[self.pool][0] + + def delete_extra_container(self, pool): """Delete the extra container in the pool. + Refer prepare_cont_ior_write_read. This method + should be called when OSA tests intend to + enable aggregation. Args: pool (object): pool handle """ @@ -247,7 +295,8 @@ def run_ior_thread(self, action, oclass, test): # Wait for the thread to finish process.join() - def ior_thread(self, pool, oclass, test, flags): + def ior_thread(self, pool, oclass, test, flags, + single_cont_read=True): """Start threads and wait until all threads are finished. Args: @@ -262,38 +311,15 @@ def ior_thread(self, pool, oclass, test, flags): self.ior_cmd.set_daos_params(self.server_group, self.pool) self.ior_cmd.dfs_oclass.update(oclass) self.ior_cmd.dfs_dir_oclass.update(oclass) - - self.log.info(self.pool_cont_dict) - # If pool is not in the dictionary, - # initialize its container list to None - # {poolA : [None, None], [None, None]} - if self.pool not in self.pool_cont_dict: - self.pool_cont_dict[self.pool] = [None] * 4 - # Create container if the pool doesn't have one. - # Otherwise, use the existing container in the pool. - # pool_cont_dict {pool A: [containerA, Updated, - # containerB, Updated], - # pool B : containerA, Updated, - # containerB, None]} - if self.pool_cont_dict[self.pool][0] is None: - self.add_container(self.pool, create=False) - self.set_cont_class_properties(oclass) - self.container.create() - self.pool_cont_dict[self.pool][0] = self.container - self.pool_cont_dict[self.pool][1] = "Updated" + if single_cont_read is True and self.container is None: + # Prepare the containers created and use in a specific + # way defined in prepare_cont_ior_write. + self.prepare_cont_ior_write_read(oclass, flags) + elif single_cont_read is False and self.container is not None: + # Here self.container is having actual value. Just use it. 
+ self.log.info(self.container) else: - if ((self.test_during_aggregation is True) and - (self.pool_cont_dict[self.pool][1] == "Updated") and - (self.pool_cont_dict[self.pool][3] is None) and - ("-w" in flags)): - # Write to the second container - self.add_container(self.pool, create=False) - self.set_cont_class_properties(oclass) - self.container.create() - self.pool_cont_dict[self.pool][2] = self.container - self.pool_cont_dict[self.pool][3] = "Updated" - else: - self.container = self.pool_cont_dict[self.pool][0] + self.fail("Not supported option on ior_thread") job_manager = self.get_ior_job_manager_command() job_manager.job.dfs_cont.update(self.container.uuid) self.ior_cmd.transfer_size.update(test[2]) From f4b3a4353dd4989ff312e6e51230b7e08086880c Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Thu, 18 Mar 2021 18:24:31 -0400 Subject: [PATCH 19/37] DAOS-6923 test: Minor changes to osa_utils.py Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/util/osa_utils.py | 38 ++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 37e7860aaf6..ad79e80cc3b 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -118,6 +118,14 @@ def get_pool_version(self): """ data = self.dmg_command.pool_query(self.pool.uuid) return int(data["version"]) + + def set_container(self, container): + """Set the OSA utils container object. + Args: + container (obj) : Container object to be used + within OSA utils. + """ + self.container = container def simple_exclude_reintegrate_loop(self, rank, loop_time=100): """This method performs exclude and reintegration on a rank, @@ -238,7 +246,6 @@ def prepare_cont_ior_write_read(self, oclass, flags): else: self.container = self.pool_cont_dict[self.pool][0] - def delete_extra_container(self, pool): """Delete the extra container in the pool. Refer prepare_cont_ior_write_read. This method @@ -269,7 +276,9 @@ def set_cont_class_properties(self, oclass="S1"): prop = prop.replace("rf:1", "rf:0") self.container.properties.value = prop - def run_ior_thread(self, action, oclass, test): + def run_ior_thread(self, action, oclass, test, + single_cont_read=True, + fail_on_warning=True): """Start the IOR thread for either writing or reading data to/from a container. Args: @@ -278,6 +287,12 @@ def run_ior_thread(self, action, oclass, test): oclass (str): IOR object class test (list): IOR test sequence flags (str): IOR flags + single_cont_read (bool) : Always read from the + 1st container. + Defaults to True. + fail_on_warning (bool) : Test terminates + for IOR warnings. + Defaults to True. """ if action == "Write": flags = self.ior_w_flags @@ -289,14 +304,19 @@ def run_ior_thread(self, action, oclass, test): kwargs={"pool": self.pool, "oclass": oclass, "test": test, - "flags": flags}) + "flags": flags, + "single_cont_read": + single_cont_read, + "fail_on_warning": + fail_on_warning}) # Launch the IOR thread process.start() # Wait for the thread to finish process.join() def ior_thread(self, pool, oclass, test, flags, - single_cont_read=True): + single_cont_read=True, + fail_on_warning=True): """Start threads and wait until all threads are finished. Args: @@ -304,7 +324,12 @@ def ior_thread(self, pool, oclass, test, flags, oclass (str): IOR object class, container class. 
test (list): IOR test sequence flags (str): IOR flags - + single_cont_read (bool) : Always read from the + 1st container. + Defaults to True. + fail_on_warning (bool) : Test terminates + for IOR warnings. + Defaults to True. """ self.pool = pool self.ior_cmd.get_params(self) @@ -325,7 +350,8 @@ def ior_thread(self, pool, oclass, test, flags, self.ior_cmd.transfer_size.update(test[2]) self.ior_cmd.block_size.update(test[3]) self.ior_cmd.flags.update(flags) - self.run_ior_with_pool(create_pool=False, create_cont=False) + self.run_ior_with_pool(create_pool=False, create_cont=False, + fail_on_warning=fail_on_warning) def run_mdtest_thread(self): """Start mdtest thread and wait until thread completes. From ff074fc055ccaaae1060dfadc6cb097e5189f22a Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Thu, 18 Mar 2021 19:01:31 -0400 Subject: [PATCH 20/37] DAOS-6923 test: Fix minor checkpatch issues. Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/util/osa_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 266a263a0d6..6fc602d0edb 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -118,7 +118,7 @@ def get_pool_version(self): """ data = self.dmg_command.pool_query(self.pool.uuid) return int(data["response"]["version"]) - + def set_container(self, container): """Set the OSA utils container object. Args: @@ -287,7 +287,7 @@ def run_ior_thread(self, action, oclass, test, oclass (str): IOR object class test (list): IOR test sequence flags (str): IOR flags - single_cont_read (bool) : Always read from the + single_cont_read (bool) : Always read from the 1st container. Defaults to True. fail_on_warning (bool) : Test terminates @@ -324,7 +324,7 @@ def ior_thread(self, pool, oclass, test, flags, oclass (str): IOR object class, container class. test (list): IOR test sequence flags (str): IOR flags - single_cont_read (bool) : Always read from the + single_cont_read (bool) : Always read from the 1st container. Defaults to True. fail_on_warning (bool) : Test terminates From 189be593d1c7c3dfc4068a4fa202c43472dae113 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Thu, 18 Mar 2021 21:42:08 -0400 Subject: [PATCH 21/37] DAOS-6923 test: Fix the ior_thread issue. Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/util/osa_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 6fc602d0edb..e96ea81babb 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -336,7 +336,7 @@ def ior_thread(self, pool, oclass, test, flags, self.ior_cmd.set_daos_params(self.server_group, self.pool) self.ior_cmd.dfs_oclass.update(oclass) self.ior_cmd.dfs_dir_oclass.update(oclass) - if single_cont_read is True and self.container is None: + if single_cont_read is True: # Prepare the containers created and use in a specific # way defined in prepare_cont_ior_write. 
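Putting the new keyword arguments together, a test method would drive these helpers roughly as follows. This is a hedged usage sketch: the object class and sequence values stand in for the yaml-driven parameters, and my_container is a hypothetical TestContainer:

    # Default path: write through the per-pool managed container and
    # treat IOR warnings as fatal.
    test_seq = self.ior_test_sequence[0]
    self.run_ior_thread("Write", "RP_2G1", test_seq)

    # Read back while a rank is excluded; tolerate IOR warnings.
    self.run_ior_thread("Read", "RP_2G1", test_seq, fail_on_warning=False)

    # Bypass the pool_cont_dict bookkeeping and target a known container.
    self.set_container(my_container)  # hypothetical TestContainer object
    self.run_ior_thread("Write", "RP_2G1", test_seq, single_cont_read=False)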
self.prepare_cont_ior_write_read(oclass, flags) From a557e807ed287ad9e3cfdc97aa9fd2caf070679e Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Mon, 22 Mar 2021 15:42:01 -0400 Subject: [PATCH 22/37] DAOS-6923 test: Added skipForTicket (DAOS-6925) Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- .../ftest/osa/osa_offline_reintegration.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index fd89a6f0a70..910a2c90bb6 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -9,6 +9,7 @@ from daos_utils import DaosCommand from test_utils_pool import TestPool from write_host_file import write_host_file +from apricot import skipForTicket class OSAOfflineReintegration(OSAUtils): @@ -34,6 +35,7 @@ def setUp(self): self.hostfile_clients = write_host_file( self.hostlist_clients, self.workdir, None) self.dmg_command.exit_status_exception = True + self.pool_cont_dict = {} def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False, oclass=None): @@ -88,10 +90,16 @@ def run_offline_reintegration_test(self, num_pool, data=False, self.simple_exclude_reintegrate_loop(rank[val]) output = self.dmg_command.pool_exclude(self.pool.uuid, rank[val]) + # Check the IOR data after exclude + if data: + self.run_ior_thread("Read", oclass, test_seq) else: output = self.dmg_command.system_stop(ranks=rank[val], force=True) self.print_and_assert_on_rebuild_failure(output) + # Check the IOR data after system stop + if data: + self.run_ior_thread("Read", oclass, test_seq) output = self.dmg_command.system_start(ranks=rank[val]) # Just try to reintegrate rank 5 if (self.test_during_rebuild is True and val == 2): @@ -136,12 +144,11 @@ def run_offline_reintegration_test(self, num_pool, data=False, if data: self.run_ior_thread("Read", oclass, test_seq) self.run_mdtest_thread() - if self.test_during_rebuild is True: - self.container = self.pool_cont_dict[self.pool] - kwargs = {"pool": self.pool.uuid, - "cont": self.container.uuid} - output = self.daos_command.container_check(**kwargs) - self.log.info(output) + self.container = self.pool_cont_dict[self.pool][0] + kwargs = {"pool": self.pool.uuid, + "cont": self.container.uuid} + output = self.daos_command.container_check(**kwargs) + self.log.info(output) def test_osa_offline_reintegration_multiple_pools(self): """Test ID: DAOS-6923 @@ -179,6 +186,7 @@ def test_osa_offline_reintegrate_during_rebuild(self): '/run/rebuild/*') self.run_offline_reintegration_test(1, data=True) + @skipForTicket("DAOS-6925") def test_osa_offline_reintegration_oclass(self): """Test ID: DAOS-6923 Test Description: Validate Offline Reintegration From 76842210d5438f41a6bb2f982e3aa9a6c9ba1efd Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Mon, 22 Mar 2021 17:26:12 -0400 Subject: [PATCH 23/37] DAOS-6923 test: Removed unwanted variable. 
Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py index 910a2c90bb6..3c7a3ba38aa 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -35,7 +35,6 @@ def setUp(self): self.hostfile_clients = write_host_file( self.hostlist_clients, self.workdir, None) self.dmg_command.exit_status_exception = True - self.pool_cont_dict = {} def run_offline_reintegration_test(self, num_pool, data=False, server_boot=False, oclass=None): From 33406afa7c15ffc2d11e89b058da70b878e6404b Mon Sep 17 00:00:00 2001 From: Di Wang <di.wang@intel.com> Date: Tue, 16 Mar 2021 06:28:51 +0000 Subject: [PATCH 24/37] DAOS-5758 pl: fixes for placement 1. Add allow_status to layout generation, so that only targets whose status is in the allowed set can appear in the layout. Also remove op_type/for_reint and some duplication to make the placement algorithms easier to follow. 2. During the reintegration reclaim process, compare the shard id as well, i.e. if the shard id differs, the object also needs to be deleted. 3. In particular, fix find_reint and main placement APIs returning too many items when the pool map contains simultaneous reintegration, drain, and failure operations. This is a possible real-world scenario that would be triggered when a reintegration is running and something fails. 4. Re-enable a few placement tests and add tests for multiple simultaneous states. Signed-off-by: Di Wang <di.wang@intel.com> Signed-off-by: Byron Marohn <byron.marohn@intel.com> --- src/common/pool_map.c | 4 + src/include/daos/placement.h | 3 +- src/include/daos/pool_map.h | 6 + src/placement/jump_map.c | 524 +++++++++++------------ src/placement/pl_map.c | 4 +- src/placement/pl_map.h | 4 +- src/placement/pl_map_common.c | 21 +- src/placement/ring_map.c | 3 +- src/placement/tests/jump_map_place_obj.c | 145 ++++++- src/rebuild/scan.c | 3 +- src/tests/suite/daos_rebuild_simple.c | 2 - 11 files changed, 414 insertions(+), 305 deletions(-) diff --git a/src/common/pool_map.c b/src/common/pool_map.c index f39029727d9..7a678bc8277 100644 --- a/src/common/pool_map.c +++ b/src/common/pool_map.c @@ -104,6 +104,10 @@ static struct pool_comp_state_dict comp_state_dict[] = { .sd_state = PO_COMP_ST_NEW, .sd_name = "NEW", }, + { + .sd_state = PO_COMP_ST_DRAIN, + .sd_name = "DRAIN", + }, { .sd_state = PO_COMP_ST_UNKNOWN, .sd_name = "UNKNOWN", diff --git a/src/include/daos/placement.h b/src/include/daos/placement.h index 263032eb4b4..d9f5b1d4f95 100644 --- a/src/include/daos/placement.h +++ b/src/include/daos/placement.h @@ -121,7 +121,8 @@ void pl_obj_layout_free(struct pl_obj_layout *layout); int pl_obj_layout_alloc(unsigned int grp_size, unsigned int grp_nr, struct pl_obj_layout **layout_pp); bool pl_obj_layout_contains(struct pool_map *map, struct pl_obj_layout *layout, - uint32_t rank, uint32_t target_index); + uint32_t rank, uint32_t target_index, + uint32_t shard); int pl_obj_place(struct pl_map *map, struct daos_obj_md *md, diff --git a/src/include/daos/pool_map.h b/src/include/daos/pool_map.h index 614a7d2d3f1..13006ea42f7 100644 --- a/src/include/daos/pool_map.h +++ b/src/include/daos/pool_map.h @@ -313,6 +313,12 @@ pool_target_unavail(struct pool_target *tgt, bool for_reint) return pool_component_unavail(&tgt->ta_comp, for_reint); } +static
inline bool +pool_target_avail(struct pool_target *tgt, uint32_t allow_status) +{ + return tgt->ta_comp.co_status & allow_status; +} + /** Check if the target is in PO_COMP_ST_DOWN status */ static inline bool pool_target_down(struct pool_target *tgt) diff --git a/src/placement/jump_map.c b/src/placement/jump_map.c index 9995a9b3ac5..8f2d6841d11 100644 --- a/src/placement/jump_map.c +++ b/src/placement/jump_map.c @@ -60,26 +60,6 @@ struct pl_jump_map { pool_comp_type_t jmp_redundant_dom; }; -/** - * This functions determines whether the object layout should be extended or - * not based on the operation performed and the target status. - * - * \param[in] op The operation being performed - * \param[in] status The component status. - * - * \return True if the layout should be extended, - * False otherwise. - */ -static inline bool -can_extend(enum PL_OP_TYPE op, enum pool_comp_state state) -{ - if (op != PL_PLACE_EXTENDED) - return false; - if (state != PO_COMP_ST_UP && state != PO_COMP_ST_DRAIN) - return false; - return true; -} - /** * This functions finds the pairwise differences in the two layouts provided * and appends them into the d_list provided. The function appends the targets @@ -111,7 +91,28 @@ layout_find_diff(struct pl_jump_map *jmap, struct pl_obj_layout *original, if (reint_tgt != original_target) { pool_map_find_target(jmap->jmp_map.pl_poolmap, reint_tgt, &temp_tgt); - remap_alloc_one(diff, index, temp_tgt, true); + if (pool_target_avail(temp_tgt, PO_COMP_ST_UPIN | + PO_COMP_ST_UP | + PO_COMP_ST_DRAIN | + PO_COMP_ST_NEW)) + remap_alloc_one(diff, index, temp_tgt, true); + else + /* XXX: This isn't desirable - but it can happen + * when a reintegration is happening when + * something else fails. Placement will do a + * pass to determine what failed (good), and + * then do another pass to figure out where + * things moved to. But that 2nd pass will + * re-find failed things, and this diff function + * will cause the failed targets to be re-added + * to the layout as rebuilding. This should be + * removed when placement is able to handle + * this situation better + */ + D_DEBUG(DB_PL, + "skip remap %d to unavail tgt %u\n", + index, reint_tgt); + } } } @@ -211,8 +212,20 @@ pl_map2jmap(struct pl_map *map) return container_of(map, struct pl_jump_map, jmp_map); } +static void debug_print_allow_status(uint32_t allow_status) +{ + D_DEBUG(DB_PL, "Allow status: [%s%s%s%s%s%s%s ]\n", + allow_status & PO_COMP_ST_UNKNOWN ? " UNKNOWN" : "", + allow_status & PO_COMP_ST_NEW ? " NEW" : "", + allow_status & PO_COMP_ST_UP ? " UP" : "", + allow_status & PO_COMP_ST_UPIN ? " UPIN" : "", + allow_status & PO_COMP_ST_DOWN ? " DOWN" : "", + allow_status & PO_COMP_ST_DOWNOUT ? " DOWNOUT" : "", + allow_status & PO_COMP_ST_DRAIN ? 
" DRAIN" : ""); +} + static inline uint32_t -get_num_domains(struct pool_domain *curr_dom, enum PL_OP_TYPE op_type) +get_num_domains(struct pool_domain *curr_dom, uint32_t allow_status) { struct pool_domain *next_dom; struct pool_target *next_target; @@ -224,7 +237,7 @@ get_num_domains(struct pool_domain *curr_dom, enum PL_OP_TYPE op_type) else num_dom = curr_dom->do_child_nr; - if (op_type == PL_ADD) + if (allow_status & PO_COMP_ST_NEW) return num_dom; if (curr_dom->do_children != NULL) { @@ -281,7 +294,7 @@ get_num_domains(struct pool_domain *curr_dom, enum PL_OP_TYPE op_type) static void get_target(struct pool_domain *curr_dom, struct pool_target **target, uint64_t obj_key, uint8_t *dom_used, uint8_t *dom_occupied, - uint8_t *tgts_used, int shard_num, enum PL_OP_TYPE op_type) + uint8_t *tgts_used, int shard_num, uint32_t allow_status) { int range_set; uint8_t found_target = 0; @@ -296,7 +309,7 @@ get_target(struct pool_domain *curr_dom, struct pool_target **target, uint32_t num_doms; /* Retrieve number of nodes in this domain */ - num_doms = get_num_domains(curr_dom, op_type); + num_doms = get_num_domains(curr_dom, allow_status); /* If choosing target (lowest fault domain level) */ if (curr_dom->do_children == NULL) { @@ -409,7 +422,6 @@ get_target(struct pool_domain *curr_dom, struct pool_target **target, } while (!found_target); } - uint32_t count_available_spares(struct pl_jump_map *jmap, struct pl_obj_layout *layout, uint32_t failed_in_layout) @@ -452,9 +464,9 @@ count_available_spares(struct pl_jump_map *jmap, struct pl_obj_layout *layout, static int obj_remap_shards(struct pl_jump_map *jmap, struct daos_obj_md *md, struct pl_obj_layout *layout, struct jm_obj_placement *jmop, - d_list_t *remap_list, enum PL_OP_TYPE op_type, + d_list_t *remap_list, uint32_t allow_status, uint8_t *tgts_used, uint8_t *dom_used, uint8_t *dom_occupied, - uint32_t failed_in_layout, d_list_t *extend_list) + uint32_t failed_in_layout, bool *is_extending) { struct failed_shard *f_shard; struct pl_obj_shard *l_shard; @@ -463,7 +475,6 @@ obj_remap_shards(struct pl_jump_map *jmap, struct daos_obj_md *md, d_list_t *current; daos_obj_id_t oid; bool spare_avail = true; - bool for_reint; uint64_t key; uint32_t spares_left; int rc; @@ -471,7 +482,6 @@ obj_remap_shards(struct pl_jump_map *jmap, struct daos_obj_md *md, remap_dump(remap_list, md, "remap:"); - for_reint = (op_type == PL_REINT); current = remap_list->next; spare_tgt = NULL; oid = md->omd_id; @@ -491,6 +501,7 @@ obj_remap_shards(struct pl_jump_map *jmap, struct daos_obj_md *md, l_shard = &layout->ol_shards[f_shard->fs_shard_idx]; D_DEBUG(DB_PL, "Attempting to remap failed shard: " DF_FAILEDSHARD"\n", DP_FAILEDSHARD(*f_shard)); + debug_print_allow_status(allow_status); /* * If there are any targets left, there are potentially valid @@ -505,23 +516,18 @@ obj_remap_shards(struct pl_jump_map *jmap, struct daos_obj_md *md, rebuild_key = crc(key, f_shard->fs_shard_idx); get_target(root, &spare_tgt, crc(key, rebuild_key), dom_used, dom_occupied, tgts_used, - shard_id, op_type); + shard_id, allow_status); D_ASSERT(spare_tgt != NULL); D_DEBUG(DB_PL, "Trying new target: "DF_TARGET"\n", DP_TARGET(spare_tgt)); spares_left--; } - determine_valid_spares(spare_tgt, md, spare_avail, - ¤t, remap_list, for_reint, f_shard, - l_shard); + determine_valid_spares(spare_tgt, md, spare_avail, ¤t, + remap_list, allow_status, f_shard, + l_shard, is_extending); } - if (op_type == PL_PLACE_EXTENDED) { - rc = pl_map_extend(layout, extend_list); - if (rc != 0) - return rc; - } return 
0; } @@ -544,7 +550,6 @@ jump_map_obj_spec_place_get(struct pl_jump_map *jmap, daos_obj_id_t oid, *target = &(tgts[pos]); - rc = pool_map_find_domain(jmap->jmp_map.pl_poolmap, PO_COMP_TP_ROOT, PO_COMP_ID_ALL, &root); D_ASSERT(rc == 1); @@ -589,45 +594,43 @@ jump_map_obj_spec_place_get(struct pl_jump_map *jmap, daos_obj_id_t oid, * \param[in] jmap The placement map used for this placement. * \param[in] jmop The layout group size and count. * \param[in] md Object metadata. + * \param[in] allow_status target status allowed to be in the layout. * \param[out] layout This will contain the layout for the object - * \param[out] remap_list This will contain the targets that need to + * \param[out] out_list This will contain the targets that need to * be rebuilt and in the case of rebuild, may be * returned during the rebuild process. + * \param[out] is_extending if there is drain/extending/reintegrating tgts + * exists in this layout, which we might need + * insert extra shards into the layout. * * \return An error code determining if the function * succeeded (0) or failed. */ static int get_object_layout(struct pl_jump_map *jmap, struct pl_obj_layout *layout, - struct jm_obj_placement *jmop, d_list_t *remap_list, - enum PL_OP_TYPE op_type, struct daos_obj_md *md) + struct jm_obj_placement *jmop, d_list_t *out_list, + uint32_t allow_status, struct daos_obj_md *md, + bool *is_extending) { struct pool_target *target; struct pool_domain *root; daos_obj_id_t oid; - d_list_t extend_list; uint8_t *dom_used = NULL; uint8_t *dom_occupied = NULL; uint8_t *tgts_used = NULL; - uint32_t dom_used_length; + uint32_t dom_size; uint64_t key; - uint32_t fail_tgt_cnt; - bool for_reint; - enum pool_comp_state state; - int i, j, k, rc; + uint32_t fail_tgt_cnt = 0; + bool spec_oid = false; + d_list_t local_list; + d_list_t *remap_list; + int i, j, k; + int rc = 0; /* Set the pool map version */ layout->ol_ver = pl_map_version(&(jmap->jmp_map)); D_DEBUG(DB_PL, "Building layout. map version: %d\n", layout->ol_ver); - - j = 0; - k = 0; - fail_tgt_cnt = 0; - oid = md->omd_id; - key = oid.hi ^ oid.lo; - target = NULL; - for_reint = (op_type == PL_REINT); - D_DEBUG(DB_PL, "for_reint: %s", for_reint ? "Yes" : "No"); + debug_print_allow_status(allow_status); rc = pool_map_find_domain(jmap->jmp_map.pl_poolmap, PO_COMP_TP_ROOT, PO_COMP_ID_ALL, &root); @@ -635,64 +638,51 @@ get_object_layout(struct pl_jump_map *jmap, struct pl_obj_layout *layout, D_ERROR("Could not find root node in pool map."); return -DER_NONEXIST; } + rc = 0; - dom_used_length = (struct pool_domain *)(root->do_targets) - (root) + 1; - - D_ALLOC_ARRAY(dom_used, (dom_used_length / 8) + 1); - D_ALLOC_ARRAY(dom_occupied, (dom_used_length / 8) + 1); - D_ALLOC_ARRAY(tgts_used, (root->do_target_nr / 8) + 1); - D_INIT_LIST_HEAD(&extend_list); + if (out_list != NULL) { + remap_list = out_list; + } else { + D_INIT_LIST_HEAD(&local_list); + remap_list = &local_list; + } + dom_size = (struct pool_domain *)(root->do_targets) - (root) + 1; + D_ALLOC_ARRAY(dom_used, (dom_size / NBBY) + 1); + D_ALLOC_ARRAY(dom_occupied, (dom_size / NBBY) + 1); + D_ALLOC_ARRAY(tgts_used, (root->do_target_nr / NBBY) + 1); if (dom_used == NULL || dom_occupied == NULL || tgts_used == NULL) D_GOTO(out, rc = -DER_NOMEM); - /** - * If the object class is a special class then the first shard must be - * hand picked because there is no other way to specify a starting - * location. 
- */ - if (daos_obj_is_srank(oid)) { - rc = jump_map_obj_spec_place_get(jmap, oid, &target, dom_used, - dom_used_length); - if (rc) { - D_ERROR("special oid "DF_OID" failed: rc %d\n", - DP_OID(oid), rc); - D_GOTO(out, rc); - } - - layout->ol_shards[0].po_target = target->ta_comp.co_id; - layout->ol_shards[0].po_shard = 0; - layout->ol_shards[0].po_fseq = target->ta_comp.co_fseq; - setbit(tgts_used, target->ta_comp.co_id); - - if (pool_target_unavail(target, for_reint)) { - fail_tgt_cnt++; - state = target->ta_comp.co_status; - rc = remap_alloc_one(remap_list, 0, target, false); - if (rc) - D_GOTO(out, rc); - if (can_extend(op_type, state)) { - rc = remap_alloc_one(&extend_list, k, target, - true); - if (rc != 0) + oid = md->omd_id; + key = oid.hi ^ oid.lo; + if (daos_obj_is_srank(oid)) + spec_oid = true; + + for (i = 0, k = 0; i < jmop->jmop_grp_nr; i++) { + for (j = 0; j < jmop->jmop_grp_size; j++, k++) { + target = NULL; + if (spec_oid && i == 0 && j == 0) { + /** + * If the object class is a special class then + * the first shard must be picked specially. + */ + rc = jump_map_obj_spec_place_get(jmap, oid, + &target, + dom_used, + dom_size); + if (rc) { + D_ERROR("special oid "DF_OID + " failed: rc %d\n", + DP_OID(oid), rc); D_GOTO(out, rc); + } + setbit(tgts_used, target->ta_comp.co_id); + } else { + get_target(root, &target, key, dom_used, + dom_occupied, tgts_used, k, + allow_status); } - } - - /** skip the first shard because it's been - * determined by Obj class - */ - j = 1; - k = 1; - } - for (i = 0; i < jmop->jmop_grp_nr; i++) { - - for (; j < jmop->jmop_grp_size; j++, k++) { - uint32_t tgt_id; - uint32_t fseq; - - get_target(root, &target, key, dom_used, dom_occupied, - tgts_used, k, op_type); if (target == NULL) { D_DEBUG(DB_PL, "no targets for %d/%d/%d\n", @@ -702,52 +692,44 @@ get_object_layout(struct pl_jump_map *jmap, struct pl_obj_layout *layout, layout->ol_shards[k].po_fseq = 0; continue; } - - tgt_id = target->ta_comp.co_id; - fseq = target->ta_comp.co_fseq; - - layout->ol_shards[k].po_target = tgt_id; + layout->ol_shards[k].po_target = + target->ta_comp.co_id; + layout->ol_shards[k].po_fseq = + target->ta_comp.co_fseq; layout->ol_shards[k].po_shard = k; - layout->ol_shards[k].po_fseq = fseq; /** If target is failed queue it for remap*/ - if (pool_target_unavail(target, for_reint)) { - D_DEBUG(DB_PL, "Target unavailable " DF_TARGET - ". Adding to remap_list:\n", - DP_TARGET(target)); + if (!pool_target_avail(target, allow_status)) { fail_tgt_cnt++; - state = target->ta_comp.co_status; + D_DEBUG(DB_PL, "Target unavailable " DF_TARGET + ". 
Adding to remap_list: fail cnt %d\n", + DP_TARGET(target), fail_tgt_cnt); rc = remap_alloc_one(remap_list, k, target, - false); + false); if (rc) D_GOTO(out, rc); - if (can_extend(op_type, state)) { - D_DEBUG(DB_PL, "Adding "DF_TARGET" to" - " extend_list\n", - DP_TARGET(target)); - remap_alloc_one(&extend_list, k, - target, true); - } + if (is_extending != NULL && + (target->ta_comp.co_status == + PO_COMP_ST_UP || + target->ta_comp.co_status == + PO_COMP_ST_DRAIN)) + *is_extending = true; } } - - j = 0; } - rc = 0; - D_DEBUG(DB_PL, "Fail tgt cnt: %d\n", fail_tgt_cnt); if (fail_tgt_cnt > 0) rc = obj_remap_shards(jmap, md, layout, jmop, remap_list, - op_type, tgts_used, dom_used, - dom_occupied, fail_tgt_cnt, - &extend_list); + allow_status, tgts_used, dom_used, + dom_occupied, fail_tgt_cnt, is_extending); out: - if (rc) { + if (rc) D_ERROR("jump_map_obj_layout_fill failed, rc "DF_RC"\n", DP_RC(rc)); - remap_list_free_all(remap_list); - } + if (remap_list == &local_list) + remap_list_free_all(&local_list); + if (dom_used) D_FREE(dom_used); if (dom_occupied) @@ -758,6 +740,42 @@ get_object_layout(struct pl_jump_map *jmap, struct pl_obj_layout *layout, return rc; } +static int +obj_layout_alloc_and_get(struct pl_jump_map *jmap, + struct jm_obj_placement *jmop, struct daos_obj_md *md, + uint32_t allow_status, struct pl_obj_layout **layout_p, + d_list_t *remap_list, bool *is_extending) +{ + int rc; + + /* Allocate space to hold the layout */ + D_ASSERT(jmop->jmop_grp_size > 0); + D_ASSERT(jmop->jmop_grp_nr > 0); + rc = pl_obj_layout_alloc(jmop->jmop_grp_size, jmop->jmop_grp_nr, + layout_p); + if (rc) { + D_ERROR("pl_obj_layout_alloc failed, rc "DF_RC"\n", + DP_RC(rc)); + return rc; + } + + rc = get_object_layout(jmap, *layout_p, jmop, remap_list, allow_status, + md, is_extending); + if (rc) { + D_ERROR("get object layout failed, rc "DF_RC"\n", + DP_RC(rc)); + D_GOTO(out, rc); + } + +out: + if (rc != 0) { + if (*layout_p != NULL) + pl_obj_layout_free(*layout_p); + *layout_p = NULL; + } + return rc; +} + /** * Frees the placement map * @@ -873,20 +891,21 @@ jump_map_obj_place(struct pl_map *map, struct daos_obj_md *md, struct pl_obj_layout **layout_pp) { struct pl_jump_map *jmap; - struct pl_obj_layout *layout; - struct pl_obj_layout *add_layout = NULL; + struct pl_obj_layout *layout = NULL; + struct pl_obj_layout *extend_layout = NULL; struct jm_obj_placement jmop; - struct pool_domain *root; - d_list_t remap_list; - d_list_t add_list; + d_list_t extend_list; + bool is_extending = false; + bool is_adding_new = false; daos_obj_id_t oid; + struct pool_domain *root; + uint32_t allow_status; int rc; - D_DEBUG(DB_PL, "Determining location for object: "DF_OID", ver: %d\n", - DP_OID(md->omd_id), md->omd_ver); - jmap = pl_map2jmap(map); oid = md->omd_id; + D_DEBUG(DB_PL, "Determining location for object: "DF_OID", ver: %d\n", + DP_OID(oid), md->omd_ver); rc = jm_obj_placement_get(jmap, md, shard_md, &jmop); if (rc) { @@ -894,66 +913,71 @@ jump_map_obj_place(struct pl_map *map, struct daos_obj_md *md, return rc; } - /* Allocate space to hold the layout */ - rc = pl_obj_layout_alloc(jmop.jmop_grp_size, jmop.jmop_grp_nr, - &layout); - if (rc) { - D_ERROR("pl_obj_layout_alloc failed, rc "DF_RC"\n", DP_RC(rc)); - return rc; - } - - D_INIT_LIST_HEAD(&remap_list); - rc = get_object_layout(jmap, layout, &jmop, &remap_list, - PL_PLACE_EXTENDED, md); + D_INIT_LIST_HEAD(&extend_list); + allow_status = PO_COMP_ST_UPIN; + rc = obj_layout_alloc_and_get(jmap, &jmop, md, allow_status, &layout, + NULL, &is_extending); 
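	/*
	 * Sketch of the new scheme: allow_status replaces the old
	 * enum PL_OP_TYPE with a bitmask of pool component states
	 * (PO_COMP_ST_UPIN, _UP, _DOWN, _DRAIN, _NEW), so a caller
	 * states which target states may appear in the layout.  The
	 * per-target test presumably reduces to a mask check such as
	 *
	 *	(target->ta_comp.co_status & allow_status) != 0
	 *
	 * which is what the pool_target_avail() checks in this patch
	 * stand for.
	 */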
 	if (rc != 0) {
 		D_ERROR("get_layout_alloc failed, rc "DF_RC"\n", DP_RC(rc));
-		pl_obj_layout_free(layout);
-		return rc;
+		D_GOTO(out, rc);
 	}
 
-	/* Needed to check if domains are being added to pool map */
-	rc = pool_map_find_domain(jmap->jmp_map.pl_poolmap, PO_COMP_TP_ROOT,
-				  PO_COMP_ID_ALL, &root);
-	D_ASSERT(rc == 1);
-
-	if (is_pool_adding(root)) {
-		/* Allocate space to hold the layout */
-		rc = pl_obj_layout_alloc(jmop.jmop_grp_size, jmop.jmop_grp_nr,
-					 &add_layout);
-		if (rc) {
-			D_ERROR("pl_obj_layout_alloc failed, rc "DF_RC"\n",
-				DP_RC(rc));
-			goto out;
-		}
+	obj_layout_dump(oid, layout);
 
-		remap_list_free_all(&remap_list);
-		D_INIT_LIST_HEAD(&remap_list);
+	rc = pool_map_find_domain(jmap->jmp_map.pl_poolmap,
+				  PO_COMP_TP_ROOT, PO_COMP_ID_ALL,
+				  &root);
+	D_ASSERT(rc == 1);
+	rc = 0;
+	if (is_pool_adding(root))
+		is_adding_new = true;
 
-		rc = get_object_layout(jmap, add_layout, &jmop, &remap_list,
-				       PL_ADD, md);
-		assert(rc == 0);
-		D_INIT_LIST_HEAD(&add_list);
-		layout_find_diff(jmap, layout, add_layout, &add_list);
+	/* The layout may need to be extended, i.e. extra shards may need
+	 * to be inserted into the layout.
+	 */
+	if (unlikely(is_extending || is_adding_new)) {
+		/* Needed to check if domains are being added to pool map */
+		D_DEBUG(DB_PL, DF_OID"/%d is being extended.\n",
+			DP_OID(oid), md->omd_ver);
+		if (is_adding_new)
+			allow_status |= PO_COMP_ST_NEW;
+		else
+			allow_status |= PO_COMP_ST_UP | PO_COMP_ST_DRAIN;
+
+		/* Don't repeat remapping failed shards during this phase -
+		 * they have already been remapped.
+		 */
+		allow_status |= PO_COMP_ST_DOWN;
+		rc = obj_layout_alloc_and_get(jmap, &jmop, md, allow_status,
+					      &extend_layout, NULL, NULL);
+		if (rc)
+			D_GOTO(out, rc);
 
-		if (!d_list_empty(&add_list))
-			rc = pl_map_extend(layout, &add_list);
+		obj_layout_dump(oid, extend_layout);
+		layout_find_diff(jmap, layout, extend_layout, &extend_list);
+		if (!d_list_empty(&extend_list)) {
+			rc = pl_map_extend(layout, &extend_list);
+			if (rc)
+				D_GOTO(out, rc);
+		}
+		obj_layout_dump(oid, layout);
 	}
+
+	*layout_pp = layout;
 out:
-	remap_list_free_all(&remap_list);
+	remap_list_free_all(&extend_list);
 
-	if (add_layout != NULL)
-		pl_obj_layout_free(add_layout);
+	if (extend_layout != NULL)
+		pl_obj_layout_free(extend_layout);
 
 	if (rc < 0) {
 		D_ERROR("Could not generate placement layout, rc "DF_RC"\n",
 			DP_RC(rc));
-		pl_obj_layout_free(layout);
-		return rc;
+		if (layout != NULL)
+			pl_obj_layout_free(layout);
 	}
 
-	*layout_pp = layout;
-	obj_layout_dump(oid, layout);
-
-	return DER_SUCCESS;
+	return rc;
 }
 
 /**
@@ -990,7 +1014,7 @@ jump_map_obj_find_rebuild(struct pl_map *map, struct daos_obj_md *md,
 
 	int idx = 0;
 
-	D_DEBUG(DB_PL, "Finding Rebuild\n");
+	D_DEBUG(DB_PL, "Finding Rebuild at version: %u\n", rebuild_ver);
 
 	/* Caller should guarantee the pl_map is up-to-date */
 	if (pl_map_version(map) < rebuild_ver) {
@@ -1008,32 +1032,20 @@ jump_map_obj_find_rebuild(struct pl_map *map, struct daos_obj_md *md,
 		return rc;
 	}
 
-	/* Allocate space to hold the layout */
-	rc = pl_obj_layout_alloc(jmop.jmop_grp_size, jmop.jmop_grp_nr,
-				 &layout);
-	if (rc) {
-		D_ERROR("pl_obj_layout_alloc failed, rc "DF_RC"\n", DP_RC(rc));
-		return rc;
-	}
-
 	D_INIT_LIST_HEAD(&remap_list);
-	rc = get_object_layout(jmap, layout, &jmop, &remap_list, PL_REBUILD,
-			       md);
-
-	if (rc < 0) {
-		D_ERROR("Could not generate placement layout, rc "DF_RC"\n",
-			DP_RC(rc));
-		goto out;
-	}
+	rc = obj_layout_alloc_and_get(jmap, &jmop, md, PO_COMP_ST_UPIN, &layout,
+				      &remap_list, NULL);
+	if (rc < 0)
+		D_GOTO(out, rc);
 
 	obj_layout_dump(oid, layout);
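	/*
	 * Illustrative note: with allow_status limited to PO_COMP_ST_UPIN,
	 * every shard whose target is in any other state was queued on
	 * remap_list by get_object_layout(); remap_list_fill() below then
	 * converts that list into the tgt_id/shard_idx output arrays for
	 * this rebuild version.
	 */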
- rc = remap_list_fill(map, md, shard_md, rebuild_ver, tgt_id, shard_idx, array_size, &idx, layout, &remap_list, false); out: remap_list_free_all(&remap_list); - pl_obj_layout_free(layout); + if (layout != NULL) + pl_obj_layout_free(layout); return rc < 0 ? rc : idx; } @@ -1044,16 +1056,16 @@ jump_map_obj_find_reint(struct pl_map *map, struct daos_obj_md *md, uint32_t *shard_id, unsigned int array_size) { struct pl_jump_map *jmap; - struct pl_obj_layout *layout; - struct pl_obj_layout *reint_layout; - d_list_t remap_list; - d_list_t reint_list; + struct pl_obj_layout *layout = NULL; + struct pl_obj_layout *reint_layout = NULL; + d_list_t reint_list; struct jm_obj_placement jop; + uint32_t allow_status; int rc; int idx = 0; - D_DEBUG(DB_PL, "Finding Rebuild\n"); + D_DEBUG(DB_PL, "Finding Reint at version: %u\n", reint_ver); /* Caller should guarantee the pl_map is up-to-date */ if (pl_map_version(map) < reint_ver) { @@ -1063,41 +1075,27 @@ jump_map_obj_find_reint(struct pl_map *map, struct daos_obj_md *md, } jmap = pl_map2jmap(map); - rc = jm_obj_placement_get(jmap, md, shard_md, &jop); if (rc) { D_ERROR("jm_obj_placement_get failed, rc %d.\n", rc); return rc; } - /* Allocate space to hold the layout */ - rc = pl_obj_layout_alloc(jop.jmop_grp_size, jop.jmop_grp_nr, - &layout); - if (rc) - return 0; - - rc = pl_obj_layout_alloc(jop.jmop_grp_size, jop.jmop_grp_nr, - &reint_layout); - if (rc) - goto out; - - D_INIT_LIST_HEAD(&remap_list); + /* Ignore DOWN and DRAIN objects here - this API is only for finding + * reintegration candidates + */ + allow_status = PO_COMP_ST_UPIN | PO_COMP_ST_DOWN | PO_COMP_ST_DRAIN; D_INIT_LIST_HEAD(&reint_list); + rc = obj_layout_alloc_and_get(jmap, &jop, md, allow_status, &layout, + NULL, NULL); + if (rc < 0) + D_GOTO(out, rc); - /* Get original placement */ - rc = get_object_layout(jmap, layout, &jop, &remap_list, PL_PLACE, md); - if (rc) - goto out; - - /* Clear list for next placement operation. */ - remap_list_free_all(&remap_list); - D_INIT_LIST_HEAD(&remap_list); - - /* Get placement after reintegration. 
*/ - rc = get_object_layout(jmap, reint_layout, &jop, &remap_list, PL_REINT, - md); - if (rc) - goto out; + allow_status |= PO_COMP_ST_UP; + rc = obj_layout_alloc_and_get(jmap, &jop, md, allow_status, + &reint_layout, NULL, NULL); + if (rc < 0) + D_GOTO(out, rc); layout_find_diff(jmap, layout, reint_layout, &reint_list); @@ -1106,8 +1104,6 @@ jump_map_obj_find_reint(struct pl_map *map, struct daos_obj_md *md, false); out: remap_list_free_all(&reint_list); - remap_list_free_all(&remap_list); - if (layout != NULL) pl_obj_layout_free(layout); if (reint_layout != NULL) @@ -1123,15 +1119,14 @@ jump_map_obj_find_addition(struct pl_map *map, struct daos_obj_md *md, uint32_t *shard_id, unsigned int array_size) { struct pl_jump_map *jmap; - struct pl_obj_layout *layout; - struct pl_obj_layout *add_layout; - d_list_t remap_list; + struct pl_obj_layout *layout = NULL; + struct pl_obj_layout *add_layout = NULL; d_list_t add_list; struct jm_obj_placement jop; + uint32_t allow_status; + int idx = 0; int rc; - int idx = 0; - D_DEBUG(DB_PL, "Finding new layout for server addition\n"); /* Caller should guarantee the pl_map is up-to-date */ @@ -1149,41 +1144,24 @@ jump_map_obj_find_addition(struct pl_map *map, struct daos_obj_md *md, return rc; } - /* Allocate space to hold the layout */ - rc = pl_obj_layout_alloc(jop.jmop_grp_size, jop.jmop_grp_nr, &layout); - if (rc) - return rc; - - D_INIT_LIST_HEAD(&remap_list); + allow_status = PO_COMP_ST_UPIN; D_INIT_LIST_HEAD(&add_list); - - rc = pl_obj_layout_alloc(jop.jmop_grp_size, jop.jmop_grp_nr, - &add_layout); - if (rc) - goto out; - - /* Get original placement */ - rc = get_object_layout(jmap, layout, &jop, &remap_list, PL_PLACE, md); + rc = obj_layout_alloc_and_get(jmap, &jop, md, allow_status, + &layout, NULL, NULL); if (rc) - goto out; + D_GOTO(out, rc); - /* Clear list for next placement operation. */ - remap_list_free_all(&remap_list); - D_INIT_LIST_HEAD(&remap_list); - - /* Get placement after server addition. 
*/ - rc = get_object_layout(jmap, add_layout, &jop, &remap_list, PL_ADD, - md); + allow_status |= PO_COMP_ST_NEW; + rc = obj_layout_alloc_and_get(jmap, &jop, md, allow_status, + &add_layout, NULL, NULL); if (rc) - goto out; + D_GOTO(out, rc); layout_find_diff(jmap, layout, add_layout, &add_list); - rc = remap_list_fill(map, md, shard_md, reint_ver, tgt_rank, shard_id, array_size, &idx, add_layout, &add_list, true); out: remap_list_free_all(&add_list); - remap_list_free_all(&remap_list); if (layout != NULL) pl_obj_layout_free(layout); diff --git a/src/placement/pl_map.c b/src/placement/pl_map.c index ca985ccea2c..cb49a0c5049 100644 --- a/src/placement/pl_map.c +++ b/src/placement/pl_map.c @@ -199,7 +199,7 @@ pl_obj_layout_free(struct pl_obj_layout *layout) /* Returns whether or not a given layout contains the specified rank */ bool pl_obj_layout_contains(struct pool_map *map, struct pl_obj_layout *layout, - uint32_t rank, uint32_t target_index) + uint32_t rank, uint32_t target_index, uint32_t id_shard) { struct pool_target *target; int i; @@ -211,7 +211,7 @@ pl_obj_layout_contains(struct pool_map *map, struct pl_obj_layout *layout, rc = pool_map_find_target(map, layout->ol_shards[i].po_target, &target); if (rc != 0 && target->ta_comp.co_rank == rank && - target->ta_comp.co_index == target_index) + target->ta_comp.co_index == target_index && i == id_shard) return true; /* Found a target and rank matches */ } diff --git a/src/placement/pl_map.h b/src/placement/pl_map.h index f6a14f23c2d..c2d88daad72 100644 --- a/src/placement/pl_map.h +++ b/src/placement/pl_map.h @@ -119,9 +119,9 @@ remap_list_fill(struct pl_map *map, struct daos_obj_md *md, void determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md, bool spare_avail, d_list_t **current, - d_list_t *remap_list, bool for_reint, + d_list_t *remap_list, uint32_t allow_status, struct failed_shard *f_shard, - struct pl_obj_shard *l_shard); + struct pl_obj_shard *l_shard, bool *extending); int spec_place_rank_get(unsigned int *pos, daos_obj_id_t oid, diff --git a/src/placement/pl_map_common.c b/src/placement/pl_map_common.c index a52b35ea9a6..f15f0430a5a 100644 --- a/src/placement/pl_map_common.c +++ b/src/placement/pl_map_common.c @@ -94,10 +94,12 @@ inline void remap_list_free_all(d_list_t *remap_list) { struct failed_shard *f_shard; + struct failed_shard *tmp; - while ((f_shard = d_list_pop_entry(remap_list, struct failed_shard, - fs_list))) + d_list_for_each_entry_safe(f_shard, tmp, remap_list, fs_list) { + d_list_del(&f_shard->fs_list); D_FREE(f_shard); + } } /** dump remap list, for debug only */ @@ -232,8 +234,8 @@ remap_list_fill(struct pl_map *map, struct daos_obj_md *md, void determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md, bool spare_avail, d_list_t **current, d_list_t *remap_list, - bool for_reint, struct failed_shard *f_shard, - struct pl_obj_shard *l_shard) + uint32_t allow_status, struct failed_shard *f_shard, + struct pl_obj_shard *l_shard, bool *is_extending) { struct failed_shard *f_tmp; @@ -241,7 +243,7 @@ determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md, goto next_fail; /* The selected spare target is down as well */ - if (pool_target_unavail(spare_tgt, for_reint)) { + if (!pool_target_avail(spare_tgt, allow_status)) { D_ASSERTF(spare_tgt->ta_comp.co_fseq != f_shard->fs_fseq, "same fseq %u!\n", f_shard->fs_fseq); @@ -294,6 +296,10 @@ determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md, D_DEBUG(DB_PL, "failed shard 
("DF_FAILEDSHARD") added to " "remamp_list\n", DP_FAILEDSHARD(*f_shard)); remap_add_one(remap_list, f_shard); + if (is_extending != NULL && + (spare_tgt->ta_comp.co_status == PO_COMP_ST_UP || + spare_tgt->ta_comp.co_status == PO_COMP_ST_DRAIN)) + *is_extending = true; /* Continue with the failed shard has minimal fseq */ if ((*current) == remap_list) { @@ -310,6 +316,7 @@ determine_valid_spares(struct pool_target *spare_tgt, struct daos_obj_md *md, spare_tgt->ta_comp.co_fseq); return; /* try next spare */ } + next_fail: if (spare_avail) { /* The selected spare target is up and ready */ @@ -468,7 +475,6 @@ pl_map_extend(struct pl_obj_layout *layout, d_list_t *extended_list) D_FREE(grp_map); if (grp_count != grp_cnt_array && grp_count != NULL) D_FREE(grp_count); - remap_list_free_all(extended_list); return rc; } @@ -477,7 +483,8 @@ is_pool_adding(struct pool_domain *dom) { uint32_t child_nr; - while (dom->do_children && dom->do_comp.co_status != PO_COMP_ST_NEW) { + while (dom->do_children && + dom->do_comp.co_status != PO_COMP_ST_NEW) { child_nr = dom->do_child_nr; dom = &dom->do_children[child_nr - 1]; } diff --git a/src/placement/ring_map.c b/src/placement/ring_map.c index b0877a62156..c57bbfd57a5 100644 --- a/src/placement/ring_map.c +++ b/src/placement/ring_map.c @@ -1022,7 +1022,8 @@ ring_obj_remap_shards(struct pl_ring_map *rimap, struct daos_obj_md *md, spare_tgt = &tgts[plts[spare_idx].pt_pos]; determine_valid_spares(spare_tgt, md, spare_avail, ¤t, - remap_list, for_reint, f_shard, l_shard); + remap_list, for_reint, f_shard, l_shard, + NULL); } remap_dump(remap_list, md, "after remap:"); diff --git a/src/placement/tests/jump_map_place_obj.c b/src/placement/tests/jump_map_place_obj.c index 5d79646bfa7..43730c09b52 100644 --- a/src/placement/tests/jump_map_place_obj.c +++ b/src/placement/tests/jump_map_place_obj.c @@ -921,6 +921,7 @@ jtc_snapshot_layout_targets(struct jm_test_ctx *ctx) } while (0) #define UP POOL_REINT +#define UPIN POOL_ADD_IN #define DOWN POOL_EXCLUDE #define DOWNOUT POOL_EXCLUDE_OUT #define DRAIN POOL_DRAIN @@ -1242,10 +1243,23 @@ down_back_to_up_in_same_order(void **state) jtc_set_status_on_target(&ctx, UP, orig_shard_targets[0]); jtc_assert_scan_and_layout(&ctx); - jtc_fini(&ctx); - skip_msg("DAOS-6519: too many things are in the reint scan"); - assert_int_equal(1, ctx.reint.out_nr); - jtc_assert_rebuild_reint_new(ctx, 1, 0, 1, 0); + /* NOTE: This is a really important test case. Even though this test + * seems like it should only move one shard (because only one target is + * being reintegrated), this particular combination happens to trigger + * extra data movement, resulting in two shards moving - one moving back + * to the reintegrated target, and one moving between two otherwise + * healthy targets because of the retry/collision mechanism of the jump + * map algorithm. + * + * XXX This will likely break if the jump consistent hashing algorithm + * is changed. It's just fortunate we happened to trigger this somewhat + * rare case here. If you are reading this later and you find this + * assert triggering because the value is 1 instead of 2, likely the + * placement algorithm was modified so that this test no longer hits + * this corner case. 
+ */ + assert_int_equal(2, ctx.reint.out_nr); + jtc_assert_rebuild_reint_new(ctx, 2, 0, 2, 0); /* Take second downed target up */ jtc_set_status_on_target(&ctx, UP, orig_shard_targets[1]); @@ -1404,8 +1418,6 @@ down_up_sequences1(void **state) jtc_set_status_on_target(&ctx, UP, shard_target_2); jtc_assert_scan_and_layout(&ctx); - jtc_fini(&ctx); - skip_msg("Investigation into DAOS-6519 is similar/same issue."); is_true(jtc_has_shard_moving_to_target(&ctx, 0, shard_target_2)); jtc_set_status_on_target(&ctx, UP, shard_target_1); @@ -1446,8 +1458,6 @@ drain_all_with_extra_domains(void **state) */ assert_int_equal(8, jtc_get_layout_target_count(&ctx)); - jtc_fini(&ctx); - skip_msg("DAOS-6300 - too many are marked as rebuild"); assert_int_equal(4, jtc_get_layout_rebuild_count(&ctx)); for (i = 0; i < shards_nr; i++) { is_true(jtc_has_shard_with_target_rebuilding(&ctx, i, NULL)); @@ -1478,8 +1488,6 @@ drain_all_with_enough_targets(void **state) * rebuilding and one not */ for (i = 0; i < shards_nr; i++) { - jtc_fini(&ctx); - skip_msg("DAOS-6300 - Not drained to other target?"); assert_int_equal(0, jtc_get_layout_bad_count(&ctx)); is_true(jtc_has_shard_with_target_rebuilding(&ctx, i, NULL)); is_true(jtc_has_shard_with_rebuilding_not_set(&ctx, i)); @@ -1510,8 +1518,6 @@ drain_target_same_shard_repeatedly_for_all_shards(void **state) is_true(jtc_has_shard_with_target_rebuilding(&ctx, shard_id, &new_target)); - jtc_fini(&ctx); - skip_msg("DAOS-6300: All are marked as rebuilding"); is_true(jtc_has_shard_target_not_rebuilding(&ctx, shard_id, target)); @@ -1564,8 +1570,6 @@ one_server_is_added(void **state) assert_int_equal(0, ctx.rebuild.out_nr); assert_int_equal(0, ctx.reint.out_nr); - jtc_fini(&ctx); - skip_msg("DAOS-6303 - should have targets marked as rebuild"); assert_int_equal(ctx.new.out_nr, jtc_get_layout_rebuild_count(&ctx)); jtc_fini(&ctx); @@ -1573,11 +1577,118 @@ one_server_is_added(void **state) /* * ------------------------------------------------ - * Leave in multiple states at same time + * Leave in multiple states at same time (no addition) * ------------------------------------------------ */ static void placement_handles_multiple_states(void **state) +{ + struct jm_test_ctx ctx; + int ver_after_reint; + int ver_after_fail; + int ver_after_drain; + int ver_after_reint_complete; + uint32_t reint_tgt_id; + uint32_t fail_tgt_id; + uint32_t rebuilding; + + jtc_init_with_layout(&ctx, 4, 1, 8, OC_RP_3G1, g_verbose); + + /* first shard goes down, rebuilt, then reintegrated */ + jtc_set_status_on_shard_target(&ctx, DOWN, 0); + jtc_set_status_on_shard_target(&ctx, DOWNOUT, 0); + jtc_set_status_on_shard_target(&ctx, UP, 0); + reint_tgt_id = jtc_layout_shard_tgt(&ctx, 0); + assert_success(jtc_create_layout(&ctx)); + + rebuilding = jtc_get_layout_rebuild_count(&ctx); + /* One thing reintegrating */ + assert_int_equal(1, rebuilding); + + /* + * Reintegration is now in progress. 
Grab the version from here + * for find reint count + */ + ver_after_reint = ctx.ver; + + /* second shard goes down */ + jtc_set_status_on_shard_target(&ctx, DOWN, 1); + fail_tgt_id = jtc_layout_shard_tgt(&ctx, 1); + assert_success(jtc_create_layout(&ctx)); + + ver_after_fail = ctx.ver; + + rebuilding = jtc_get_layout_rebuild_count(&ctx); + /* One reintegrating plus one failure recovery */ + assert_int_equal(2, rebuilding); + + /* third shard is queued for drain */ + jtc_set_status_on_shard_target(&ctx, DRAIN, 2); + assert_success(jtc_create_layout(&ctx)); + + /* + * Reintegration is still running, but these other operations have + * happened too and are now queued. + */ + ver_after_drain = ctx.ver; + + is_false(jtc_layout_has_duplicate(&ctx)); + + /* + * Compute placement in this state. All three shards should + * be moving around + */ + jtc_scan(&ctx); + rebuilding = jtc_get_layout_rebuild_count(&ctx); + assert_int_equal(3, rebuilding); + + /* + * Compute find_reint() using the correct version of rebuild which + * would have launched when reintegration started + * + * find_reint() should only be finding the one thing to move at this + * version + */ + ctx.ver = ver_after_reint; + jtc_scan(&ctx); + assert_int_equal(ctx.reint.out_nr, 1); + + /* Complete the reintegration */ + ctx.ver = ver_after_drain; /* Restore the version first */ + jtc_set_status_on_target(&ctx, UPIN, reint_tgt_id); + ver_after_reint_complete = ctx.ver; + + /* This would start processing the failure - so check that it'd just + * move one thing + */ + ctx.ver = ver_after_fail; + jtc_scan(&ctx); + assert_int_equal(ctx.rebuild.out_nr, 1); + + /* Complete the rebuild */ + ctx.ver = ver_after_reint_complete; /* Restore the version first */ + jtc_set_status_on_target(&ctx, DOWNOUT, fail_tgt_id); + + /* This would start processing the drain - so check that it'd just + * move one thing + */ + ctx.ver = ver_after_drain; + jtc_scan(&ctx); + assert_int_equal(ctx.rebuild.out_nr, 1); + + /* Remainder is simple / out of scope for this test */ + + jtc_fini(&ctx); +} + + +/* + * ------------------------------------------------ + * Leave in multiple states at same time (including addition) + * ------------------------------------------------ + */ +static void +placement_handles_multiple_states_with_addition(void **state) { struct jm_test_ctx ctx; @@ -1773,8 +1884,10 @@ static const struct CMUnitTest tests[] = { "data movement to the new server", one_server_is_added), /* Multiple */ - T("Placement can handle multiple states", + T("Placement can handle multiple states (excluding addition)", placement_handles_multiple_states), + T("Placement can handle multiple states (including addition)", + placement_handles_multiple_states_with_addition), /* Non-standard system setups*/ T("Non-standard system configurations. 
All healthy", unbalanced_config), diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 704d85fd192..aba3032f698 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -525,7 +525,8 @@ rebuild_obj_scan_cb(daos_handle_t ch, vos_iter_entry_t *ent, D_GOTO(out, rc); still_needed = pl_obj_layout_contains(rpt->rt_pool->sp_map, - layout, myrank, mytarget); + layout, myrank, mytarget, + oid.id_shard); if (!still_needed) { struct rebuild_pool_tls *tls; diff --git a/src/tests/suite/daos_rebuild_simple.c b/src/tests/suite/daos_rebuild_simple.c index ef23f28c3c6..8d7e77f5944 100644 --- a/src/tests/suite/daos_rebuild_simple.c +++ b/src/tests/suite/daos_rebuild_simple.c @@ -914,8 +914,6 @@ rebuild_full_shards(void **state) struct ioreq req; int i; - skip(); /** DAOS-5758 */ - if (!test_runable(arg, 4)) return; From 6188da127179e761b45f6dc4cabb8e0f592a248f Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Wed, 24 Mar 2021 09:26:39 -0400 Subject: [PATCH 25/37] DAOS-6923 test: Merge with Di's branch. Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 2 -- src/tests/ftest/util/osa_utils.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 3c7a3ba38aa..3c36cb53c72 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -9,7 +9,6 @@ from daos_utils import DaosCommand from test_utils_pool import TestPool from write_host_file import write_host_file -from apricot import skipForTicket class OSAOfflineReintegration(OSAUtils): @@ -185,7 +184,6 @@ def test_osa_offline_reintegrate_during_rebuild(self): '/run/rebuild/*') self.run_offline_reintegration_test(1, data=True) - @skipForTicket("DAOS-6925") def test_osa_offline_reintegration_oclass(self): """Test ID: DAOS-6923 Test Description: Validate Offline Reintegration diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index fe76c51cde4..7a6e177450a 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -312,7 +312,7 @@ def run_ior_thread(self, action, oclass, test, def ior_thread(self, pool, oclass, test, flags, single_cont_read=True, fail_on_warning=True): - """Start threads and wait until all threads are finished. + """Start an IOR thread. 
Args: pool (object): pool handle From 49830190389ed07b3968b71933f8b7f48efba595 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Wed, 24 Mar 2021 17:39:10 -0400 Subject: [PATCH 26/37] DAOS-6923 test: Add skipForTicket-daos cont check Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- .../ftest/osa/osa_offline_reintegration.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 3c36cb53c72..53df10b3abf 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -9,6 +9,7 @@ from daos_utils import DaosCommand from test_utils_pool import TestPool from write_host_file import write_host_file +from apricot import skipForTicket class OSAOfflineReintegration(OSAUtils): @@ -89,14 +90,14 @@ def run_offline_reintegration_test(self, num_pool, data=False, output = self.dmg_command.pool_exclude(self.pool.uuid, rank[val]) # Check the IOR data after exclude - if data: + if data and (val == 0): self.run_ior_thread("Read", oclass, test_seq) else: output = self.dmg_command.system_stop(ranks=rank[val], force=True) self.print_and_assert_on_rebuild_failure(output) # Check the IOR data after system stop - if data: + if data and (val == 0): self.run_ior_thread("Read", oclass, test_seq) output = self.dmg_command.system_start(ranks=rank[val]) # Just try to reintegrate rank 5 @@ -142,11 +143,12 @@ def run_offline_reintegration_test(self, num_pool, data=False, if data: self.run_ior_thread("Read", oclass, test_seq) self.run_mdtest_thread() - self.container = self.pool_cont_dict[self.pool][0] - kwargs = {"pool": self.pool.uuid, - "cont": self.container.uuid} - output = self.daos_command.container_check(**kwargs) - self.log.info(output) + if self.test_during_rebuild is True: + self.container = self.pool_cont_dict[self.pool][0] + kwargs = {"pool": self.pool.uuid, + "cont": self.container.uuid} + output = self.daos_command.container_check(**kwargs) + self.log.info(output) def test_osa_offline_reintegration_multiple_pools(self): """Test ID: DAOS-6923 @@ -169,6 +171,7 @@ def test_osa_offline_reintegration_server_stop(self): """ self.run_offline_reintegration_test(1, data=True, server_boot=True) + @skipForTicket("DAOS-7042") def test_osa_offline_reintegrate_during_rebuild(self): """Test ID: DAOS-6923 Test Description: Reintegrate rank while rebuild From 2e6461a8da5e6cdcc41b76f83cff99f46fd6b090 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Wed, 24 Mar 2021 20:09:10 -0400 Subject: [PATCH 27/37] DAOS-6923 test: Run all the tests including weekly Test-tag-hw-medium: hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 53df10b3abf..6e58041c90e 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -45,7 +45,7 @@ def run_offline_reintegration_test(self, num_pool, data=False, data (bool) : whether pool has no data or to create some data in pool. Defaults to False. server_boot (bool) : Perform system stop/start on a rank. - Defults to False. + Defaults to False. 
oclass (str) : daos object class string (eg: "RP_2G8") """ # Create a pool From 9fe0d4b71447cd28ccb3d2c899ec8cf1b6ef4dcb Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Thu, 25 Mar 2021 13:20:56 -0400 Subject: [PATCH 28/37] DAOS-6923 test: Offline reintegration no checksum Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- .../ftest/osa/osa_offline_reintegration.py | 13 ++++++ .../ftest/osa/osa_offline_reintegration.yaml | 2 + src/tests/ftest/util/osa_utils.py | 43 +++++++++++++++++-- 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 6e58041c90e..e764396afda 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -150,6 +150,19 @@ def run_offline_reintegration_test(self, num_pool, data=False, output = self.daos_command.container_check(**kwargs) self.log.info(output) + def test_osa_offline_reintegration_without_checksum(self): + """Test ID: DAOS-6923 + Test Description: Validate Offline Reintegration + without enabling checksum in container properties. + + :avocado: tags=all,pr,daily_regression,hw,medium,ib2 + :avocado: tags=osa,offline_reintegration + :avocado: tags=offline_reintegration_without_csum + """ + self.test_with_checksum = self.params.get("checksum", + '/run/test_with_checksum/*') + self.run_offline_reintegration_test(1, data=True) + def test_osa_offline_reintegration_multiple_pools(self): """Test ID: DAOS-6923 Test Description: Validate Offline Reintegration diff --git a/src/tests/ftest/osa/osa_offline_reintegration.yaml b/src/tests/ftest/osa/osa_offline_reintegration.yaml index fe52612e1b7..eb9a6b6f58c 100644 --- a/src/tests/ftest/osa/osa_offline_reintegration.yaml +++ b/src/tests/ftest/osa/osa_offline_reintegration.yaml @@ -107,3 +107,5 @@ aggregation: test_with_aggregation: True rebuild: test_with_rebuild: True +checksum: + test_with_checksum: False \ No newline at end of file diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index 7a6e177450a..d1d56b319c6 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -48,6 +48,7 @@ def setUp(self): self.dmg_command.exit_status_exception = False self.test_during_aggregation = False self.test_during_rebuild = False + self.test_with_checksum = True @fail_on(CommandFailure) def get_pool_leader(self): @@ -224,6 +225,9 @@ def prepare_cont_ior_write_read(self, oclass, flags): if self.pool_cont_dict[self.pool][0] is None: self.add_container(self.pool, create=False) self.set_cont_class_properties(oclass) + if self.test_with_checksum is False: + rf_value = "rf:{}".format(self.get_object_replica_value - 1) + self.update_cont_properties(rf_value) self.container.create() self.pool_cont_dict[self.pool][0] = self.container self.pool_cont_dict[self.pool][1] = "Updated" @@ -235,6 +239,10 @@ def prepare_cont_ior_write_read(self, oclass, flags): # Write to the second container self.add_container(self.pool, create=False) self.set_cont_class_properties(oclass) + if self.test_with_checksum is False: + rf_value = "rf:{}".format( + self.get_object_replica_value - 1) + self.update_cont_properties(rf_value) self.container.create() self.pool_cont_dict[self.pool][2] = self.container self.pool_cont_dict[self.pool][3] = "Updated" @@ -254,9 +262,35 @@ def delete_extra_container(self, pool): extra_container.destroy() 
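        # pool_cont_dict[pool] tracks [cont, cont_status, extra_cont,
        # extra_cont_status]; the extra container has just been
        # destroyed, so drop its "Updated" marker as well.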
self.pool_cont_dict[pool][3] = None + def get_object_replica_value(self, oclass): + """ Get the object replica value for an object class. + + Args: + oclass (str): Object Class (eg: RP_2G1,etc) + + Returns: + value (int) : Object replica value + """ + value = 0 + if "_" in oclass: + replica_list = oclass.split("_") + value = replica_list[1][0] + else: + self.log.info("Wrong Object Class. Cannot split") + return int(value) + + def update_cont_properties(self, cont_prop): + """Update the existing container properties. + Args: + cont_prop (str): Replace existing cotainer properties + with new value + """ + self.container.properties.value = cont_prop + def set_cont_class_properties(self, oclass="S1"): """Update the container class to match the IOR object - class. Also, remove the redundancy factor for S type + class. Fix the rf factor based on object replica value. + Also, remove the redundancy factor for S type object class. Args: oclass (str, optional): Container object class to be set. @@ -266,10 +300,13 @@ def set_cont_class_properties(self, oclass="S1"): # Set the container properties properly for S!, S2 class. # rf should not be set to 1 for S type object class. x = re.search("^S\\d$", oclass) + prop = self.container.properties.value if x is not None: - prop = self.container.properties.value prop = prop.replace("rf:1", "rf:0") - self.container.properties.value = prop + else: + rf_value = "rf:{}".format(self.get_object_replica_value - 1) + prop = prop.replace("rf:1", rf_value) + self.container.properties.value = prop def run_ior_thread(self, action, oclass, test, single_cont_read=True, From 1ccb85e33f5af0e8095cdf6e7cdbf00f492aa878 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Thu, 25 Mar 2021 13:31:10 -0400 Subject: [PATCH 29/37] DAOS-6923 test: Fix spell check checkpatch issue. Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/util/osa_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index d1d56b319c6..aa6fab5512b 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -282,7 +282,7 @@ def get_object_replica_value(self, oclass): def update_cont_properties(self, cont_prop): """Update the existing container properties. Args: - cont_prop (str): Replace existing cotainer properties + cont_prop (str): Replace existing container properties with new value """ self.container.properties.value = cont_prop From aa50cc51d20d5748fd39e32004fc58e564178b74 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Thu, 25 Mar 2021 13:58:27 -0400 Subject: [PATCH 30/37] DAOS-6923 test: skipforTicket DAOS-6807 Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index e764396afda..414c691c7b4 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -174,6 +174,7 @@ def test_osa_offline_reintegration_multiple_pools(self): """ self.run_offline_reintegration_test(5, data=True) + @skipForTicket("DAOS-6807") def test_osa_offline_reintegration_server_stop(self): """Test ID: DAOS-6748. 
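A note on the redundancy-factor handling added above: the osa_utils helpers
read the replica count out of the object class name and then set the
container property to one less than that. A standalone sketch of the same
mapping (hypothetical helper, not the harness code itself):

    import re

    def replica_count(oclass):
        """Replica count encoded in an object class name: "RP_2G1" -> 2.

        Mirrors get_object_replica_value(): names without an underscore
        (for example "S1") carry no replica digit and yield 0.
        """
        match = re.match(r"^[A-Z]+_(\d)", oclass)
        return int(match.group(1)) if match else 0

    # The container redundancy factor is replicas - 1, so RP_2G1 maps to
    # "rf:1" and RP_3G8 maps to "rf:2".
    assert "rf:{}".format(replica_count("RP_2G1") - 1) == "rf:1"
    assert "rf:{}".format(replica_count("RP_3G8") - 1) == "rf:2"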
From f11419b8b63a03c8ffd64be2651ac289dc17b7bc Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Fri, 26 Mar 2021 00:02:04 -0400 Subject: [PATCH 31/37] DAOS-6923 test: Testing without enabling checksum Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 4 ++-- src/tests/ftest/util/osa_utils.py | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 414c691c7b4..0b965496b49 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -159,8 +159,8 @@ def test_osa_offline_reintegration_without_checksum(self): :avocado: tags=osa,offline_reintegration :avocado: tags=offline_reintegration_without_csum """ - self.test_with_checksum = self.params.get("checksum", - '/run/test_with_checksum/*') + self.test_with_checksum = self.params.get("test_with_checksum", + '/run/checksum/*') self.run_offline_reintegration_test(1, data=True) def test_osa_offline_reintegration_multiple_pools(self): diff --git a/src/tests/ftest/util/osa_utils.py b/src/tests/ftest/util/osa_utils.py index aa6fab5512b..83997c047c8 100644 --- a/src/tests/ftest/util/osa_utils.py +++ b/src/tests/ftest/util/osa_utils.py @@ -226,7 +226,8 @@ def prepare_cont_ior_write_read(self, oclass, flags): self.add_container(self.pool, create=False) self.set_cont_class_properties(oclass) if self.test_with_checksum is False: - rf_value = "rf:{}".format(self.get_object_replica_value - 1) + tmp = self.get_object_replica_value(oclass) + rf_value = "rf:{}".format(tmp - 1) self.update_cont_properties(rf_value) self.container.create() self.pool_cont_dict[self.pool][0] = self.container @@ -240,8 +241,8 @@ def prepare_cont_ior_write_read(self, oclass, flags): self.add_container(self.pool, create=False) self.set_cont_class_properties(oclass) if self.test_with_checksum is False: - rf_value = "rf:{}".format( - self.get_object_replica_value - 1) + tmp = self.get_object_replica_value(oclass) + rf_value = "rf:{}".format(tmp - 1) self.update_cont_properties(rf_value) self.container.create() self.pool_cont_dict[self.pool][2] = self.container @@ -304,7 +305,8 @@ def set_cont_class_properties(self, oclass="S1"): if x is not None: prop = prop.replace("rf:1", "rf:0") else: - rf_value = "rf:{}".format(self.get_object_replica_value - 1) + tmp = self.get_object_replica_value(oclass) + rf_value = "rf:{}".format(tmp - 1) prop = prop.replace("rf:1", rf_value) self.container.properties.value = prop @@ -393,6 +395,10 @@ def run_mdtest_thread(self): if self.container is None: self.add_container(self.pool, create=False) self.set_cont_class_properties(self.mdtest_cmd.dfs_oclass) + if self.test_with_checksum is False: + tmp = self.get_object_replica_value(self.mdtest_cmd.dfs_oclass) + rf_value = "rf:{}".format(tmp - 1) + self.update_cont_properties(rf_value) self.container.create() job_manager = self.get_mdtest_job_manager_command(self.manager) job_manager.job.dfs_cont.update(self.container.uuid) From ff2148dfbff57a514c802b5a8606e1b2c2e34177 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Fri, 26 Mar 2021 00:29:40 -0400 Subject: [PATCH 32/37] DAOS-6923 test: Perform IOR read after excludes Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 0b965496b49..eaf4859914d 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -90,7 +90,7 @@ def run_offline_reintegration_test(self, num_pool, data=False, output = self.dmg_command.pool_exclude(self.pool.uuid, rank[val]) # Check the IOR data after exclude - if data and (val == 0): + if data: self.run_ior_thread("Read", oclass, test_seq) else: output = self.dmg_command.system_stop(ranks=rank[val], From 8349efaa14711196583a1434a0a7553f637871ec Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Sun, 28 Mar 2021 10:46:08 -0400 Subject: [PATCH 33/37] DAOS-6923 test: Enable daos cont check Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index eaf4859914d..0b0eb8684ba 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -185,7 +185,6 @@ def test_osa_offline_reintegration_server_stop(self): """ self.run_offline_reintegration_test(1, data=True, server_boot=True) - @skipForTicket("DAOS-7042") def test_osa_offline_reintegrate_during_rebuild(self): """Test ID: DAOS-6923 Test Description: Reintegrate rank while rebuild From 3787a556625874a7bc4090f449f0c16e02a9ae0b Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Sun, 28 Mar 2021 19:56:03 -0400 Subject: [PATCH 34/37] DAOS-6923 test: Seeing md_test failures. Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/util/mdtest_test_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/util/mdtest_test_base.py b/src/tests/ftest/util/mdtest_test_base.py index e7efa6c8fb8..029989aa923 100755 --- a/src/tests/ftest/util/mdtest_test_base.py +++ b/src/tests/ftest/util/mdtest_test_base.py @@ -61,12 +61,12 @@ def execute_mdtest(self): # Run Mdtest self.run_mdtest(self.get_mdtest_job_manager_command(self.manager), self.processes) + + self.stop_dfuse() # reset self.container if dfs_destroy is True if self.mdtest_cmd.dfs_destroy is True: self.container = None - self.stop_dfuse() - def get_mdtest_job_manager_command(self, manager): """Get the MPI job manager command for Mdtest. 
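For context on the dfs_destroy handling above: the flag is read from yaml, so
inside the harness it can be True, False, or None when the parameter is not
set, and "is True" versus "is not False" split those three values
differently. A minimal illustration in plain Python (a later patch in this
series switches the check so that None, the unset default, also resets the
container, per its comment "reset self.container if dfs_destroy is True or
None"):

    # dfs_destroy can be True, False, or None (parameter absent in yaml).
    for dfs_destroy in (True, False, None):
        print(dfs_destroy,
              dfs_destroy is True,       # resets only on explicit True
              dfs_destroy is not False)  # also resets when unset (None)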
From de979b5e9209645829f25dc48ca79ec42d76b257 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Sun, 28 Mar 2021 20:04:30 -0400 Subject: [PATCH 35/37] DAOS-6923 test: Enable daos cont check for tests Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/osa/osa_offline_reintegration.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 0b0eb8684ba..3ee27c280db 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -143,12 +143,11 @@ def run_offline_reintegration_test(self, num_pool, data=False, if data: self.run_ior_thread("Read", oclass, test_seq) self.run_mdtest_thread() - if self.test_during_rebuild is True: - self.container = self.pool_cont_dict[self.pool][0] - kwargs = {"pool": self.pool.uuid, - "cont": self.container.uuid} - output = self.daos_command.container_check(**kwargs) - self.log.info(output) + self.container = self.pool_cont_dict[self.pool][0] + kwargs = {"pool": self.pool.uuid, + "cont": self.container.uuid} + output = self.daos_command.container_check(**kwargs) + self.log.info(output) def test_osa_offline_reintegration_without_checksum(self): """Test ID: DAOS-6923 From d17a081cc4d468a8985f6b47ee1680d0b3506129 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Mon, 29 Mar 2021 01:05:56 -0400 Subject: [PATCH 36/37] DAOS-6923 test: Fix mdtest_test_base Test-tag-hw-medium: pr,hw,medium,ib2 osa Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- src/tests/ftest/util/mdtest_test_base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tests/ftest/util/mdtest_test_base.py b/src/tests/ftest/util/mdtest_test_base.py index 029989aa923..4d5f6a633ee 100755 --- a/src/tests/ftest/util/mdtest_test_base.py +++ b/src/tests/ftest/util/mdtest_test_base.py @@ -62,10 +62,10 @@ def execute_mdtest(self): self.run_mdtest(self.get_mdtest_job_manager_command(self.manager), self.processes) - self.stop_dfuse() - # reset self.container if dfs_destroy is True - if self.mdtest_cmd.dfs_destroy is True: + # reset self.container if dfs_destroy is True or None. + if self.mdtest_cmd.dfs_destroy is not False: self.container = None + self.stop_dfuse() def get_mdtest_job_manager_command(self, manager): """Get the MPI job manager command for Mdtest. From 3ef55fe23031e528d19130e760329c7329436841 Mon Sep 17 00:00:00 2001 From: rpadma2 <ravindran.padmanabhan@intel.com> Date: Mon, 29 Mar 2021 10:29:15 -0400 Subject: [PATCH 37/37] DAOS-6923 test: Add log messages Test-tag-hw-medium: pr,hw,medium,ib2 offline_reintegration_daily Signed-off-by: rpadma2 <ravindran.padmanabhan@intel.com> --- .../ftest/osa/osa_offline_reintegration.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/tests/ftest/osa/osa_offline_reintegration.py b/src/tests/ftest/osa/osa_offline_reintegration.py index 3ee27c280db..ccd5f3ceed2 100755 --- a/src/tests/ftest/osa/osa_offline_reintegration.py +++ b/src/tests/ftest/osa/osa_offline_reintegration.py @@ -155,11 +155,12 @@ def test_osa_offline_reintegration_without_checksum(self): without enabling checksum in container properties. 
:avocado: tags=all,pr,daily_regression,hw,medium,ib2 - :avocado: tags=osa,offline_reintegration + :avocado: tags=osa,offline_reintegration_daily :avocado: tags=offline_reintegration_without_csum """ self.test_with_checksum = self.params.get("test_with_checksum", '/run/checksum/*') + self.log.info("Offline Reintegration : Without Checksum") self.run_offline_reintegration_test(1, data=True) def test_osa_offline_reintegration_multiple_pools(self): @@ -168,9 +169,10 @@ def test_osa_offline_reintegration_multiple_pools(self): with multiple pools :avocado: tags=all,daily_regression,hw,medium,ib2 - :avocado: tags=osa,offline_reintegration + :avocado: tags=osa,offline_reintegration_daily :avocado: tags=offline_reintegration_multiple_pools """ + self.log.info("Offline Reintegration : Multiple Pools") self.run_offline_reintegration_test(5, data=True) @skipForTicket("DAOS-6807") @@ -179,9 +181,10 @@ def test_osa_offline_reintegration_server_stop(self): Test Description: Validate Offline Reintegration with server stop :avocado: tags=all,pr,daily_regression,hw,medium,ib2 - :avocado: tags=osa,offline_reintegration + :avocado: tags=osa,offline_reintegration_daily :avocado: tags=offline_reintegration_srv_stop """ + self.log.info("Offline Reintegration : System Start/Stop") self.run_offline_reintegration_test(1, data=True, server_boot=True) def test_osa_offline_reintegrate_during_rebuild(self): @@ -190,13 +193,14 @@ def test_osa_offline_reintegrate_during_rebuild(self): is happening in parallel :avocado: tags=all,full_regression,hw,medium,ib2 - :avocado: tags=osa,offline_reintegration + :avocado: tags=osa,offline_reintegration_full :avocado: tags=offline_reintegrate_during_rebuild """ self.loop_test_cnt = self.params.get("iterations", '/run/loop_test/*') self.test_during_rebuild = self.params.get("test_with_rebuild", '/run/rebuild/*') + self.log.info("Offline Reintegration : Rebuild") self.run_offline_reintegration_test(1, data=True) def test_osa_offline_reintegration_oclass(self): @@ -205,9 +209,10 @@ def test_osa_offline_reintegration_oclass(self): with different object class :avocado: tags=all,full_regression,hw,medium,ib2 - :avocado: tags=osa,offline_reintegration + :avocado: tags=osa,offline_reintegration_full :avocado: tags=offline_reintegration_oclass """ + self.log.info("Offline Reintegration : Object Class") for oclass in self.test_oclass: self.run_offline_reintegration_test(1, data=True, server_boot=False, @@ -219,9 +224,10 @@ def test_osa_offline_reintegrate_during_aggregation(self): is happening in parallel :avocado: tags=all,full_regression,hw,medium,ib2 - :avocado: tags=osa,offline_reintegration + :avocado: tags=osa,offline_reintegration_full :avocado: tags=offline_reintegrate_during_aggregation """ self.test_during_aggregation = self.params.get("test_with_aggregation", '/run/aggregation/*') + self.log.info("Offline Reintegration : Aggregation") self.run_offline_reintegration_test(1, data=True)
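
With the tag rework in this final patch, the daily variants carry
offline_reintegration_daily and the weekly ones offline_reintegration_full,
so each set can be selected independently. A hypothetical invocation using
stock avocado tag filtering (the ftest launch wrapper is the usual entry
point; the flag is shown only to illustrate the tag split):

    avocado run src/tests/ftest/osa/osa_offline_reintegration.py \
        --filter-by-tags=offline_reintegration_daily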