From d2197e075d9e9f8692c0c802fb64d02502c81b2b Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Wed, 2 Nov 2022 14:14:20 -0600 Subject: [PATCH 1/6] add GPU_TYPE and GPU_OFFLOAD variables --- CIME/XML/env_mach_specific.py | 3 ++- CIME/build.py | 14 ++++++++++++++ CIME/data/config/xml_schemas/config_machines.xsd | 4 ++++ CIME/data/config/xml_schemas/env_mach_specific.xsd | 4 ++++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/CIME/XML/env_mach_specific.py b/CIME/XML/env_mach_specific.py index 5e75efbfb06..46e99d71e2d 100644 --- a/CIME/XML/env_mach_specific.py +++ b/CIME/XML/env_mach_specific.py @@ -320,7 +320,8 @@ def _compute_resource_actions(self, resource_nodes, case, job=None): def _compute_actions(self, nodes, child_tag, case, job=None): result = [] # list of tuples ("name", "argument") - compiler, mpilib = case.get_value("COMPILER"), case.get_value("MPILIB") + compiler = case.get_value("COMPILER") + mpilib = case.get_value("MPILIB") for node in nodes: if self._match_attribs(self.attrib(node), case, job=job): diff --git a/CIME/build.py b/CIME/build.py index 24a50e332a4..1f9c19e0843 100644 --- a/CIME/build.py +++ b/CIME/build.py @@ -239,6 +239,20 @@ def get_standard_cmake_args(case, sharedpath, shared_lib=False): cmake_args += " -Dcompile_threaded={} ".format( stringify_bool(case.get_build_threaded()) ) + gpu_type = case.get_value("GPU_TYPE") + gpu_offload = case.get_value("GPU_OFFLOAD") + + if gpu_type != "none": + expect( + gpu_offload != "none", + "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is", + ) + cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}" + else: + expect( + gpu_offload == "none", + "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is", + ) ocn_model = case.get_value("COMP_OCN") atm_model = case.get_value("COMP_ATM") diff --git a/CIME/data/config/xml_schemas/config_machines.xsd b/CIME/data/config/xml_schemas/config_machines.xsd index 72f4be2491e..23be38304b8 100644 --- a/CIME/data/config/xml_schemas/config_machines.xsd +++ b/CIME/data/config/xml_schemas/config_machines.xsd @@ -6,6 +6,8 @@ + + @@ -248,6 +250,8 @@ + + diff --git a/CIME/data/config/xml_schemas/env_mach_specific.xsd b/CIME/data/config/xml_schemas/env_mach_specific.xsd index ea169205bfe..c777188422d 100644 --- a/CIME/data/config/xml_schemas/env_mach_specific.xsd +++ b/CIME/data/config/xml_schemas/env_mach_specific.xsd @@ -9,6 +9,8 @@ + + @@ -102,6 +104,8 @@ + + From 915166797c85a96ea27370180c2671c5c92042be Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Thu, 3 Nov 2022 07:09:19 -0600 Subject: [PATCH 2/6] do not use none in gpu_type --- CIME/Tools/Makefile | 4 +++- CIME/XML/env_batch.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CIME/Tools/Makefile b/CIME/Tools/Makefile index 8cf5ba1104e..b1cb5d95523 100644 --- a/CIME/Tools/Makefile +++ b/CIME/Tools/Makefile @@ -613,7 +613,9 @@ endif # Remove arch flag if it exists F90_LDFLAGS := $(filter-out -arch%,$(LDFLAGS)) - +ifdef GPUFLAGS + F90_LDFLAGS += $(GPUFLAGS) +endif # Machine stuff to appear last on the link step ifndef MLIBS MLIBS := diff --git a/CIME/XML/env_batch.py b/CIME/XML/env_batch.py index de051bd1d47..31fa64b1ba8 100644 --- a/CIME/XML/env_batch.py +++ b/CIME/XML/env_batch.py @@ -620,6 +620,9 @@ def _process_args(self, case, submit_arg_nodes, job): if name: if "$" in name: rflag = self._resolve_argument(case, flag, name, job) + # This is to prevent -gpu_type=none in qsub args + if rflag.endswith("=none"): + continue if len(rflag) > len(flag): submitargs += " {}".format(rflag) else: From 2cb56df753b8eb58e60ca0875f78cba67bdb096a Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Thu, 3 Nov 2022 08:10:36 -0600 Subject: [PATCH 3/6] add gpu_type and gpu_offload options to create_newcase --- CIME/case/case.py | 56 +++++++++++++++------------------- CIME/scripts/create_newcase.py | 18 +++++++++++ CIME/test_scheduler.py | 9 ++++-- 3 files changed, 50 insertions(+), 33 deletions(-) diff --git a/CIME/case/case.py b/CIME/case/case.py index ff0a3ec85ea..5e0b82138cb 100644 --- a/CIME/case/case.py +++ b/CIME/case/case.py @@ -1261,6 +1261,8 @@ def configure( extra_machines_dir=None, case_group=None, ngpus_per_node=0, + gpu_type=None, + gpu_offload=None, ): expect( @@ -1509,47 +1511,35 @@ def configure( # ---------------------------------------------------------------------------------------------------------- # Sanity check: - # 1. We assume that there is always a string "gpu" in the compiler name if we want to enable GPU - # 2. For compilers without the string "gpu" in the name: - # 2.1. the ngpus-per-node argument would not update the NGPUS_PER_NODE XML variable, as long as - # the MAX_GPUS_PER_NODE XML variable is not defined (i.e., this argument is not in effect). - # 2.2. if the MAX_GPUS_PER_NODE XML variable is defined, then the ngpus-per-node argument - # must be set to 0. Otherwise, an error will be triggered. - # 3. For compilers with the string "gpu" in the name: - # 3.1. if ngpus-per-node argument is smaller than 0, an error will be triggered. - # 3.2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE + # 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS + # 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE # XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically. - # 3.3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically. + # 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically. # ---------------------------------------------------------------------------------------------------------- max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE") - if max_gpus_per_node: - if "gpu" in compiler: - if not ngpus_per_node: - ngpus_per_node = 1 - logger.warning( - "Setting ngpus_per_node to 1 for compiler {}".format(compiler) - ) - expect( - ngpus_per_node > 0, - " ngpus_per_node is expected > 0 for compiler {}; current value is {}".format( - compiler, ngpus_per_node - ), - ) - else: - expect( - ngpus_per_node == 0, - " ngpus_per_node is expected = 0 for compiler {}; current value is {}".format( - compiler, ngpus_per_node - ), - ) + if gpu_type: + expect(max_gpus_per_node, "GPUS are not defined for this machine") + expect( + gpu_offload, + "Both gpu-type and gpu-offload must be defined if either is defined", + ) + self.set_value("GPU_TYPE", gpu_type) + self.set_value("GPU_OFFLOAD", gpu_offload) + if ngpus_per_node >= 0: self.set_value( "NGPUS_PER_NODE", - ngpus_per_node + max(1, ngpus_per_node) if ngpus_per_node <= max_gpus_per_node else max_gpus_per_node, ) + elif gpu_offload: + expect( + False, + "Both gpu-type and gpu-offload must be defined if either is defined", + ) + self.initialize_derived_attributes() # -------------------------------------------- @@ -2308,6 +2298,8 @@ def create( extra_machines_dir=None, case_group=None, ngpus_per_node=0, + gpu_type=None, + gpu_offload=None, ): try: # Set values for env_case.xml @@ -2381,6 +2373,8 @@ def create( extra_machines_dir=extra_machines_dir, case_group=case_group, ngpus_per_node=ngpus_per_node, + gpu_type=gpu_type, + gpu_offload=gpu_offload, ) self.create_caseroot() diff --git a/CIME/scripts/create_newcase.py b/CIME/scripts/create_newcase.py index 3faea5d6553..76d750eb23b 100755 --- a/CIME/scripts/create_newcase.py +++ b/CIME/scripts/create_newcase.py @@ -269,6 +269,18 @@ def parse_command_line(args, cimeroot, description): help="Specify number of GPUs used for simulation. ", ) + parser.add_argument( + "--gpu-type", + default=None, + help="Specify type of GPU hardware - currently supported are v100, a100, mi250", + ) + + parser.add_argument( + "--gpu-offload", + default=None, + help="Specify gpu offload method - currently supported are OpenACC, OpenMP", + ) + args = CIME.utils.parse_args_and_handle_standard_logging_options(args, parser) if args.srcroot is not None: @@ -345,6 +357,8 @@ def parse_command_line(args, cimeroot, description): args.extra_machines_dir, args.case_group, args.ngpus_per_node, + args.gpu_type, + args.gpu_offload, ) @@ -382,6 +396,8 @@ def _main_func(description=None): extra_machines_dir, case_group, ngpus_per_node, + gpu_type, + gpu_offload, ) = parse_command_line(sys.argv, cimeroot, description) if script_root is None: @@ -447,6 +463,8 @@ def _main_func(description=None): extra_machines_dir=extra_machines_dir, case_group=case_group, ngpus_per_node=ngpus_per_node, + gpu_type=gpu_type, + gpu_offload=gpu_offload, ) # Called after create since casedir does not exist yet diff --git a/CIME/test_scheduler.py b/CIME/test_scheduler.py index d46d9e0d3b1..5d857f4bedf 100644 --- a/CIME/test_scheduler.py +++ b/CIME/test_scheduler.py @@ -661,8 +661,13 @@ def _create_newcase_phase(self, test): pesize = case_opt[1:] create_newcase_cmd += " --pecount {}".format(pesize) elif case_opt.startswith("G"): - ngpus_per_node = case_opt[1:] - create_newcase_cmd += " --ngpus-per-node {}".format(ngpus_per_node) + if "-" in case_opt: + ngpus_per_node, gpu_type, gpu_offload = case_opt[1:].split("-") + else: + error = "GPU test argument format is ngpus_per_node-gpu_type-gpu_offload" + self._log_output(test, error) + return False, error + create_newcase_cmd += f" --ngpus-per-node {ngpus_per_node} --gpu-type {gpu_type} --gpu-offload {gpu_offload}" elif case_opt.startswith("V"): self._cime_driver = case_opt[1:] create_newcase_cmd += " --driver {}".format(self._cime_driver) From b03129b78724eed9773b7871597c998365cc8ebf Mon Sep 17 00:00:00 2001 From: James Edwards Date: Thu, 3 Nov 2022 08:57:58 -0600 Subject: [PATCH 4/6] fix unit test --- CIME/tests/test_unit_case.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CIME/tests/test_unit_case.py b/CIME/tests/test_unit_case.py index dd2cd7cb439..8a320b98365 100755 --- a/CIME/tests/test_unit_case.py +++ b/CIME/tests/test_unit_case.py @@ -182,6 +182,8 @@ def test_copy( extra_machines_dir=None, case_group=None, ngpus_per_node=0, + gpu_type=None, + gpu_offload=None, ) create_caseroot.assert_called() apply_user_mods.assert_called() From c1d60ba301efe8fe8ad8b637d1b818d440726e42 Mon Sep 17 00:00:00 2001 From: James Edwards Date: Thu, 3 Nov 2022 09:05:17 -0600 Subject: [PATCH 5/6] one more unit test fix --- CIME/tests/test_unit_case.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CIME/tests/test_unit_case.py b/CIME/tests/test_unit_case.py index 8a320b98365..163a75df03d 100755 --- a/CIME/tests/test_unit_case.py +++ b/CIME/tests/test_unit_case.py @@ -259,6 +259,8 @@ def test_create( extra_machines_dir=None, case_group=None, ngpus_per_node=0, + gpu_type=None, + gpu_offload=None, ) create_caseroot.assert_called() apply_user_mods.assert_called() From 3f4b1ab60532578c079af2a358126cf0fe841de1 Mon Sep 17 00:00:00 2001 From: James Edwards Date: Thu, 3 Nov 2022 15:02:47 -0600 Subject: [PATCH 6/6] improve gpu functionality --- CIME/case/case.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/CIME/case/case.py b/CIME/case/case.py index 5e0b82138cb..6585bfb16ed 100644 --- a/CIME/case/case.py +++ b/CIME/case/case.py @@ -123,6 +123,7 @@ def __init__(self, case_root=None, read_only=True, record=False, non_local=False self._env_generic_files = [] self._files = [] self._comp_interface = None + self.gpu_enabled = None self._non_local = non_local self.read_xml() @@ -450,6 +451,11 @@ def get_values(self, item, attribute=None, resolved=True, subgroup=None): return [] def get_value(self, item, attribute=None, resolved=True, subgroup=None): + if item == "GPU_ENABLED": + if self.gpu_enabled == None: + if self.get_value("GPU_TYPE") != "none": + self.gpu_enabled = True + return "true" if self.gpu_enabled else "false" result = None for env_file in self._files: # Wait and resolve in self rather than in env_file @@ -1385,14 +1391,19 @@ def configure( if not dmax: dmax = machobj.get_value(name) if dmax: + print(f"here name is {name} and dmax is {dmax}") self.set_value(name, dmax) elif name == "MAX_GPUS_PER_NODE": logger.debug( - "Variable {} not defined for machine {}".format(name, machine_name) + "Variable {} not defined for machine {} and compiler {}".format( + name, machine_name, compiler + ) ) else: logger.warning( - "Variable {} not defined for machine {}".format(name, machine_name) + "Variable {} not defined for machine {} and compiler {}".format( + name, machine_name, compiler + ) ) machdir = machobj.get_machines_dir() @@ -1518,14 +1529,17 @@ def configure( # ---------------------------------------------------------------------------------------------------------- max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE") if gpu_type: - expect(max_gpus_per_node, "GPUS are not defined for this machine") + expect( + max_gpus_per_node, + f"GPUS are not defined for machine={machine_name} and compiler={compiler}", + ) expect( gpu_offload, "Both gpu-type and gpu-offload must be defined if either is defined", ) self.set_value("GPU_TYPE", gpu_type) self.set_value("GPU_OFFLOAD", gpu_offload) - + self.gpu_enabled = True if ngpus_per_node >= 0: self.set_value( "NGPUS_PER_NODE",