Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gpu improvements #4334

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CIME/Tools/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,9 @@ endif

# Remove arch flag if it exists
F90_LDFLAGS := $(filter-out -arch%,$(LDFLAGS))

ifdef GPUFLAGS
F90_LDFLAGS += $(GPUFLAGS)
endif
# Machine stuff to appear last on the link step
ifndef MLIBS
MLIBS :=
Expand Down
3 changes: 3 additions & 0 deletions CIME/XML/env_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,9 @@ def _process_args(self, case, submit_arg_nodes, job):
if name:
if "$" in name:
rflag = self._resolve_argument(case, flag, name, job)
# This is to prevent -gpu_type=none in qsub args
if rflag.endswith("=none"):
continue
if len(rflag) > len(flag):
submitargs += " {}".format(rflag)
else:
Expand Down
3 changes: 2 additions & 1 deletion CIME/XML/env_mach_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,8 @@ def _compute_resource_actions(self, resource_nodes, case, job=None):

def _compute_actions(self, nodes, child_tag, case, job=None):
result = [] # list of tuples ("name", "argument")
compiler, mpilib = case.get_value("COMPILER"), case.get_value("MPILIB")
compiler = case.get_value("COMPILER")
mpilib = case.get_value("MPILIB")

for node in nodes:
if self._match_attribs(self.attrib(node), case, job=job):
Expand Down
14 changes: 14 additions & 0 deletions CIME/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,20 @@ def get_standard_cmake_args(case, sharedpath, shared_lib=False):
cmake_args += " -Dcompile_threaded={} ".format(
stringify_bool(case.get_build_threaded())
)
gpu_type = case.get_value("GPU_TYPE")
gpu_offload = case.get_value("GPU_OFFLOAD")

if gpu_type != "none":
expect(
gpu_offload != "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Starting a discussion from the E3SM perspective:
Since this mandates GPU_OFFLOAD, we will probably need to add a third option for C++ codebases that use a library approach, because their offload method is neither OpenMP nor OpenACC.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sarats - there is an ongoing discussion in #4334; I will copy this comment there.

)
cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}"
else:
expect(
gpu_offload == "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)

ocn_model = case.get_value("COMP_OCN")
atm_model = case.get_value("COMP_ATM")
Expand Down
56 changes: 25 additions & 31 deletions CIME/case/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -1261,6 +1261,8 @@ def configure(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):

expect(
Expand Down Expand Up @@ -1509,47 +1511,35 @@ def configure(

# ----------------------------------------------------------------------------------------------------------
# Sanity check:
# 1. We assume that there is always a string "gpu" in the compiler name if we want to enable GPU
# 2. For compilers without the string "gpu" in the name:
# 2.1. the ngpus-per-node argument would not update the NGPUS_PER_NODE XML variable, as long as
# the MAX_GPUS_PER_NODE XML variable is not defined (i.e., this argument is not in effect).
# 2.2. if the MAX_GPUS_PER_NODE XML variable is defined, then the ngpus-per-node argument
# must be set to 0. Otherwise, an error will be triggered.
# 3. For compilers with the string "gpu" in the name:
# 3.1. if ngpus-per-node argument is smaller than 0, an error will be triggered.
# 3.2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS
# 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically.
# 3.3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# ----------------------------------------------------------------------------------------------------------
max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE")
if max_gpus_per_node:
if "gpu" in compiler:
if not ngpus_per_node:
ngpus_per_node = 1
logger.warning(
"Setting ngpus_per_node to 1 for compiler {}".format(compiler)
)
expect(
ngpus_per_node > 0,
" ngpus_per_node is expected > 0 for compiler {}; current value is {}".format(
compiler, ngpus_per_node
),
)
else:
expect(
ngpus_per_node == 0,
" ngpus_per_node is expected = 0 for compiler {}; current value is {}".format(
compiler, ngpus_per_node
),
)
if gpu_type:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 on removing this init stub for ngpus as it makes the definition explicit.

expect(max_gpus_per_node, "GPUS are not defined for this machine")
expect(
gpu_offload,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
self.set_value("GPU_TYPE", gpu_type)
self.set_value("GPU_OFFLOAD", gpu_offload)

if ngpus_per_node >= 0:
self.set_value(
"NGPUS_PER_NODE",
ngpus_per_node
max(1, ngpus_per_node)
if ngpus_per_node <= max_gpus_per_node
else max_gpus_per_node,
)

elif gpu_offload:
expect(
False,
"Both gpu-type and gpu-offload must be defined if either is defined",
)

self.initialize_derived_attributes()

# --------------------------------------------
Expand Down Expand Up @@ -2308,6 +2298,8 @@ def create(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):
try:
# Set values for env_case.xml
Expand Down Expand Up @@ -2381,6 +2373,8 @@ def create(
extra_machines_dir=extra_machines_dir,
case_group=case_group,
ngpus_per_node=ngpus_per_node,
gpu_type=gpu_type,
gpu_offload=gpu_offload,
)

self.create_caseroot()
Expand Down
4 changes: 4 additions & 0 deletions CIME/data/config/xml_schemas/config_machines.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
<xs:attribute name="compiler" type="xs:string"/>
<xs:attribute name="mpilib" type="xs:string"/>
<xs:attribute name="comp_interface" type="xs:string"/>
<xs:attribute name="gpu_type" type="xs:string"/>
<xs:attribute name="gpu_offload" type="xs:string"/>
<xs:attribute name="queue" type="xs:string"/>
<xs:attribute name="DEBUG" type="upperBoolean"/>
<xs:attribute name="PIO_VERSION" type="xs:integer"/>
Expand Down Expand Up @@ -248,6 +250,8 @@
<xs:attribute ref="PIO_VERSION"/>
<xs:attribute ref="mpilib"/>
<xs:attribute ref="comp_interface"/>
<xs:attribute ref="gpu_offload"/>
<xs:attribute ref="gpu_type"/>
</xs:complexType>
</xs:element>
<xs:element name="command">
Expand Down
4 changes: 4 additions & 0 deletions CIME/data/config/xml_schemas/env_mach_specific.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
<xs:attribute name="PIO_VERSION" type="xs:integer"/>
<xs:attribute name="mpilib" type="xs:string"/>
<xs:attribute name="comp_interface" type="xs:string"/>
<xs:attribute name="gpu_type" type="xs:string"/>
<xs:attribute name="gpu_offload" type="xs:string"/>
<xs:attribute name="SMP_PRESENT" type="xs:string"/>
<xs:attribute name="value" type="xs:string"/>
<xs:attribute name="unit_testing" type="xs:boolean"/>
Expand Down Expand Up @@ -102,6 +104,8 @@
<xs:attribute ref="PIO_VERSION" />
<xs:attribute ref="mpilib"/>
<xs:attribute ref="comp_interface"/>
<xs:attribute ref="gpu_type"/>
<xs:attribute ref="gpu_offload"/>
</xs:complexType>
</xs:element>

Expand Down
18 changes: 18 additions & 0 deletions CIME/scripts/create_newcase.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,18 @@ def parse_command_line(args, cimeroot, description):
help="Specify number of GPUs used for simulation. ",
)

parser.add_argument(
"--gpu-type",
default=None,
help="Specify type of GPU hardware - currently supported are v100, a100, mi250",
)

parser.add_argument(
"--gpu-offload",
default=None,
help="Specify gpu offload method - currently supported are OpenACC, OpenMP",
)

args = CIME.utils.parse_args_and_handle_standard_logging_options(args, parser)

if args.srcroot is not None:
Expand Down Expand Up @@ -345,6 +357,8 @@ def parse_command_line(args, cimeroot, description):
args.extra_machines_dir,
args.case_group,
args.ngpus_per_node,
args.gpu_type,
args.gpu_offload,
)


Expand Down Expand Up @@ -382,6 +396,8 @@ def _main_func(description=None):
extra_machines_dir,
case_group,
ngpus_per_node,
gpu_type,
gpu_offload,
) = parse_command_line(sys.argv, cimeroot, description)

if script_root is None:
Expand Down Expand Up @@ -447,6 +463,8 @@ def _main_func(description=None):
extra_machines_dir=extra_machines_dir,
case_group=case_group,
ngpus_per_node=ngpus_per_node,
gpu_type=gpu_type,
gpu_offload=gpu_offload,
)

# Called after create since casedir does not exist yet
Expand Down
9 changes: 7 additions & 2 deletions CIME/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,8 +661,13 @@ def _create_newcase_phase(self, test):
pesize = case_opt[1:]
create_newcase_cmd += " --pecount {}".format(pesize)
elif case_opt.startswith("G"):
ngpus_per_node = case_opt[1:]
create_newcase_cmd += " --ngpus-per-node {}".format(ngpus_per_node)
if "-" in case_opt:
ngpus_per_node, gpu_type, gpu_offload = case_opt[1:].split("-")
else:
error = "GPU test argument format is ngpus_per_node-gpu_type-gpu_offload"
self._log_output(test, error)
return False, error
create_newcase_cmd += f" --ngpus-per-node {ngpus_per_node} --gpu-type {gpu_type} --gpu-offload {gpu_offload}"
elif case_opt.startswith("V"):
self._cime_driver = case_opt[1:]
create_newcase_cmd += " --driver {}".format(self._cime_driver)
Expand Down
4 changes: 4 additions & 0 deletions CIME/tests/test_unit_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ def test_copy(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
)
create_caseroot.assert_called()
apply_user_mods.assert_called()
Expand Down Expand Up @@ -257,6 +259,8 @@ def test_create(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
)
create_caseroot.assert_called()
apply_user_mods.assert_called()
Expand Down