Skip to content

Commit

Permalink
Merge branch 'sarats/machinefiles/summit' into master (PR #2188)
Browse files Browse the repository at this point in the history
Adding configuration file sections for Summit using PGI, IBM compilers.
Several settings are adopted from Summitdev, a testbed which was a
precursor to Summit.

- Added compiler-specific sections to config_compilers.xml
- Added new machine section in config_machines.xml
- Updated LSF section in config_batch.xml; removed obsolete lsf_old section.

Note: the mpirun.summit script is installed locally in a project-wide accessible
directory. Tuning it for the environment is still a work in progress.
Testing: Ran FC5AV1C with ne4_ne4 and ne30_ne30.

[BFB] - Bit-For-Bit
  • Loading branch information
minxu74 authored Mar 23, 2018
2 parents e6e5f54 + 36476de commit 49ca9e9
Show file tree
Hide file tree
Showing 4 changed files with 242 additions and 37 deletions.
56 changes: 56 additions & 0 deletions config/e3sm/machines/Depends.summit.pgiacc
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Per-file compile overrides for the PGI OpenACC ("pgiacc") build on Summit.
#
# Every object named here is compiled from the .F90 source of the same base
# name.  Objects are listed once and built by static pattern rules, so each
# set of flags is spelled out in exactly one place.
# NOTE: recipe lines below must begin with a hard TAB, not spaces.

# Objects compiled with $(FFLAGS_NOOPT) in place of $(FFLAGS).
SUMMIT_NOOPT_OBJS := \
  dyn_comp.o \
  microp_aero.o

# Objects compiled with $(FFLAGS) plus the OpenACC flags defined below.
SUMMIT_ACC_OBJS := \
  bndry_mod.o \
  derivative_mod.o \
  edge_mod.o \
  element_mod.o \
  element_state.o \
  openacc_utils_mod.o \
  prim_advance_mod.o \
  prim_advection_mod.o \
  prim_si_mod.o \
  model_init_mod.o \
  vertremap_mod.o \
  viscosity_mod.o \
  prim_driver_mod.o \
  prim_driver_base.o \
  physics_mod.o \
  physconst.o

# OpenACC compile flags.  The target is written in the "tesla:<suboptions>"
# form so that it matches the -ta=tesla:cc70,pinned link flag used for the
# pgiacc compiler on Summit in config_compilers.xml.
SUMMIT_ACC_FLAGS := -DUSE_OPENACC=1 -acc -ta=tesla:cc70,pinned -Minfo=accel

# $< is the matching .F90 source for each listed object.
$(SUMMIT_NOOPT_OBJS): %.o: %.F90
	$(FC) -c $(INCLDIR) $(INCS) $(FFLAGS_NOOPT) $(FREEFLAGS) $<

$(SUMMIT_ACC_OBJS): %.o: %.F90
	$(FC) -c $(INCLDIR) $(INCS) $(FFLAGS) $(SUMMIT_ACC_FLAGS) $(FREEFLAGS) $<

52 changes: 16 additions & 36 deletions config/e3sm/machines/config_batch.xml
Original file line number Diff line number Diff line change
Expand Up @@ -78,40 +78,8 @@
</submit_args>
</batch_system>

<!-- LSF batch settings registered under type "lsf_old", version 10.1.
Jobs are queried, submitted and cancelled with bjobs / bsub / bkill, and
job-script directives use the #BSUB prefix. This variant sizes a job by
total task count (-n) and tasks per node (ptile). -->
<batch_system type="lsf_old" version="10.1">
<batch_query args=" -w" >bjobs</batch_query>
<batch_submit>bsub</batch_submit>
<batch_cancel>bkill</batch_cancel>
<!-- "&lt;" is the escaped "<" character; presumably the job script is redirected to bsub on stdin (confirm against CIME). -->
<batch_redirect>&lt;</batch_redirect>
<batch_directive>#BSUB</batch_directive>
<!-- Captures the run of digits found between "<" and ">". -->
<jobid_pattern>&lt;(\d+)&gt;</jobid_pattern>
<depend_string>-w 'done(jobid)'</depend_string>
<!-- "&amp;&amp;" is the escaped "&&" separator placed between dependency terms. -->
<depend_separator>&amp;&amp;</depend_separator>
<!-- Wallclock limits are formatted as hours:minutes. -->
<walltime_format>%H:%M</walltime_format>
<batch_mail_flag>-u</batch_mail_flag>
<batch_mail_type_flag></batch_mail_type_flag>
<batch_mail_type></batch_mail_type>
<!-- Options passed on the submit command line: queue, wallclock limit and charge account. -->
<submit_args>
<arg flag="-q" name="$JOB_QUEUE"/>
<arg flag="-W" name="$JOB_WALLCLOCK_TIME"/>
<arg flag="-P" name="$CHARGE_ACCOUNT"/>
</submit_args>
<!-- Lines emitted into the job script after the #BSUB prefix. -->
<directives>
<directive > -n {{ total_tasks }} </directive>
<directive > -R "span[ptile={{ tasks_per_node }}]"</directive>
<directive > -N </directive>
<!-- The following option causes problems with lsf version on Summitdev.
If desired, this should be in specific machine section. -->
<!-- <directive default="poe" > -a {{ poe }} </directive> -->
<!-- Both stdout (-o) and stderr (-e) are written to a file named from the job id template plus %J. -->
<directive default="acme.stdout" > -o {{ job_id }}.%J </directive>
<directive default="acme.stderr" > -e {{ job_id }}.%J </directive>
<directive > -J {{ job_id }} </directive>
</directives>
</batch_system>

<!-- This is the version on Summitdev, released as IBM release beta2 on Oct 17, 2017.
Created a new section as it conflicts with previous LSF settings-->
<batch_system type="lsf" version="2">
<!-- This is the new version on Summit, released as IBM 10.1.0.0 build 476197, Nov 21 2017. -->
<batch_system type="lsf" version="10.1">
<batch_query args=" -w" >bjobs</batch_query>
<batch_submit>bsub</batch_submit>
<batch_cancel>bkill</batch_cancel>
Expand All @@ -131,8 +99,8 @@
<directives>
<directive > -nnodes {{ num_nodes }} </directive>
<directive > -N </directive>
<directive default="acme.stdout" > -o {{ output_error_path }}.%J </directive>
<directive default="acme.stderr" > -e {{ output_error_path }}.%J </directive>
<directive default="e3sm.stdout" > -o {{ output_error_path }}.%J </directive>
<directive default="e3sm.stderr" > -e {{ output_error_path }}.%J </directive>
<directive > -J {{ job_id }} </directive>
</directives>
</batch_system>
Expand Down Expand Up @@ -489,6 +457,18 @@
</queues>
</batch_system>

<!-- LSF batch settings specific to MACH="summit". -->
<batch_system MACH="summit" type="lsf" >
<directives>
<directive>-P {{ project }}</directive>
<!-- Multiple allocation flags are passed as one quoted, space-separated
list in a single -alloc_flags option, instead of as two separate
-alloc_flags directives. -->
<directive>-alloc_flags "gpumps smt2"</directive>
</directives>
<queues>
<!-- Single default queue: up to 64 nodes with a 02:00 walltime cap. -->
<queue walltimemax="02:00" nodemin="0" nodemax="64" default="true">batch</queue>
</queues>
</batch_system>


<batch_system MACH="summitdev" type="lsf" >
<directives>
<directive>-P {{ project }}</directive>
Expand Down
41 changes: 41 additions & 0 deletions config/e3sm/machines/config_compilers.xml
Original file line number Diff line number Diff line change
Expand Up @@ -987,6 +987,47 @@ for mct, etc.
<ALBANY_PATH>/projects/ccsm/libs/AlbanyTrilinos/Albany/build/install</ALBANY_PATH>
</compiler>

<!-- IBM XL toolchain on Summit: xlf_r / xlc_r serial compilers and the
mpixlf / mpixlc / mpixlC MPI wrappers. -->
<compiler COMPILER="ibm" MACH="summit">
<SFC> xlf_r </SFC>
<SCC> xlc_r </SCC>
<MPIFC> mpixlf </MPIFC>
<MPICC> mpixlc </MPICC>
<MPICXX> mpixlC </MPICXX>
<CONFIG_ARGS> --host=Linux </CONFIG_ARGS>
<!-- The Summit run, archive and input-data directories are all under /gpfs
(see the summit entry in config_machines.xml), so the GPFS hints are
selected here instead of the Lustre ones. -->
<PIO_FILESYSTEM_HINTS>gpfs</PIO_FILESYSTEM_HINTS>
<ADD_FFLAGS> -qzerosize -qfree=f90 -qxlf2003=polymorphic</ADD_FFLAGS>
<!-- Flags for sources that must be built unoptimized. -->
<ADD_FFLAGS_NOOPT> -O0 -g -qfree=f90 </ADD_FFLAGS_NOOPT>
<!-- NOTE(review): LAPACK is linked from lib64 here but from lib in the
pgi and pgiacc summit sections; confirm which directory each module provides. -->
<ADD_LDFLAGS>-L$(NETCDF_C_PATH)/lib -lnetcdf -L$(NETCDF_FORTRAN_PATH)/lib -lnetcdff -L$(PNETCDF_PATH)/lib -lpnetcdf -L$(HDF5_PATH)/lib -lhdf5_hl -lhdf5 -L$(ESSL_PATH)/lib64 -lessl -L$(NETLIB_LAPACK_PATH)/lib64 -llapack</ADD_LDFLAGS>
</compiler>

<!-- PGI toolchain on Summit (CPU-only build): pgfortran / pgcc serial
compilers and the mpif90 / mpicc / mpiCC MPI wrappers. -->
<compiler COMPILER="pgi" MACH="summit">
<SFC> pgfortran </SFC>
<SCC> pgcc </SCC>
<MPIFC> mpif90 </MPIFC>
<MPICC> mpicc </MPICC>
<MPICXX> mpiCC </MPICXX>
<CONFIG_ARGS> --host=Linux </CONFIG_ARGS>
<!-- The Summit run, archive and input-data directories are all under /gpfs
(see the summit entry in config_machines.xml), so the GPFS hints are
selected here instead of the Lustre ones. -->
<PIO_FILESYSTEM_HINTS>gpfs</PIO_FILESYSTEM_HINTS>
<!-- Optimization flags apply to non-debug builds only. -->
<ADD_CFLAGS DEBUG="FALSE"> -O2 </ADD_CFLAGS>
<!-- NOTE(review): the SUMMITDEV_PGI macro name is inherited from the
Summitdev settings; confirm the code paths it selects are also wanted on Summit. -->
<ADD_FFLAGS DEBUG="FALSE"> -O2 -DSUMMITDEV_PGI </ADD_FFLAGS>
<ADD_LDFLAGS>-L$(NETCDF_C_PATH)/lib -lnetcdf -L$(NETCDF_FORTRAN_PATH)/lib -lnetcdff -L$(PNETCDF_PATH)/lib -lpnetcdf -L$(HDF5_PATH)/lib -lhdf5_hl -lhdf5 -L$(ESSL_PATH)/lib64 -lessl -L$(NETLIB_LAPACK_PATH)/lib -llapack</ADD_LDFLAGS>
</compiler>

<!-- PGI toolchain on Summit with OpenACC: same compilers and wrappers as the
pgi section, plus an accelerator target flag at link time. -->
<compiler COMPILER="pgiacc" MACH="summit">
<SFC> pgfortran </SFC>
<SCC> pgcc </SCC>
<MPIFC> mpif90 </MPIFC>
<MPICC> mpicc </MPICC>
<MPICXX> mpiCC </MPICXX>
<CONFIG_ARGS> --host=Linux </CONFIG_ARGS>
<!-- The Summit run, archive and input-data directories are all under /gpfs
(see the summit entry in config_machines.xml), so the GPFS hints are
selected here instead of the Lustre ones. -->
<PIO_FILESYSTEM_HINTS>gpfs</PIO_FILESYSTEM_HINTS>
<!-- Optimization flags apply to non-debug builds only. -->
<ADD_CFLAGS DEBUG="FALSE"> -O2 </ADD_CFLAGS>
<!-- NOTE(review): the SUMMITDEV_PGI macro name is inherited from the
Summitdev settings; confirm the code paths it selects are also wanted on Summit. -->
<ADD_FFLAGS DEBUG="FALSE"> -O2 -DSUMMITDEV_PGI </ADD_FFLAGS>
<!-- Accelerator target passed at link time; the per-file OpenACC compile
flags are set in Depends.summit.pgiacc. -->
<ADD_LDFLAGS>-ta=tesla:cc70,pinned</ADD_LDFLAGS>
<ADD_LDFLAGS>-L$(NETCDF_C_PATH)/lib -lnetcdf -L$(NETCDF_FORTRAN_PATH)/lib -lnetcdff -L$(PNETCDF_PATH)/lib -lpnetcdf -L$(HDF5_PATH)/lib -lhdf5_hl -lhdf5 -L$(ESSL_PATH)/lib64 -lessl -L$(NETLIB_LAPACK_PATH)/lib -llapack</ADD_LDFLAGS>
</compiler>


<compiler COMPILER="ibm" MACH="summitdev">
<SFC> xlf_r </SFC>
<SCC> xlc_r </SCC>
Expand Down
130 changes: 129 additions & 1 deletion config/e3sm/machines/config_machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2743,7 +2743,7 @@
<arg name="show-processmap"> display-map </arg>
-->

<module_system type="module">
<module_system type="module" allow_error="true">
<!-- list of init_path elements, one per supported language e.g. sh, perl, python-->
<init_path lang="sh">/sw/summitdev/lmod/7.4.0/rhel7.2_gnu4.8.5/lmod/7.4/init/sh</init_path>
<init_path lang="csh">/sw/summitdev/lmod/7.4.0/rhel7.2_gnu4.8.5/lmod/7.4/init/csh</init_path>
Expand Down Expand Up @@ -2820,6 +2820,134 @@
</environment_variables>
</machine>

<!-- Machine entry for ORNL Summit: directory layout, batch system, task
limits, job launcher, module setup and environment variables. -->
<machine MACH="summit">
<!-- Ref: https://www.olcf.ornl.gov/for-users/system-user-guides/summit/ -->
<DESC>ORNL Summit. Node: 2x POWER9 + 6x Volta V100, 22 cores/socket, 4 HW threads/core.</DESC>
<NODENAME_REGEX>.*summit.*</NODENAME_REGEX>
<TESTS>e3sm_developer</TESTS>
<COMPILERS>ibm,pgi,pgiacc</COMPILERS>
<MPILIBS>spectrum-mpi,mpi-serial</MPILIBS>
<!-- The output root (and therefore EXEROOT) is under $HOME, while the run
and archive directories below are on /gpfs/alpinetds scratch. -->
<CIME_OUTPUT_ROOT>$ENV{HOME}/e3sm_scratch/$PROJECT</CIME_OUTPUT_ROOT>
<!-- <CIME_OUTPUT_ROOT>/gpfs/alpinetds/scratch/$ENV{USER}/$PROJECT/e3sm/scratch</CIME_OUTPUT_ROOT> -->
<RUNDIR>/gpfs/alpinetds/scratch/$ENV{USER}/$PROJECT/e3sm/$CASE/run</RUNDIR>
<EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT>
<!-- Input data, baselines and the cprnc tool live in the csc190 world-shared area. -->
<DIN_LOC_ROOT>/gpfs/alpinetds/world-shared/csc190/e3sm/cesm/inputdata</DIN_LOC_ROOT>
<DIN_LOC_ROOT_CLMFORC>/gpfs/alpinetds/world-shared/csc190/e3sm/cesm/inputdata/atm/datm7</DIN_LOC_ROOT_CLMFORC>
<DOUT_S_ROOT>/gpfs/alpinetds/scratch/$ENV{USER}/$PROJECT/e3sm/archive/$CASE</DOUT_S_ROOT>
<DOUT_L_MSROOT>csm/$CASE</DOUT_L_MSROOT>
<BASELINE_ROOT>/gpfs/alpinetds/world-shared/csc190/e3sm/cesm/baselines/$COMPILER</BASELINE_ROOT>
<CCSM_CPRNC>/gpfs/alpinetds/world-shared/csc190/e3sm/cesm/tools/cprnc/cprnc</CCSM_CPRNC>
<SAVE_TIMING_DIR>/gpfs/alpinetds/proj-shared/$PROJECT/e3sm</SAVE_TIMING_DIR>
<SAVE_TIMING_DIR_PROJECTS>cli115,csc190</SAVE_TIMING_DIR_PROJECTS>
<OS>LINUX</OS>
<BATCH_SYSTEM>lsf</BATCH_SYSTEM>
<SUPPORTED_BY>e3sm</SUPPORTED_BY>
<GMAKE_J>32</GMAKE_J>
<!-- One core per socket is not available to applications, leaving
2 x (22 - 1) = 42 usable cores per node.
MAX_TASKS_PER_NODE = 168 = 42 cores x 4 hardware threads (smt4 mode);
MAX_MPITASKS_PER_NODE = 84 = 42 cores x 2. -->
<MAX_MPITASKS_PER_NODE>84</MAX_MPITASKS_PER_NODE>
<MAX_TASKS_PER_NODE>168</MAX_TASKS_PER_NODE>
<PROJECT_REQUIRED>TRUE</PROJECT_REQUIRED>
<PROJECT>csc190</PROJECT>
<CHARGE_ACCOUNT>CSC190ACME</CHARGE_ACCOUNT>
<PIO_CONFIG_OPTS> -D PIO_BUILD_TIMING:BOOL=ON </PIO_CONFIG_OPTS>

<!-- Job launch for spectrum-mpi: a site-installed wrapper script is used in
place of calling jsrun directly. The wrapper is given the total task
count (-n) and the per-node MPI task limit (-N). -->
<mpirun mpilib="spectrum-mpi">
<!-- Use a helper script to tweak jsrun options -->
<executable>/gpfs/alpinetds/world-shared/csc190/e3sm/mpirun.summit</executable>
<!-- <executable>jsrun</executable> -->
<arguments>
<arg name="num_tasks" > -n $TOTALPES -N $MAX_MPITASKS_PER_NODE </arg>
<!-- By default 'jsrun' does not exit (i.e. hung) if any process / thread fails. -->
<!-- To avoid this, pass the option '-X 1' to exit on error -->
<!-- <arg name="num_tasks" > -X 1 - -nrs $SHELL{echo $NUM_NODES * 6 | bc} - -rs_per_host 6 - -tasks_per_rs 28 - -gpu_per_rs 1 - -cpu_per_rs 7 </arg> -->
<!-- plane binding is number of tasks per socket -->
<!-- <arg name="binding">- -latency_priority=cpu-cpu - -bind=proportional-packed:1 -d plane:84 </arg> -->
<!-- The expression below calculates number of nodes (ceil of TOTALPES/MAX_MPITASKS_PER_NODE) -->
<!-- $SHELL{echo "(($TOTALPES + $MAX_MPITASKS_PER_NODE - 1)/$MAX_MPITASKS_PER_NODE)" | bc} -->
</arguments>
</mpirun>
<!-- Useful jsrun options:
-n (hyphen-hyphen)nrs Number of resource sets
-a (hyphen-hyphen)tasks_per_rs Number of tasks per resource set
-c (hyphen-hyphen)cpu_per_rs Number of CPUs per resource set. Threads per rs.
-g (hyphen-hyphen)gpu_per_rs Number of GPUs per resource set
-r (hyphen-hyphen)rs_per_host Number of resource sets per host
-->

<!-- Lmod 7.7.10 environment-module setup. allow_error="true" is set on this
module system; presumably module command failures are then tolerated
(confirm against the CIME schema). -->
<module_system type="module" allow_error="true">
<!-- list of init_path elements, one per supported language e.g. sh, perl, python-->
<init_path lang="sh">/sw/summit/lmod/7.7.10/rhel7.3_gnu4.8.5/lmod/lmod/init/sh</init_path>
<init_path lang="csh">/sw/summit/lmod/7.7.10/rhel7.3_gnu4.8.5/lmod/lmod/init/csh</init_path>
<init_path lang="python">/sw/summit/lmod/7.7.10/rhel7.3_gnu4.8.5/lmod/lmod/init/env_modules_python.py</init_path>
<init_path lang="perl">/sw/summit/lmod/7.7.10/rhel7.3_gnu4.8.5/lmod/lmod/init/perl</init_path>
<!-- list of cmd_path elements, one for every supported language, e.g. sh, perl, python -->
<cmd_path lang="perl">module</cmd_path>
<cmd_path lang="python">/sw/summit/lmod/7.7.10/rhel7.3_gnu4.8.5/lmod/7.7.10/libexec/lmod python</cmd_path>
<cmd_path lang="sh">module</cmd_path>
<cmd_path lang="csh">module</cmd_path>

<!-- Always execute -->
<!-- The loaded modules are listed before and after the purge, then the
common tool and math-library modules are loaded. -->
<modules>
<command name="ls"/>
<command name="purge"/>
<command name="ls"/>
<command name="load">DefApps</command>
<command name="load">python/3.5.2</command>
<command name="load">subversion/1.9.3</command>
<command name="load">git/2.13.0</command>
<command name="load">cmake/3.9.2</command>
<command name="load">essl/6.1.0-prpq</command>
<command name="load">netlib-lapack/3.6.1</command>
</modules>
<!-- List of modules elements, executing commands if compiler and mpilib condition applies -->
<!-- NOTE(review): no modules element names the pgiacc compiler explicitly;
confirm that compiler="pgi" is also applied to pgiacc builds. -->
<modules compiler="pgi">
<command name="rm">xl</command>
<command name="load">pgi/18.1</command>
<command name="load">spectrum-mpi/10.2.0.0-20180110</command>
<command name="ls"/>
</modules>
<modules compiler="ibm">
<command name="rm">pgi</command>
<command name="load">xl/20180223-beta</command>
<command name="load">spectrum-mpi/10.2.0.0-20180110</command>
<command name="ls"/>
</modules>

<!-- mpi lib settings -->
<!-- NOTE(review): the mpi-serial set loads no hdf5 module, while HDF5_PATH
below is read from OLCF_HDF5_ROOT for every build; confirm that variable
is defined for mpi-serial builds. -->
<modules mpilib="mpi-serial">
<command name="load">netcdf/4.4.1</command>
<command name="load">netcdf-fortran/4.4.4</command>
</modules>
<!-- Sometimes,same versions of libraries are not available for different compilers, hence the split below -->
<modules compiler="ibm" mpilib="!mpi-serial">
<command name="load">netcdf/4.4.1</command>
<command name="load">netcdf-fortran/4.4.4</command>
<command name="load">parallel-netcdf/1.8.0</command>
<command name="load">hdf5/1.10.0-patch1</command>
</modules>
<modules compiler="pgi" mpilib="!mpi-serial">
<command name="load">netcdf/4.4.1</command>
<command name="load">netcdf-fortran/4.4.4</command>
<command name="load">parallel-netcdf/1.8.0</command>
<command name="load">hdf5/1.10.0-patch1</command>
</modules>
</module_system>
<!-- Default -->
<!-- Library locations are taken from OLCF_*_ROOT environment variables,
presumably exported by the modules loaded above (confirm). -->
<environment_variables>
<env name="COMPILER">$COMPILER</env>
<env name="MPILIB">$MPILIB</env>
<env name="OMP_STACKSIZE">128M</env>
<env name="NETCDF_C_PATH">$ENV{OLCF_NETCDF_ROOT}</env>
<env name="NETCDF_FORTRAN_PATH">$ENV{OLCF_NETCDF_FORTRAN_ROOT}</env>
<env name="HDF5_PATH">$ENV{OLCF_HDF5_ROOT}</env>
<env name="ESSL_PATH">$ENV{OLCF_ESSL_ROOT}</env>
<env name="NETLIB_LAPACK_PATH">$ENV{OLCF_NETLIB_LAPACK_ROOT}</env>
</environment_variables>
<!-- NOTE(review): PNETCDF_PATH is defined only when the MPI library is not
mpi-serial, yet the summit link flags in config_compilers.xml reference
$(PNETCDF_PATH) unconditionally; confirm mpi-serial links are unaffected. -->
<environment_variables mpilib="!mpi-serial">
<env name="PNETCDF_PATH">$ENV{OLCF_PARALLEL_NETCDF_ROOT}</env>
</environment_variables>
</machine>

<default_run_suffix>
<default_run_exe>${EXEROOT}/e3sm.exe </default_run_exe>
<default_run_misc_suffix> >> e3sm.log.$LID 2>&amp;1 </default_run_misc_suffix>
Expand Down

0 comments on commit 49ca9e9

Please sign in to comment.