Feature: System Wisteria GPU version (#204)
* finished wisteria system, now passes TestFlow;
bugfix: Pyaflowa preprocessing was not allowing a NoneType `data_case` for forward simulations only

* bump version number

* bump version number docs

* updating custom gpu wisteria runscript to include latest cuda version

* bugfix prepare data for solver was not properly evaluating data paths

* adding missing import statement

* removing last merge conflict

* updating API function names for fujitsu/wisteria system modules

* bugfix job tasktime was using walltime value

* updates log message for assertion warning to include actual missing path

* bugfix missing environ input in wisteria system

* Adding more comments to custom wisteria scripts

* removing env variable from custom wisteria run scripts for now because they are causing issues
bugfix: wisteria was not properly resolving run script paths but was working due to two complementary bugs

* fixing environ implementation in wisteria

* updating resource group numbers based on users guide

* shifts wisteria functionality into fujitsu to keep the wisteria overwrite to a minimum, since it has been tested;
combines custom_run-wisteria with the GPU version by adding a flag to the run script that toggles GPU mode on and off, for simplicity and fewer files; also makes the custom run script a bit more generalizable (an illustrative pjsub call is sketched after this list)

* removing unnecessary imports and unused variables

* fixing up the new custom run script functionality

* fixing bash syntax errors in custom run wisteria

* updating cuda version in loaded modules

* okay now it works, does not like backslash newlines in bash commands

* update changelog
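
For reference, the pjsub call that the updated run() assembles for each task looks roughly like the following single line (job name, paths, elapse time, and node/proc counts are illustrative placeholders; the flags mirror the fujitsu.py diff below):

    pjsub -N seisflows -o /path/to/logs/%j -j -L elapse=00:30:00 -L node=1 --mpi proc=4 -x SEISFLOWS_FUNCS=/path/to/funcs.p,SEISFLOWS_KWARGS=/path/to/kwargs.p,SEISFLOWS_TASKID=0,GPU_MODE=1 /path/to/custom_run-wisteria

The run script reads GPU_MODE (1 or 0) to decide whether to load the CUDA/GPU or the CPU module stack.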
bch0w authored Mar 28, 2024
1 parent 209e41b commit d7661ee
Showing 7 changed files with 122 additions and 145 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,16 @@
# CHANGELOG

## v3.0.1 (#203)
## v3.0.2 (#204)
System Wisteria GPU Upgrades

- Bugfix: Fujitsu tasktime for individual job submission was using `walltime` value, not `tasktime` value
- Combined and condensed main System functionality from the Wisteria child class into the Fujitsu system. Prior to this, the Wisteria child class was overwriting most of Fujitsu's functionality, which defeats the point of inheritance
- New Custom Run and Submit scripts for Wisteria GPU. Better comments and slightly easier to modify for others
- Added new `rscgrps` to include GPU partitions on Wisteria
- Improved run call header for easier switching between CPU and GPU nodes

## v3.0.1 (#203)
Quality of Life Updates

- Solver now automatically generates VTK files for Models and Gradients at the end of each iteration
- New function `solver.specfem.make_output_vtk_files` that generates .vtk files for all files in the output/ directory
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "seisflows"
version = "3.0.1"
version = "3.0.2"
description = "An automated workflow tool for full waveform inversion"
readme = "README.md"
requires-python = ">=3.7"
92 changes: 53 additions & 39 deletions seisflows/system/fujitsu.py
@@ -4,6 +4,7 @@
Computing Suite).
.. note::
The nickname `PJM`, based on the batch job script directives, may be used
as a shorthand to refer to the Fujitsu job management system.
@@ -18,10 +19,10 @@
import subprocess

from datetime import timedelta
from seisflows import ROOT_DIR, logger
from seisflows import logger
from seisflows.system.cluster import Cluster
from seisflows.tools import msg
from seisflows.tools.config import pickle_function_list
from seisflows.tools.config import pickle_function_list, import_seisflows


class Fujitsu(Cluster):
@@ -75,7 +76,6 @@ def __init__(self, ntask_max=100, pjm_args="", **kwargs):
self._failed_states = ["CANCEL", "HOLD", "ERROR"]
self._pending_states = ["QUEUED", "RUNNING"]


def check(self):
"""
Checks parameters and paths
@@ -146,42 +146,51 @@ def run_call_header(self):
f"-N {self.title}", # job name
f"-o {os.path.join(self.path.log_files, '%j')}",
f"-j", # merge stderr with stdout
f"-L elapse={self._walltime}", # [[hour:]minute:]second
f"-L elapse={self._tasktime}", # [[hour:]minute:]second
f"-L node={self.nodes}",
f"--mpi proc={self.nproc}",
])
return _call

def submit(self, workdir=None, parameter_file="parameters.yaml",
direct=True):
"""
Submit main workflow to the System. Two options are available,
submitting a Python job directly to the system, or submitting a
subprocess.
def submit(self, workdir=None, parameter_file="parameters.yaml"):
"""
Submits the main workflow job as a separate job submitted directly to
the system that is running the master job.
.. note::
Fujitsu scheduler doesn't allow command line arguments
(e.g., --workdir), so these are assumed to be default values where
the workdir is ${pwd} and the parameter file is called
'parameters.yaml'
:type workdir: str
:param workdir: path to the current working directory
:type parameter_file: str
:param parameter_file: parameter file name used to instantiate
the SeisFlows package
"""
# e.g., submit -w ./ -p parameters.yaml
submit_call = " ".join([
f"{self.submit_call_header}",
f"{self.submit_workflow}",
])

logger.debug(submit_call)
try:
subprocess.run(submit_call, shell=True)
except subprocess.CalledProcessError as e:
logger.critical(f"SeisFlows master job has failed with: {e}")
sys.exit(-1)
the SeisFlows package
:type direct: bool
:param direct: (used for overriding system modules) submits the main
workflow job directly to the login node as a Python process
(default). If False, submits the main job as a separate subprocess.
Note that this is Fujitsu specific and main jobs should be run from
interactive jobs run on compute nodes to avoid running jobs on
shared login resources
"""
if direct:
workflow = import_seisflows(workdir=workdir or self.path.workdir,
parameter_file=parameter_file)
workflow.check()
workflow.setup()
workflow.run()
else:
# e.g., submit -w ./ -p parameters.yaml
submit_call = " ".join([
f"{self.submit_call_header}",
f"{self.submit_workflow}",
])

logger.debug(submit_call)
try:
subprocess.run(submit_call, shell=True)
except subprocess.CalledProcessError as e:
logger.critical(f"SeisFlows master job has failed with: {e}")
sys.exit(-1)

@staticmethod
def _stdout_to_job_id(stdout):
@@ -215,11 +224,8 @@ def run(self, funcs, single=False, **kwargs):
cluster. Executes the list of functions (`funcs`) NTASK times with each
task occupying NPROC cores.
.. warning::
This has not been tested generally on Fujitsu systems, see system
Wisteria for a working application of the Fujitsu module
.. note::
Completely overwrites the `Cluster.run()` command
:type funcs: list of methods
@@ -244,15 +250,23 @@
f"system {self.ntask} times")
_ntask = self.ntask

# If environs are given, prepend a comma; an empty value then leaves no dangling comma
if self.environs:
self.environs = f",{self.environs}"

# Default Fujitsu command line input, can be overloaded by subclasses
# Copy-paste this default run_call and adjust accordingly for subclass
job_ids = []
for taskid in range(_ntask):
run_call = " ".join([
f"{self.run_call_header}",
f"--funcs {funcs_fid}",
f"--kwargs {kwargs_fid}",
f"--environment SEISFLOWS_TASKID={{task_id}},{self.environs}"
# -x in 'pjsub' sets environment variables which are then available
# inside the run script; see the custom run scripts for an example.
# Ensure that these are comma-separated, not space-separated
f"-x SEISFLOWS_FUNCS={funcs_fid},"
f"SEISFLOWS_KWARGS={kwargs_fid},"
f"SEISFLOWS_TASKID={taskid}{self.environs},"
f"GPU_MODE={int(bool(self.gpu))}", # 0 if False, 1 if True
f"{self.run_functions}",
])

@@ -266,7 +280,7 @@

# Monitor the job queue until all jobs have completed, or any one fails
try:
status = self.check_job_status(job_id)
status = self.monitor_job_status(job_ids)
except FileNotFoundError:
logger.critical(f"cannot access job information through 'pjstat', "
f"waited 50s with no return, please check job "
@@ -351,7 +365,7 @@ def query_job_states(self, job_id, timeout_s=300, wait_time_s=30,
if _recheck > (timeout_s // wait_time_s):
raise TimeoutError(f"cannot access job ID {job_id}")
time.sleep(wait_time_s)
query_job_states(job_id, _recheck=_recheck)
self.query_job_states(job_id, _recheck=_recheck)

return job_ids, job_states

34 changes: 28 additions & 6 deletions seisflows/system/runscripts/custom_run-wisteria
@@ -2,20 +2,41 @@
# ==============================================================================
# This is a Wisteria (UTokyo HPC) specific run script that is required
# because the compute node does not inherit the login node's Conda environment.
# Instead we need to load the module and environment manually, before run
# Instead we need to load the module and environment manually, before run.
#
# User needs to set the following paths:
# WORK_DIR: path to the directory where the Conda environment is stored, and
# where the SeisFlows repository has been cloned
# CONDA_ENV: name of the Conda environment to be used
# GPU_MODE: needs to be set by the calling function; if GPU_MODE==1, then
# the script will load the GPU-specific environment
# ==============================================================================

# Defines where our Conda environment is saved and what its name is
WORK_DIR=/work/01/gr58/share/adjtomo
CONDA_ENV=adjtomo
echo "work environment set as: '$WORK_DIR'"

# Load MPI and activate Conda environment
module load intel
module load impi
if [ $GPU_MODE -eq 1 ]; then # Use GPUs
echo "loading GPU modules on compute node"
module load cuda/12.2
module load gcc
module load ompi-cuda
else
echo "loading CPU modules on compute node"
module load intel
module load impi
fi

# Conda will be common to GPU or CPU versions
echo "loading Conda environment: $CONDA_ENV"
module load miniconda/py38_4.9.2
source $MINICONDA_DIR/etc/profile.d/conda.sh
conda activate $WORK_DIR/conda/envs/adjtomo
conda activate $WORK_DIR/conda/envs/$CONDA_ENV

# Run Functions: ensure that we are using the correct Python version

# Run Functions: ensure that we are using the correct Python version
# The following environment variables must be set by the '-x' flag in the
# corresponding system.run() function:
# ---
@@ -24,4 +45,5 @@ conda activate $WORK_DIR/conda/envs/adjtomo
# SEISFLOWS_ENV: any additional environment variables
# SEISFLOWS_TASKID: assigned processor number for given task
# ---
$WORK_DIR/conda/envs/adjtomo/bin/python $WORK_DIR/REPOSITORIES/seisflows/seisflows/system/runscripts/run --funcs $SEISFLOWS_FUNCS --kwargs $SEISFLOWS_KWARGS --environment SEISFLOWS_TASKID=$SEISFLOWS_TASKID,$SEISFLOWS_ENV
$WORK_DIR/conda/envs/$CONDA_ENV/bin/python $WORK_DIR/REPOSITORIES/seisflows/seisflows/system/runscripts/run --funcs $SEISFLOWS_FUNCS --kwargs $SEISFLOWS_KWARGS --environment SEISFLOWS_TASKID=$SEISFLOWS_TASKID,$SEISFLOWS_ENV
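
As a quick manual sanity check outside the scheduler, the same environment variables that the '-x' flag provides can be exported in an interactive shell before invoking the script (all values below are placeholders, not repository paths):

    export SEISFLOWS_FUNCS=/path/to/funcs.p
    export SEISFLOWS_KWARGS=/path/to/kwargs.p
    export SEISFLOWS_TASKID=0
    export SEISFLOWS_ENV=""
    export GPU_MODE=0  # set to 1 to exercise the GPU branch
    bash custom_run-wisteria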

1 change: 1 addition & 0 deletions seisflows/system/runscripts/custom_submit-wisteria
@@ -5,6 +5,7 @@
# Instead we need to load the module and environment manually, before submit
# ==============================================================================

# Defines where our Conda environment is stored
WORK_DIR=/work/01/gr58/share/adjtomo

# Load MPI and activate Conda environment

