Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Change default slurm arguments to None #80

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 39 additions & 36 deletions test_tube/hpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ def __init__(
self.out_log_path = None
self.modules = []
self.script_name = os.path.realpath(sys.argv[0])
self.job_time = '15:00'
self.minutes_to_checkpoint_before_walltime = 5
self.per_experiment_nb_gpus = 1
self.per_experiment_nb_cpus = 1
self.per_experiment_nb_nodes = 1
self.memory_mb_per_node = 2000
self.job_time = None
self.minutes_to_checkpoint_before_walltime = None
self.per_experiment_nb_gpus = None
self.per_experiment_nb_cpus = None
self.per_experiment_nb_nodes = None
self.memory_mb_per_node = None
self.email = None
self.notify_on_end = False
self.notify_on_fail = False
Expand Down Expand Up @@ -397,13 +397,14 @@ def __build_slurm_command(self, trial, slurm_cmd_script_path, timestamp, exp_i,
]
sub_commands.extend(command)

# add job time
command = [
'# time needed for job',
'#SBATCH --time={}'.format(self.job_time),
'#################\n'
]
sub_commands.extend(command)
if self.job_time is not None:
# add job time
command = [
'# time needed for job',
'#SBATCH --time={}'.format(self.job_time),
'#################\n'
]
sub_commands.extend(command)

# add nb of gpus
if self.per_experiment_nb_gpus > 0 and on_gpu:
Expand All @@ -421,38 +422,40 @@ def __build_slurm_command(self, trial, slurm_cmd_script_path, timestamp, exp_i,
sub_commands.extend(command)

# add nb of cpus if not looking at a gpu job
if self.per_experiment_nb_cpus > 0:
if self.per_experiment_nb_cpus is not None:
command = [
'# cpus per job',
'#SBATCH --cpus-per-task={}'.format(self.per_experiment_nb_cpus),
'#################\n'
]
sub_commands.extend(command)

# pick nb nodes
command = [
'# number of requested nodes',
'#SBATCH --nodes={}'.format(self.per_experiment_nb_nodes),
'#################\n'
]
sub_commands.extend(command)

# pick memory per node
command = [
'# memory per node',
'#SBATCH --mem={}'.format(self.memory_mb_per_node),
'#################\n'
]
sub_commands.extend(command)
if self.per_experiment_nb_nodes is not None:
# pick nb nodes
command = [
'# number of requested nodes',
'#SBATCH --nodes={}'.format(self.per_experiment_nb_nodes),
'#################\n'
]
sub_commands.extend(command)

# add signal command to catch job termination
command = [
'# slurm will send a signal this far out before it kills the job',
f'#SBATCH --signal=USR1@{self.minutes_to_checkpoint_before_walltime * 60}',
'#################\n'
]
if self.memory_mb_per_node is not None:
# pick memory per node
command = [
'# memory per node',
'#SBATCH --mem={}'.format(self.memory_mb_per_node),
'#################\n'
]
sub_commands.extend(command)

sub_commands.extend(command)
if self.minutes_to_checkpoint_before_walltime is not None:
# add signal command to catch job termination
command = [
'# slurm will send a signal this far out before it kills the job',
f'#SBATCH --signal=USR1@{self.minutes_to_checkpoint_before_walltime * 60}',
'#################\n'
]
sub_commands.extend(command)

# Subscribe to email if requested
mail_type = []
Expand Down