Skip to content

Commit

Permalink
[kjobctl] Add --mem flag to Slurm mode (#3169)
Browse files Browse the repository at this point in the history
* Add --mem flag

* Add test case for non-exact division

* Align units for memory
  • Loading branch information
IrvingMg authored Oct 3, 2024
1 parent c499188 commit 42f7604
Show file tree
Hide file tree
Showing 9 changed files with 193 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ const (
GpusPerTaskFlag Flag = "gpus-per-task"
InputFlag Flag = "input"
JobNameFlag Flag = "job-name"
MemPerNodeFlag Flag = "mem"
MemPerCPUFlag Flag = "mem-per-cpu"
MemPerGPUFlag Flag = "mem-per-gpu"
MemPerTaskFlag Flag = "mem-per-task"
Expand Down Expand Up @@ -84,6 +85,7 @@ type TemplateReference string
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || !('gpus-per-task' in self.requiredFlags) || self.name == 'Slurm'", message="gpus-per-task flag can be used only on Slurm mode"
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || !('input' in self.requiredFlags) || self.name == 'Slurm'", message="input flag can be used only on Slurm mode"
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || !('job-name' in self.requiredFlags) || self.name == 'Slurm'", message="job-name flag can be used only on Slurm mode"
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || !('mem' in self.requiredFlags) || self.name == 'Slurm'", message="mem flag can be used only on Slurm mode"
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || !('mem-per-cpu' in self.requiredFlags) || self.name == 'Slurm'", message="mem-per-cpu flag can be used only on Slurm mode"
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || !('mem-per-gpu' in self.requiredFlags) || self.name == 'Slurm'", message="mem-per-gpu flag can be used only on Slurm mode"
// +kubebuilder:validation:XValidation:rule="!has(self.requiredFlags) || !('mem-per-task' in self.requiredFlags) || self.name == 'Slurm'", message="mem-per-task flag can be used only on Slurm mode"
Expand Down Expand Up @@ -120,7 +122,7 @@ type SupportedMode struct {
// The request flag used only for Interactive and Job modes.
// The cmd flag used only for Interactive, Job, and RayJob.
// If the raycluster flag are set, none of localqueue, replicas, min-replicas, or max-replicas can be set.
// For the Slurm mode, the possible values are: array, cpus-per-task, error, gpus-per-task, input, job-name, mem-per-cpu,
// For the Slurm mode, the possible values are: array, cpus-per-task, error, gpus-per-task, input, job-name, mem, mem-per-cpu,
// mem-per-gpu, mem-per-task, nodes, ntasks, output, partition, localqueue, priority.
//
// cmd and requests values are going to be added only to the first primary container.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ spec:
The request flag used only for Interactive and Job modes.
The cmd flag used only for Interactive, Job, and RayJob.
If the raycluster flag are set, none of localqueue, replicas, min-replicas, or max-replicas can be set.
For the Slurm mode, the possible values are: array, cpus-per-task, error, gpus-per-task, input, job-name, mem-per-cpu,
For the Slurm mode, the possible values are: array, cpus-per-task, error, gpus-per-task, input, job-name, mem, mem-per-cpu,
mem-per-gpu, mem-per-task, nodes, ntasks, output, partition, localqueue, priority.
cmd and requests values are going to be added only to the first primary container.
Expand Down Expand Up @@ -159,6 +159,9 @@ spec:
- message: job-name flag can be used only on Slurm mode
rule: '!has(self.requiredFlags) || !(''job-name'' in self.requiredFlags)
|| self.name == ''Slurm'''
- message: mem flag can be used only on Slurm mode
rule: '!has(self.requiredFlags) || !(''mem'' in self.requiredFlags)
|| self.name == ''Slurm'''
- message: mem-per-cpu flag can be used only on Slurm mode
rule: '!has(self.requiredFlags) || !(''mem-per-cpu'' in self.requiredFlags)
|| self.name == ''Slurm'''
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ The file is auto-generated from the Go source code of the component using the
Create a slurm job

```
kjobctl create slurm --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_NAME] [--skip-localqueue-validation] [--ignore-unknown-flags] [--skip-priority-validation] -- [--array ARRAY] [--cpus-per-task QUANTITY] [--gpus-per-task QUANTITY] [--mem-per-task QUANTITY] [--mem-per-cpu QUANTITY] [--mem-per-gpu QUANTITY] [--nodes COUNT] [--ntasks COUNT] [--output FILENAME_PATTERN] [--error FILENAME_PATTERN] [--input FILENAME_PATTERN] [--job-name NAME] [--partition NAME] [--priority NAME] SCRIPT
kjobctl create slurm --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEUE_NAME] [--skip-localqueue-validation] [--ignore-unknown-flags] [--skip-priority-validation] -- [--array ARRAY] [--cpus-per-task QUANTITY] [--gpus-per-task QUANTITY] [--mem QUANTITY] [--mem-per-task QUANTITY] [--mem-per-cpu QUANTITY] [--mem-per-gpu QUANTITY] [--nodes COUNT] [--ntasks COUNT] [--output FILENAME_PATTERN] [--error FILENAME_PATTERN] [--input FILENAME_PATTERN] [--job-name NAME] [--partition NAME] [--priority NAME] SCRIPT
```


Expand Down
6 changes: 6 additions & 0 deletions cmd/experimental/kjobctl/pkg/builder/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ type Builder struct {
gpusPerTask map[string]*resource.Quantity
input string
jobName string
memPerNode *resource.Quantity
memPerCPU *resource.Quantity
memPerGPU *resource.Quantity
memPerTask *resource.Quantity
Expand Down Expand Up @@ -221,6 +222,11 @@ func (b *Builder) WithJobName(jobName string) *Builder {
return b
}

func (b *Builder) WithMemPerNode(memPerNode *resource.Quantity) *Builder {
b.memPerNode = memPerNode
return b
}

func (b *Builder) WithMemPerCPU(memPerCPU *resource.Quantity) *Builder {
b.memPerCPU = memPerCPU
return b
Expand Down
25 changes: 25 additions & 0 deletions cmd/experimental/kjobctl/pkg/builder/slurm_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ func (b *slurmBuilder) complete() error {

func (b *slurmBuilder) validateMutuallyExclusiveFlags() error {
flags := map[string]bool{
string(v1alpha1.MemPerNodeFlag): b.memPerNode != nil,
string(v1alpha1.MemPerTaskFlag): b.memPerTask != nil,
string(v1alpha1.MemPerCPUFlag): b.memPerCPU != nil,
string(v1alpha1.MemPerGPUFlag): b.memPerGPU != nil,
Expand Down Expand Up @@ -340,6 +341,12 @@ func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Obj
}
}

limits := corev1.ResourceList{}
if b.memPerNode != nil {
memPerContainer := b.memPerNode.MilliValue() / int64(len(job.Spec.Template.Spec.Containers))
limits[corev1.ResourceMemory] = *resource.NewMilliQuantity(memPerContainer, b.memPerNode.Format)
}

if b.memPerTask != nil {
requests[corev1.ResourceMemory] = *b.memPerTask
}
Expand All @@ -360,6 +367,10 @@ func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Obj
container.Resources.Requests = b.requests
}

if len(limits) > 0 {
container.Resources.Limits = limits
}

container.VolumeMounts = append(container.VolumeMounts,
corev1.VolumeMount{
Name: "slurm-scripts",
Expand Down Expand Up @@ -626,6 +637,16 @@ func (b *slurmBuilder) getSbatchEnvs() error {
}
}

if b.memPerNode == nil {
if env, ok := os.LookupEnv("SBATCH_MEM_PER_NODE"); ok {
val, err := resource.ParseQuantity(env)
if err != nil {
return fmt.Errorf("cannot parse '%s': %w", env, err)
}
b.memPerNode = ptr.To(val)
}
}

if b.memPerTask == nil {
if env, ok := os.LookupEnv("SBATCH_MEM_PER_CPU"); ok {
val, err := resource.ParseQuantity(env)
Expand Down Expand Up @@ -687,6 +708,10 @@ func (b *slurmBuilder) replaceScriptFlags() error {
b.gpusPerTask = scriptFlags.GpusPerTask
}

if b.memPerNode == nil {
b.memPerNode = scriptFlags.MemPerNode
}

if b.memPerTask == nil {
b.memPerTask = scriptFlags.MemPerTask
}
Expand Down
15 changes: 15 additions & 0 deletions cmd/experimental/kjobctl/pkg/cmd/create/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ const (
arrayFlagName = string(v1alpha1.ArrayFlag)
cpusPerTaskFlagName = string(v1alpha1.CpusPerTaskFlag)
gpusPerTaskFlagName = string(v1alpha1.GpusPerTaskFlag)
memPerNodeFlagName = string(v1alpha1.MemPerNodeFlag)
memPerTaskFlagName = string(v1alpha1.MemPerTaskFlag)
memPerCPUFlagName = string(v1alpha1.MemPerCPUFlag)
memPerGPUFlagName = string(v1alpha1.MemPerGPUFlag)
Expand Down Expand Up @@ -179,6 +180,7 @@ type CreateOptions struct {
Array string
CpusPerTask *apiresource.Quantity
GpusPerTask map[string]*apiresource.Quantity
MemPerNode *apiresource.Quantity
MemPerTask *apiresource.Quantity
MemPerCPU *apiresource.Quantity
MemPerGPU *apiresource.Quantity
Expand All @@ -200,6 +202,7 @@ type CreateOptions struct {
UserSpecifiedRequest map[string]string
UserSpecifiedCpusPerTask string
UserSpecifiedGpusPerTask string
UserSpecifiedMemPerNode string
UserSpecifiedMemPerTask string
UserSpecifiedMemPerCPU string
UserSpecifiedMemPerGPU string
Expand Down Expand Up @@ -328,6 +331,7 @@ var createModeSubcommands = map[string]modeSubcommand{
" [--array ARRAY]" +
" [--cpus-per-task QUANTITY]" +
" [--gpus-per-task QUANTITY]" +
" [--mem QUANTITY]" +
" [--mem-per-task QUANTITY]" +
" [--mem-per-cpu QUANTITY]" +
" [--mem-per-gpu QUANTITY]" +
Expand Down Expand Up @@ -363,6 +367,8 @@ The minimum index value is 0. The maximum index value is 2147483647.`)
"How much cpus a container inside a pod requires.")
o.SlurmFlagSet.StringVar(&o.UserSpecifiedGpusPerTask, gpusPerTaskFlagName, "",
"How much gpus a container inside a pod requires.")
o.SlurmFlagSet.StringVar(&o.UserSpecifiedMemPerNode, memPerNodeFlagName, "",
"How much memory a pod requires.")
o.SlurmFlagSet.StringVar(&o.UserSpecifiedMemPerTask, memPerTaskFlagName, "",
"How much memory a container requires.")
o.SlurmFlagSet.StringVar(&o.UserSpecifiedMemPerCPU, memPerCPUFlagName, "",
Expand Down Expand Up @@ -528,6 +534,14 @@ func (o *CreateOptions) Complete(clientGetter util.ClientGetter, cmd *cobra.Comm
o.GpusPerTask = gpusPerTask
}

if o.SlurmFlagSet.Changed(memPerNodeFlagName) {
quantity, err := apiresource.ParseQuantity(o.UserSpecifiedMemPerNode)
if err != nil {
return fmt.Errorf("cannot parse '%s': %w", o.UserSpecifiedMemPerNode, err)
}
o.MemPerNode = &quantity
}

if o.SlurmFlagSet.Changed(memPerTaskFlagName) {
quantity, err := apiresource.ParseQuantity(o.UserSpecifiedMemPerTask)
if err != nil {
Expand Down Expand Up @@ -612,6 +626,7 @@ func (o *CreateOptions) Run(ctx context.Context, clientGetter util.ClientGetter,
WithArray(o.Array).
WithCpusPerTask(o.CpusPerTask).
WithGpusPerTask(o.GpusPerTask).
WithMemPerNode(o.MemPerNode).
WithMemPerTask(o.MemPerTask).
WithMemPerCPU(o.MemPerCPU).
WithMemPerGPU(o.MemPerGPU).
Expand Down
132 changes: 132 additions & 0 deletions cmd/experimental/kjobctl/pkg/cmd/create/create_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1134,6 +1134,138 @@ cd /mydir
},
wantOutPattern: `job\.batch\/.+ created\\nconfigmap\/.+ created`,
},
"should divide --mem exactly across containers": {
beforeTest: beforeSlurmTest,
afterTest: afterSlurmTest,
args: func(tc *createCmdTestCase) []string {
return []string{
"slurm",
"--profile", "profile",
"--",
"--mem", "2G",
tc.tempFile,
}
},
kjobctlObjs: []runtime.Object{
wrappers.MakeJobTemplate("slurm-job-template", metav1.NamespaceDefault).
WithContainer(*wrappers.MakeContainer("c1", "bash:4.4").Obj()).
WithContainer(*wrappers.MakeContainer("c2", "bash:4.4").Obj()).
Obj(),
wrappers.MakeApplicationProfile("profile", metav1.NamespaceDefault).
WithSupportedMode(*wrappers.MakeSupportedMode(v1alpha1.SlurmMode, "slurm-job-template").Obj()).
Obj(),
},
gvks: []schema.GroupVersionKind{
{Group: "batch", Version: "v1", Kind: "Job"},
{Group: "", Version: "v1", Kind: "ConfigMap"},
},
wantLists: []runtime.Object{
&batchv1.JobList{
TypeMeta: metav1.TypeMeta{Kind: "JobList", APIVersion: "batch/v1"},
Items: []batchv1.Job{
*wrappers.MakeJob("", metav1.NamespaceDefault).
Completions(1).
CompletionMode(batchv1.IndexedCompletion).
Profile("profile").
Mode(v1alpha1.SlurmMode).
WithContainer(*wrappers.MakeContainer("c1", "bash:4.4").
Command("bash", "/slurm/scripts/entrypoint.sh").
WithResources(corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceMemory: resource.MustParse("1G"),
},
}).
Obj()).
WithContainer(*wrappers.MakeContainer("c2", "bash:4.4").
Command("bash", "/slurm/scripts/entrypoint.sh").
WithResources(corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceMemory: resource.MustParse("1G"),
},
}).
Obj()).
Obj(),
},
},
&corev1.ConfigMapList{},
},
cmpopts: []cmp.Option{
cmpopts.IgnoreFields(corev1.LocalObjectReference{}, "Name"),
cmpopts.IgnoreFields(metav1.OwnerReference{}, "Name"),
cmpopts.IgnoreFields(corev1.PodSpec{}, "InitContainers"),
cmpopts.IgnoreTypes([]corev1.EnvVar{}),
cmpopts.IgnoreTypes([]corev1.Volume{}),
cmpopts.IgnoreTypes([]corev1.VolumeMount{}),
cmpopts.IgnoreTypes(corev1.ConfigMapList{}),
},
wantOutPattern: `job\.batch\/.+ created\\nconfigmap\/.+ created`,
},
"should handle non-exact --mem division across containers": {
beforeTest: beforeSlurmTest,
afterTest: afterSlurmTest,
args: func(tc *createCmdTestCase) []string {
return []string{
"slurm",
"--profile", "profile",
"--",
"--mem", "1G",
tc.tempFile,
}
},
kjobctlObjs: []runtime.Object{
wrappers.MakeJobTemplate("slurm-job-template", metav1.NamespaceDefault).
WithContainer(*wrappers.MakeContainer("c1", "bash:4.4").Obj()).
WithContainer(*wrappers.MakeContainer("c2", "bash:4.4").Obj()).
Obj(),
wrappers.MakeApplicationProfile("profile", metav1.NamespaceDefault).
WithSupportedMode(*wrappers.MakeSupportedMode(v1alpha1.SlurmMode, "slurm-job-template").Obj()).
Obj(),
},
gvks: []schema.GroupVersionKind{
{Group: "batch", Version: "v1", Kind: "Job"},
{Group: "", Version: "v1", Kind: "ConfigMap"},
},
wantLists: []runtime.Object{
&batchv1.JobList{
TypeMeta: metav1.TypeMeta{Kind: "JobList", APIVersion: "batch/v1"},
Items: []batchv1.Job{
*wrappers.MakeJob("", metav1.NamespaceDefault).
Completions(1).
CompletionMode(batchv1.IndexedCompletion).
Profile("profile").
Mode(v1alpha1.SlurmMode).
WithContainer(*wrappers.MakeContainer("c1", "bash:4.4").
Command("bash", "/slurm/scripts/entrypoint.sh").
WithResources(corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceMemory: resource.MustParse("500M"),
},
}).
Obj()).
WithContainer(*wrappers.MakeContainer("c2", "bash:4.4").
Command("bash", "/slurm/scripts/entrypoint.sh").
WithResources(corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceMemory: resource.MustParse("500M"),
},
}).
Obj()).
Obj(),
},
},
&corev1.ConfigMapList{},
},
cmpopts: []cmp.Option{
cmpopts.IgnoreFields(corev1.LocalObjectReference{}, "Name"),
cmpopts.IgnoreFields(metav1.OwnerReference{}, "Name"),
cmpopts.IgnoreFields(corev1.PodSpec{}, "InitContainers"),
cmpopts.IgnoreTypes([]corev1.EnvVar{}),
cmpopts.IgnoreTypes([]corev1.Volume{}),
cmpopts.IgnoreTypes([]corev1.VolumeMount{}),
cmpopts.IgnoreTypes(corev1.ConfigMapList{}),
},
wantOutPattern: `job\.batch\/.+ created\\nconfigmap\/.+ created`,
},
"should create slurm with --priority flag": {
beforeTest: beforeSlurmTest,
afterTest: afterSlurmTest,
Expand Down
Binary file modified cmd/experimental/kjobctl/pkg/cmd/printcrds/embed/manifest.gz
Binary file not shown.
7 changes: 7 additions & 0 deletions cmd/experimental/kjobctl/pkg/parser/slurm.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ type ParsedSlurmFlags struct {
Array string
CpusPerTask *resource.Quantity
GpusPerTask map[string]*resource.Quantity
MemPerNode *resource.Quantity
MemPerTask *resource.Quantity
MemPerCPU *resource.Quantity
MemPerGPU *resource.Quantity
Expand Down Expand Up @@ -96,6 +97,12 @@ func SlurmFlags(script string, ignoreUnknown bool) (ParsedSlurmFlags, error) {
flags.Input = val
case "J", string(v1alpha1.JobNameFlag):
flags.JobName = val
case string(v1alpha1.MemPerNodeFlag):
memPerNode, err := resource.ParseQuantity(val)
if err != nil {
return flags, fmt.Errorf("cannot parse '%s': %w", val, err)
}
flags.MemPerNode = ptr.To(memPerNode)
case string(v1alpha1.MemPerCPUFlag):
memPerCPU, err := resource.ParseQuantity(val)
if err != nil {
Expand Down

0 comments on commit 42f7604

Please sign in to comment.