From e830a703ed53519dc71196a2e238c974f10f52d9 Mon Sep 17 00:00:00 2001 From: Vanessasaurus <814322+vsoch@users.noreply.github.com> Date: Tue, 8 Aug 2023 20:16:05 -0600 Subject: [PATCH] wip to add resources (#27) * wip to add resources * add support to ask for thread level detail for the perf-sysstat metric this is not tested yet - I am going to test on Google Cloud with >1 node Signed-off-by: vsoch --- README.md | 2 - api/v1alpha1/metric_types.go | 20 ++++ api/v1alpha1/zz_generated.deepcopy.go | 58 ++++++++++++ .../bases/flux-framework.org_metricsets.yaml | 27 ++++++ docs/_static/data/metrics.json | 14 +-- .../custom-resource-definition.md | 48 +++++++++- docs/getting_started/metrics.md | 8 +- examples/dist/metrics-operator-arm.yaml | 26 ++++++ examples/dist/metrics-operator.yaml | 26 ++++++ pkg/metrics/application.go | 9 +- pkg/metrics/containers.go | 11 ++- pkg/metrics/jobset.go | 13 ++- pkg/metrics/logs.go | 15 ++- pkg/metrics/network/netmark.go | 14 ++- pkg/metrics/network/osu-benchmark.go | 13 ++- pkg/metrics/perf/sysstat.go | 66 +++++++++---- pkg/metrics/resources.go | 93 +++++++++++++++++++ pkg/metrics/storage.go | 5 +- sdk/python/v1alpha1/CHANGELOG.md | 1 + sdk/python/v1alpha1/setup.py | 2 +- 20 files changed, 421 insertions(+), 50 deletions(-) create mode 100644 pkg/metrics/resources.go diff --git a/README.md b/README.md index fa444aa..0ebf85d 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,7 @@ To learn more: ## Dinosaur TODO - We need a way for the entrypoint command to monitor (based on the container) to differ (potentially) -- add resource limits / requests - make flux operator command generator -- Find better logging library for logging outside of controller (go 1.21 has a logging library!) - For larger metric collections, we should have a log streaming mode (and not wait for Completed/Successful) - For services we are measuring, we likely need to be able to kill after N seconds (to complete job) or to specify the success policy on the metrics containers instead of the application - Python function to save entire spec to yaml (for MetricSet and JobSet)? diff --git a/api/v1alpha1/metric_types.go b/api/v1alpha1/metric_types.go index a52c95a..3bd5a39 100644 --- a/api/v1alpha1/metric_types.go +++ b/api/v1alpha1/metric_types.go @@ -66,6 +66,10 @@ type MetricSetSpec struct { // +optional Pods int32 `json:"pods"` + // Resources include limits and requests for each pod (that include a JobSet) + // +optional + Resources ContainerResource `json:"resources"` + // Single pod completion, meaning the jobspec completions is unset // and we only require one main completion // +optional @@ -98,11 +102,27 @@ type Application struct { //+optional PullSecret string `json:"pullSecret"` + // Resources include limits and requests for the application + // +optional + Resources ContainerResources `json:"resources"` + // Existing Volumes for the application // +optional Volumes map[string]Volume `json:"volumes"` } +// ContainerResources include limits and requests +type ContainerResources struct { + + // +optional + Limits ContainerResource `json:"limits"` + + // +optional + Requests ContainerResource `json:"requests"` +} + +type ContainerResource map[string]intstr.IntOrString + // A Volume should correspond with an existing volume, either: // config map, secret, or claim name. This will be added soon. type Volume struct { diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index a2b2cc9..00e4245 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -29,6 +29,7 @@ import ( // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Application) DeepCopyInto(out *Application) { *out = *in + in.Resources.DeepCopyInto(&out.Resources) if in.Volumes != nil { in, out := &in.Volumes, &out.Volumes *out = make(map[string]Volume, len(*in)) @@ -48,6 +49,56 @@ func (in *Application) DeepCopy() *Application { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in ContainerResource) DeepCopyInto(out *ContainerResource) { + { + in := &in + *out = make(ContainerResource, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ContainerResource. +func (in ContainerResource) DeepCopy() ContainerResource { + if in == nil { + return nil + } + out := new(ContainerResource) + in.DeepCopyInto(out) + return *out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ContainerResources) DeepCopyInto(out *ContainerResources) { + *out = *in + if in.Limits != nil { + in, out := &in.Limits, &out.Limits + *out = make(ContainerResource, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.Requests != nil { + in, out := &in.Requests, &out.Requests + *out = make(ContainerResource, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ContainerResources. +func (in *ContainerResources) DeepCopy() *ContainerResources { + if in == nil { + return nil + } + out := new(ContainerResources) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Metric) DeepCopyInto(out *Metric) { *out = *in @@ -180,6 +231,13 @@ func (in *MetricSetSpec) DeepCopyInto(out *MetricSetSpec) { } in.Storage.DeepCopyInto(&out.Storage) in.Application.DeepCopyInto(&out.Application) + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = make(ContainerResource, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricSetSpec. diff --git a/config/crd/bases/flux-framework.org_metricsets.yaml b/config/crd/bases/flux-framework.org_metricsets.yaml index 97d14bb..a023aa5 100644 --- a/config/crd/bases/flux-framework.org_metricsets.yaml +++ b/config/crd/bases/flux-framework.org_metricsets.yaml @@ -51,6 +51,24 @@ spec: pullSecret: description: A pull secret for the application container type: string + resources: + description: Resources include limits and requests for the application + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + type: object volumes: additionalProperties: description: 'A Volume should correspond with an existing volume, @@ -172,6 +190,15 @@ spec: description: Parallelism (e.g., pods) format: int32 type: integer + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + description: Resources include limits and requests for each pod (that + include a JobSet) + type: object serviceName: default: ms description: Service name for the JobSet (MetricsSet) cluster network diff --git a/docs/_static/data/metrics.json b/docs/_static/data/metrics.json index 9504e12..557aead 100644 --- a/docs/_static/data/metrics.json +++ b/docs/_static/data/metrics.json @@ -1,4 +1,11 @@ [ + { + "name": "io-sysstat", + "description": "statistics for Linux tasks (processes) : I/O, CPU, memory, etc.", + "type": "storage", + "image": "ghcr.io/converged-computing/metric-sysstat:latest", + "url": "https://github.com/sysstat/sysstat" + }, { "name": "network-netmark", "description": "point to point networking tool", @@ -19,12 +26,5 @@ "type": "application", "image": "ghcr.io/converged-computing/metric-sysstat:latest", "url": "https://github.com/sysstat/sysstat" - }, - { - "name": "io-sysstat", - "description": "statistics for Linux tasks (processes) : I/O, CPU, memory, etc.", - "type": "storage", - "image": "ghcr.io/converged-computing/metric-sysstat:latest", - "url": "https://github.com/sysstat/sysstat" } ] \ No newline at end of file diff --git a/docs/getting_started/custom-resource-definition.md b/docs/getting_started/custom-resource-definition.md index 0974b5b..de76d54 100644 --- a/docs/getting_started/custom-resource-definition.md +++ b/docs/getting_started/custom-resource-definition.md @@ -91,6 +91,18 @@ spec: An application is allowed to have one or more existing volumes. An existing volume can be any of the types described in [existing volumes](#existing-volumes) +#### resources + +Resource lists for an application container go under [Overhead](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-overhead/). Known keys include "memory" and "cpu" (should be provided in some +string format that can be parsed) and all others are considered some kind of quantity request. + +```yaml +pod: + resources: + memory: 500M + cpu: 4 +``` + ### storage When you want to measure some storage performance, you'll want to add a "storage" section to your MetricSet. This will typically just be a reference to some existing storage (see [existing volumes](#existing-volumes)) that we want to measure, and can @@ -120,7 +132,7 @@ spec: rate: 20 ``` -### completions +#### completions Completions for a metric are relevant if you are assessing storage (which doesn't have an application runtime) or a service application that will continue to run forever. When this value is set to 0, it essentially indicates no set number of completions (meaning we run forever). Any non-zero value will ensure the metric runs for that many completions before exiting. @@ -134,7 +146,7 @@ spec: This is usually suggested to provide for a storage metric. -### options +#### options Metrics can take custom options, which are key value pairs of a string key and either string or integer value. These come in three types: @@ -164,6 +176,38 @@ Presence of absence of an option type depends on the metric. Metrics are free to options as they see fit. +## resources + +Resources for an entire spec are given to the Pod template of the Job. They can include limits and requests. Known keys include "memory" and "cpu" (should be provided in some +string format that can be parsed) and all others are considered some kind of quantity request. + +```yaml +resources: + limits: + memory: 500M + cpu: 4 +``` + +If you wanted to, for example, request a GPU, that might look like: + +```yaml +resources: + limits: + gpu-vendor.example/example-gpu: 1 +``` + +Or for a particulat type of networking fabric: + +```yaml +resources: + limits: + vpc.amazonaws.com/efa: 1 +``` + +Both limits and resources are flexible to accept a string or an integer value, and you'll get an error if you +provide something else. If you need something else, [let us know](https://github.com/converged-computing/metrics-operator/issues). +If you are requesting GPU, [this documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/) is helpful. + ## Existing Volumes An existing volume can be provided to support an application (multiple) or one can be provided for assessing its performance (single). diff --git a/docs/getting_started/metrics.md b/docs/getting_started/metrics.md index 8f1b91c..472227b 100644 --- a/docs/getting_started/metrics.md +++ b/docs/getting_started/metrics.md @@ -31,7 +31,8 @@ This metric provides the "pidstat" executable of the sysstat library. The follow |Name | Description | Type | Default | |-----|-------------|------------|------| | color | Set to turn on color parsing | Anything set | unset | -| pids | For debugging, show consistent output of ps aux | Anything set | Unset | +| pids | For debugging, show consistent output of ps aux | Anything set | unset | +| threads | add `-t` to each pidstat command to indicate wanting thread-level output | unset | By default color and pids are set to false anticipating log parsing. And we also provide the option to see "commands" or specific commands based on a job index to the metric. @@ -51,8 +52,11 @@ and the rest (workers). "all": /usr/libexec/flux/cmd/flux-broker --config /etc/flux/config -Scron.directory=/etc/flux/system/cron.d -Stbon.fanout "0": /usr/bin/python3.8 /usr/libexec/flux/cmd/flux-submit.py -n 2 --quiet --watch lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite ``` + In the map above, order matters, as the command for all indices is first set to be the flux-broker one, and then -after the index at 0 gets a custom command. +after the index at 0 gets a custom command. See [pidstat](https://man7.org/linux/man-pages/man1/pidstat.1.html) for +more information on this command, and [this file](https://github.com/converged-computing/metrics-operator/blob/main/pkg/metrics/perf/sysstat.go) +for how we use them. If there is an option or command that is not exposed that you would like, please [open an issue](https://github.com/converged-computing/metrics-operator/issues). ### Storage diff --git a/examples/dist/metrics-operator-arm.yaml b/examples/dist/metrics-operator-arm.yaml index 64bad40..2cc23f8 100644 --- a/examples/dist/metrics-operator-arm.yaml +++ b/examples/dist/metrics-operator-arm.yaml @@ -57,6 +57,24 @@ spec: pullSecret: description: A pull secret for the application container type: string + resources: + description: Resources include limits and requests for the application + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + type: object volumes: additionalProperties: description: 'A Volume should correspond with an existing volume, either: config map, secret, or claim name. This will be added soon.' @@ -164,6 +182,14 @@ spec: description: Parallelism (e.g., pods) format: int32 type: integer + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + description: Resources include limits and requests for each pod (that include a JobSet) + type: object serviceName: default: ms description: Service name for the JobSet (MetricsSet) cluster network diff --git a/examples/dist/metrics-operator.yaml b/examples/dist/metrics-operator.yaml index f229162..427dc12 100644 --- a/examples/dist/metrics-operator.yaml +++ b/examples/dist/metrics-operator.yaml @@ -57,6 +57,24 @@ spec: pullSecret: description: A pull secret for the application container type: string + resources: + description: Resources include limits and requests for the application + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + type: object + type: object volumes: additionalProperties: description: 'A Volume should correspond with an existing volume, either: config map, secret, or claim name. This will be added soon.' @@ -164,6 +182,14 @@ spec: description: Parallelism (e.g., pods) format: int32 type: integer + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + x-kubernetes-int-or-string: true + description: Resources include limits and requests for each pod (that include a JobSet) + type: object serviceName: default: ms description: Service name for the JobSet (MetricsSet) cluster network diff --git a/pkg/metrics/application.go b/pkg/metrics/application.go index 50ac6ab..6404b51 100644 --- a/pkg/metrics/application.go +++ b/pkg/metrics/application.go @@ -8,8 +8,6 @@ SPDX-License-Identifier: MIT package metrics import ( - "fmt" - api "github.com/converged-computing/metrics-operator/api/v1alpha1" jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2" ) @@ -45,7 +43,10 @@ func GetApplicationReplicatedJobs( m := (*metric) // This defaults to one replicated job, named "m" - job := GetReplicatedJob(spec, shareProcessNamespace, spec.Spec.Pods, spec.Spec.Completions, "") + job, err := GetReplicatedJob(spec, shareProcessNamespace, spec.Spec.Pods, spec.Spec.Completions, "") + if err != nil { + return rjs, err + } // Add volumes expecting an application. GetVolumes creates metric entrypoint volumes // and adds existing volumes (application) to our set of mounts. We need both @@ -75,7 +76,7 @@ func GetApplicationReplicatedJobs( ) if err != nil { - fmt.Printf("There was an error getting containers for %s: %s\n", m.Name(), err) + logger.Errorf("There was an error getting containers for %s: %s\n", m.Name(), err) return rjs, err } job.Template.Spec.Template.Spec.Containers = containers diff --git a/pkg/metrics/containers.go b/pkg/metrics/containers.go index eda86a0..0bb65ec 100644 --- a/pkg/metrics/containers.go +++ b/pkg/metrics/containers.go @@ -115,7 +115,16 @@ func GetContainers( } // If our metric set has an application, add it last + // We currently accept resources for an application (but not metrics yet) if set.HasApplication() { + + // Prepare container resources + resources, err := getContainerResources(&set.Spec.Application.Resources) + logger.Info("🌀 Application", "Container.Resources", resources) + if err != nil { + return containers, err + } + command := []string{"/bin/bash", "-c", set.Spec.Application.Entrypoint} appContainer := corev1.Container{ Name: "app", @@ -131,6 +140,6 @@ func GetContainers( } containers = append(containers, appContainer) } - fmt.Printf("🟪️ Adding %d containers\n", len(containers)) + logger.Infof("🟪️ Adding %d containers\n", len(containers)) return containers, nil } diff --git a/pkg/metrics/jobset.go b/pkg/metrics/jobset.go index 4f2f356..54e6aa3 100644 --- a/pkg/metrics/jobset.go +++ b/pkg/metrics/jobset.go @@ -117,7 +117,7 @@ func GetReplicatedJob( pods int32, completions int32, jobname string, -) *jobset.ReplicatedJob { +) (*jobset.ReplicatedJob, error) { // Default replicated job name, if not set if jobname == "" { @@ -182,8 +182,15 @@ func GetReplicatedJob( } // Should we add resources back? - // jobspec.Template.Spec.Overhead = resources + resources, err := getPodResources(set) + logger.Info("🌀 MiniCluster", "Pod.Resources", resources) + if err != nil { + logger.Info("🌀 MiniCluster", "Pod.Resources", resources) + return &job, err + } + jobspec.Template.Spec.Overhead = resources + // Tie the jobspec to the job job.Template.Spec = jobspec - return &job + return &job, nil } diff --git a/pkg/metrics/logs.go b/pkg/metrics/logs.go index 3245a20..45f72a4 100644 --- a/pkg/metrics/logs.go +++ b/pkg/metrics/logs.go @@ -10,9 +10,11 @@ package metrics import ( "encoding/json" "fmt" + "log" api "github.com/converged-computing/metrics-operator/api/v1alpha1" "github.com/converged-computing/metrics-operator/pkg/utils" + "go.uber.org/zap" "k8s.io/apimachinery/pkg/util/intstr" ) @@ -21,6 +23,8 @@ var ( Separator = "METRICS OPERATOR TIMEPOINT" CollectionStart = "METRICS OPERATOR COLLECTION START" CollectionEnd = "METRICS OPERATOR COLLECTION END" + handle *zap.Logger + logger *zap.SugaredLogger ) // Metric Export is a flattened structure with minimal required metadata for now @@ -81,9 +85,18 @@ func Metadata(set *api.MetricSet, metric *Metric) string { } metadata, err := json.Marshal(export) if err != nil { - fmt.Printf("Warning, error serializing spec metadata: %s", err.Error()) + logger.Errorf("Warning, error serializing spec metadata: %s", err.Error()) } // We need to escape the quotes for printing in bash metadataEscaped := utils.EscapeCharacters(string(metadata)) return fmt.Sprintf("METADATA START %s\nMETADATA END", metadataEscaped) } + +func init() { + handle, err := zap.NewProduction() + if err != nil { + log.Fatalf("can't initialize zap logger: %v", err) + } + logger = handle.Sugar() + defer handle.Sync() +} diff --git a/pkg/metrics/network/netmark.go b/pkg/metrics/network/netmark.go index 875f28b..de82c6a 100644 --- a/pkg/metrics/network/netmark.go +++ b/pkg/metrics/network/netmark.go @@ -105,9 +105,18 @@ func (n Netmark) getMetricsKeyToPath() []corev1.KeyToPath { // Replicated Jobs are custom for this standalone metric func (m Netmark) ReplicatedJobs(spec *api.MetricSet) ([]jobset.ReplicatedJob, error) { + js := []jobset.ReplicatedJob{} + // Generate a replicated job for the launcher (netmark) and workers - launcher := metrics.GetReplicatedJob(spec, false, 1, 1, "n") - workers := metrics.GetReplicatedJob(spec, false, spec.Spec.Pods-1, spec.Spec.Pods-1, "w") + launcher, err := metrics.GetReplicatedJob(spec, false, 1, 1, "n") + if err != nil { + return js, err + } + + workers, err := metrics.GetReplicatedJob(spec, false, spec.Spec.Pods-1, spec.Spec.Pods-1, "w") + if err != nil { + return js, err + } // Add volumes defined under storage. v := map[string]api.Volume{} @@ -137,7 +146,6 @@ func (m Netmark) ReplicatedJobs(spec *api.MetricSet) ([]jobset.ReplicatedJob, er Command: []string{"/bin/bash", m.workerScript}, }, } - js := []jobset.ReplicatedJob{} // Derive the containers, one per metric // This will also include mounts for volumes diff --git a/pkg/metrics/network/osu-benchmark.go b/pkg/metrics/network/osu-benchmark.go index 9cb0ba5..56b107e 100644 --- a/pkg/metrics/network/osu-benchmark.go +++ b/pkg/metrics/network/osu-benchmark.go @@ -100,9 +100,17 @@ func (n OSUBenchmark) getMetricsKeyToPath() []corev1.KeyToPath { // Replicated Jobs are custom for this standalone metric func (m OSUBenchmark) ReplicatedJobs(spec *api.MetricSet) ([]jobset.ReplicatedJob, error) { + js := []jobset.ReplicatedJob{} + // Generate a replicated job for the launcher (netmark) and workers - launcher := metrics.GetReplicatedJob(spec, false, 1, 1, "l") - workers := metrics.GetReplicatedJob(spec, false, 1, 1, "w") + launcher, err := metrics.GetReplicatedJob(spec, false, 1, 1, "l") + if err != nil { + return js, err + } + workers, err := metrics.GetReplicatedJob(spec, false, 1, 1, "w") + if err != nil { + return js, err + } // Add volumes defined under storage. v := map[string]api.Volume{} @@ -132,7 +140,6 @@ func (m OSUBenchmark) ReplicatedJobs(spec *api.MetricSet) ([]jobset.ReplicatedJo WorkingDir: m.WorkingDir(), }, } - js := []jobset.ReplicatedJob{} // Derive the containers, one per metric // This will also include mounts for volumes diff --git a/pkg/metrics/perf/sysstat.go b/pkg/metrics/perf/sysstat.go index 0af974d..c66c690 100644 --- a/pkg/metrics/perf/sysstat.go +++ b/pkg/metrics/perf/sysstat.go @@ -28,9 +28,10 @@ type PidStat struct { container string // Options - useColor bool - showPIDS bool - commands map[string]intstr.IntOrString + useColor bool + showPIDS bool + useThreads bool + commands map[string]intstr.IntOrString } // Name returns the metric name @@ -76,6 +77,10 @@ func (m *PidStat) SetOptions(metric *api.Metric) { if ok { m.showPIDS = true } + _, ok = metric.Options["threads"] + if ok { + m.useThreads = true + } // Parse map options commands, ok := metric.MapOptions["commands"] @@ -91,9 +96,22 @@ func (m PidStat) ReplicatedJobs(spec *api.MetricSet) ([]jobset.ReplicatedJob, er // Exported options and list options func (m PidStat) Options() map[string]intstr.IntOrString { + + // Prepare bool options + showPIDS := "false" + if m.showPIDS { + showPIDS = "true" + } + useThreads := "false" + if m.useThreads { + useThreads = "true" + } + return map[string]intstr.IntOrString{ "rate": intstr.FromInt(int(m.rate)), "completions": intstr.FromInt(int(m.completions)), + "threads": intstr.FromString(useThreads), + "pids": intstr.FromString(showPIDS), } } func (m PidStat) ListOptions() map[string][]intstr.IntOrString { @@ -158,16 +176,23 @@ func (m PidStat) EntrypointScripts( showPIDS = "ps aux\npstree ${pid}" } + useThreads := "" + if m.useThreads { + useThreads = " -t " + } // Prepare custom logic to determine command command := m.prepareIndexedCommand(spec) template := `#!/bin/bash echo "%s" # Download the wait binary -wget https://github.com/converged-computing/goshare/releases/download/2023-07-27/wait +wget https://github.com/converged-computing/goshare/releases/download/2023-07-27/wait > /dev/null chmod +x ./wait mv ./wait /usr/bin/goshare-wait +# Do we want to use threads? +threads="%s" + # This is logic to determine the command, it will set $command # We do this because command to watch can vary between worker pods %s @@ -190,46 +215,47 @@ while true echo "%s" %s echo "CPU STATISTICS TASK" - pidstat -p ${pid} -u -h -T TASK | jc --pidstat + pidstat -p ${pid} -u -h $threads -T TASK | jc --pidstat echo "CPU STATISTICS CHILD" - pidstat -p ${pid} -u -h -T CHILD | jc --pidstat + pidstat -p ${pid} -u -h $threads -T CHILD | jc --pidstat echo "KERNEL STATISTICS" - pidstat -p ${pid} -d -h -T ALL | jc --pidstat + pidstat -p ${pid} -d -h $threads -T ALL | jc --pidstat echo "POLICY" - pidstat -p ${pid} -R -h -T ALL | jc --pidstat + pidstat -p ${pid} -R -h $threads -T ALL | jc --pidstat echo "PAGEFAULTS TASK" - pidstat -p ${pid} -r -h -T TASK | jc --pidstat + pidstat -p ${pid} -r -h $threads -T TASK | jc --pidstat echo "PAGEFAULTS CHILD" - pidstat -p ${pid} -r -h -T CHILD | jc --pidstat + pidstat -p ${pid} -r -h $threads -T CHILD | jc --pidstat echo "STACK UTILIZATION" - pidstat -p ${pid} -s -h -T ALL | jc --pidstat + pidstat -p ${pid} -s -h $threads -T ALL | jc --pidstat echo "THREADS TASK" - pidstat -p ${pid} -t -h -T TASK | jc --pidstat + pidstat -p ${pid} -t -h $threads -T TASK | jc --pidstat echo "THREADS CHILD" - pidstat -p ${pid} -t -h -T CHILD | jc --pidstat + pidstat -p ${pid} -t -h $threads -T CHILD | jc --pidstat echo "KERNEL TABLES" - pidstat -p ${pid} -v -h -T ALL | jc --pidstat + pidstat -p ${pid} -v -h $threads -T ALL | jc --pidstat echo "TASK SWITCHING" - pidstat -p ${pid} -w -h -T ALL | jc --pidstat + pidstat -p ${pid} -w -h $threads -T ALL | jc --pidstat # Check if still running ps -p ${pid} > /dev/null retval=$? - if [[ $retval -ne 0 ]]; then + if [[ $retval -ne 0 ]]; then echo "%s" exit 0 fi - if [[ $completions -ne 0 ]] && [[ $i -eq $completions ]]; then - echo "%s" + if [[ $completions -ne 0 ]] && [[ $i -eq $completions ]]; then + echo "%s" exit 0 fi - sleep %d - let i=i+1 + sleep %d + let i=i+1 done ` script := fmt.Sprintf( template, metadata, + useThreads, command, useColor, m.completions, diff --git a/pkg/metrics/resources.go b/pkg/metrics/resources.go new file mode 100644 index 0000000..bd2df96 --- /dev/null +++ b/pkg/metrics/resources.go @@ -0,0 +1,93 @@ +/* +Copyright 2022-2023 Lawrence Livermore National Security, LLC + (c.f. AUTHORS, NOTICE.LLNS, COPYING) + +SPDX-License-Identifier: MIT +*/ + +package metrics + +import ( + "fmt" + + api "github.com/converged-computing/metrics-operator/api/v1alpha1" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/util/intstr" +) + +// getResourceGroup can return a ResourceList for either requests or limits +func getResourceGroup(items api.ContainerResource) (corev1.ResourceList, error) { + + logger.Info("🍅️ Resource", "items", items) + list := corev1.ResourceList{} + for key, unknownValue := range items { + if unknownValue.Type == intstr.Int { + + value := unknownValue.IntVal + logger.Info("🍅️ ResourceKey", "Key", key, "Value", value) + limit, err := resource.ParseQuantity(fmt.Sprintf("%d", value)) + if err != nil { + return list, err + } + + if key == "memory" { + list[corev1.ResourceMemory] = limit + } else if key == "cpu" { + list[corev1.ResourceCPU] = limit + } else { + list[corev1.ResourceName(key)] = limit + } + } else if unknownValue.Type == intstr.String { + + value := unknownValue.StrVal + logger.Info("🍅️ ResourceKey", "Key", key, "Value", value) + if key == "memory" { + list[corev1.ResourceMemory] = resource.MustParse(value) + } else if key == "cpu" { + list[corev1.ResourceCPU] = resource.MustParse(value) + } else { + list[corev1.ResourceName(key)] = resource.MustParse(value) + } + } + } + return list, nil +} + +// getContainerResources determines if any resources are requested via the spec +func getContainerResources(spec *api.ContainerResources) (corev1.ResourceRequirements, error) { + + // memory int, setCPURequest, setCPULimit, setGPULimit int64 + resources := corev1.ResourceRequirements{} + + // Limits + limits, err := getResourceGroup(spec.Limits) + if err != nil { + logger.Error("🍅️ Resources for Container.Limits", err.Error()) + return resources, err + } + resources.Limits = limits + + // Requests + requests, err := getResourceGroup(spec.Requests) + if err != nil { + logger.Error("🍅️ Resources for Container.Requests", err.Error()) + return resources, err + } + resources.Requests = requests + return resources, nil + +} + +// getPodResources determines if any resources are requested via the spec +func getPodResources(set *api.MetricSet) (corev1.ResourceList, error) { + + // memory int, setCPURequest, setCPULimit, setGPULimit int64 + resources, err := getResourceGroup(set.Spec.Resources) + if err != nil { + logger.Error("🍅️ Resources for Pod.Resources", err.Error()) + return resources, err + } + return resources, nil +} diff --git a/pkg/metrics/storage.go b/pkg/metrics/storage.go index 1255e3e..3bce5da 100644 --- a/pkg/metrics/storage.go +++ b/pkg/metrics/storage.go @@ -22,7 +22,10 @@ func (m *StorageMetricSet) ReplicatedJobs(spec *api.MetricSet) ([]jobset.Replica // Storage metrics do not need to share the process namespace // The jobname empty string will use the default - job := GetReplicatedJob(spec, false, spec.Spec.Pods, spec.Spec.Completions, "") + job, err := GetReplicatedJob(spec, false, spec.Spec.Pods, spec.Spec.Completions, "") + if err != nil { + return rjs, err + } // Add volumes expecting an application. // A storage app is required to have a volume diff --git a/sdk/python/v1alpha1/CHANGELOG.md b/sdk/python/v1alpha1/CHANGELOG.md index 765a742..961315b 100644 --- a/sdk/python/v1alpha1/CHANGELOG.md +++ b/sdk/python/v1alpha1/CHANGELOG.md @@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are: The versions coincide with releases on pip. Only major versions will be released as tags on Github. ## [0.0.x](https://github.com/converged-computing/metrics-operator/tree/main) (0.0.x) + - resources specification added and tweaks to perf-sysstat (0.0.14) - pidstat python parser and better support for metric in Go (0.0.13) - Separation of parsing logs into separate metric module functions (0.0.12.1) - Support for Netmark parser and plotting example (0.0.12) diff --git a/sdk/python/v1alpha1/setup.py b/sdk/python/v1alpha1/setup.py index 7d51256..a0d2f9e 100644 --- a/sdk/python/v1alpha1/setup.py +++ b/sdk/python/v1alpha1/setup.py @@ -30,7 +30,7 @@ if __name__ == "__main__": setup( name="metricsoperator", - version="0.0.13", + version="0.0.14", author="Vanessasaurus", author_email="vsoch@users.noreply.github.com", maintainer="Vanessasaurus",