Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve observability of API statuses #2368

Merged
merged 46 commits into from
Jul 27, 2021
Merged
Show file tree
Hide file tree
Changes from 42 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
6485723
Limit istiod to 5 replicas and the default namespace
RobertLucian Jul 14, 2021
9e5dfaf
Move prometheus exporters and fluent bit to their dedicated namespace
RobertLucian Jul 14, 2021
060ebb9
Change the HPA's targets a bit
RobertLucian Jul 14, 2021
d65fafc
Redirect kubectl's output to /dev/null
RobertLucian Jul 14, 2021
4b96c20
Add logging namespace
RobertLucian Jul 14, 2021
0677208
Remove unnecessary namespace
RobertLucian Jul 14, 2021
bbcf4e3
Fixes
RobertLucian Jul 14, 2021
da66f7f
Refactoring a bit
RobertLucian Jul 14, 2021
56554fb
Fixes
RobertLucian Jul 14, 2021
77dc2e9
More refactoring
RobertLucian Jul 14, 2021
50249d8
Event exporter must be in logging namespace
RobertLucian Jul 14, 2021
23918df
Use consts where possible
RobertLucian Jul 14, 2021
a287457
Have all namespaces in a single yaml & revert temporary change
RobertLucian Jul 14, 2021
34045d4
Patch instead of applying the default namespace
RobertLucian Jul 15, 2021
7c13445
Merge branch 'master' into fix/istiod-oom
RobertLucian Jul 15, 2021
b24dff9
Fix waiting on the load balancer stage
RobertLucian Jul 15, 2021
6c9421d
More namespace fixes
RobertLucian Jul 15, 2021
7718df5
Merge branch 'master' into fix/istiod-oom
RobertLucian Jul 15, 2021
4f4daef
Create additional-scrape-configs in prometheus ns
RobertLucian Jul 15, 2021
2d572aa
Prometheus' service must be in prometheus ns
RobertLucian Jul 15, 2021
fde6ab0
Fix cortex cluster health cmd panicking
RobertLucian Jul 15, 2021
4c6b670
Fix getPodMemorySaturation function instead
RobertLucian Jul 15, 2021
042b2b3
Report live (x/y) and up-to-date replicas
RobertLucian Jul 15, 2021
dd50f2b
WIP on upgrading the statuses
RobertLucian Jul 15, 2021
5618e96
Merge branch 'master' into feature/better-api-statuses
RobertLucian Jul 20, 2021
4f9dc7c
WIP on API statuses
RobertLucian Jul 21, 2021
4cde8f4
WIP on API statuses
RobertLucian Jul 21, 2021
d7ca13f
WIP on API statuses
RobertLucian Jul 21, 2021
34dc5a3
Changes to the TrafficSplitter
RobertLucian Jul 22, 2021
51606a7
WIP on API statuses
RobertLucian Jul 23, 2021
fb15122
Fixes
RobertLucian Jul 23, 2021
943621a
Merge branch 'master' into feature/better-api-statuses
RobertLucian Jul 23, 2021
c5b0e73
Fix terminating status
RobertLucian Jul 23, 2021
8edee7a
Fix the worker counts for the batch jobs
RobertLucian Jul 26, 2021
a0ec6df
Output type (yaml) fixes
RobertLucian Jul 26, 2021
e2b6404
Fix
RobertLucian Jul 26, 2021
f2bc8bb
Update docs
RobertLucian Jul 26, 2021
af364a6
Address PR comments
RobertLucian Jul 27, 2021
12e3121
Merge branch 'master' into feature/better-api-statuses
RobertLucian Jul 27, 2021
f891aa3
Docs updates
RobertLucian Jul 27, 2021
121a669
Update statuses.md
deliahu Jul 27, 2021
c337f29
Update statuses.md
deliahu Jul 27, 2021
70a6132
Update statuses.md
deliahu Jul 27, 2021
4e0dc14
Update statuses.md
deliahu Jul 27, 2021
ce6a89f
Address PR comments
RobertLucian Jul 27, 2021
50f6304
Merge branch 'master' into feature/better-api-statuses
RobertLucian Jul 27, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cli/cluster/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

"github.com/cortexlabs/cortex/pkg/lib/errors"
"github.com/cortexlabs/cortex/pkg/lib/json"
"github.com/cortexlabs/cortex/pkg/lib/pointer"
"github.com/cortexlabs/cortex/pkg/lib/prompt"
s "github.com/cortexlabs/cortex/pkg/lib/strings"
"github.com/cortexlabs/cortex/pkg/operator/schema"
Expand Down Expand Up @@ -70,8 +71,7 @@ func getReadyRealtimeAPIReplicasOrNil(operatorConfig OperatorConfig, apiName str
return nil
}

totalReady := apiRes.Status.Updated.Ready + apiRes.Status.Stale.Ready
return &totalReady
return pointer.Int32(apiRes.Status.Ready)
}

func StopJob(operatorConfig OperatorConfig, kind userconfig.Kind, apiName string, jobID string) (schema.DeleteResponse, error) {
Expand Down
14 changes: 14 additions & 0 deletions cli/cluster/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,20 @@ func GetAPI(operatorConfig OperatorConfig, apiName string) ([]schema.APIResponse
return apiRes, nil
}

// DescribeAPI requests "/describe/<apiName>" from the operator and decodes the
// response body into a slice of schema.APIResponse.
//
// An error from the HTTP request is returned unchanged; a decoding failure is
// wrapped with the endpoint path and the raw response body.
func DescribeAPI(operatorConfig OperatorConfig, apiName string) ([]schema.APIResponse, error) {
	endpoint := "/describe/" + apiName

	body, err := HTTPGet(operatorConfig, endpoint)
	if err != nil {
		return nil, err
	}

	var apiResponses []schema.APIResponse
	err = json.Unmarshal(body, &apiResponses)
	if err != nil {
		return nil, errors.Wrap(err, endpoint, string(body))
	}

	return apiResponses, nil
}

func GetAPIByID(operatorConfig OperatorConfig, apiName string, apiID string) ([]schema.APIResponse, error) {
httpRes, err := HTTPGet(operatorConfig, "/get/"+apiName+"/"+apiID)
if err != nil {
Expand Down
113 changes: 113 additions & 0 deletions cli/cmd/describe.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
Copyright 2021 Cortex Labs, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cmd

import (
"fmt"

"github.com/cortexlabs/cortex/cli/cluster"
"github.com/cortexlabs/cortex/cli/types/cliconfig"
"github.com/cortexlabs/cortex/pkg/lib/errors"
"github.com/cortexlabs/cortex/pkg/lib/exit"
"github.com/cortexlabs/cortex/pkg/lib/telemetry"
"github.com/cortexlabs/cortex/pkg/types/userconfig"
"github.com/spf13/cobra"
)

// Column titles used as the headers of the table built by replicaCountTable.
const (
_titleReplicaStatus = "replica status"
_titleReplicaCount = "replica count"
)

// Flag values for the describe command; describeInit binds them to
// --env/-e and --watch/-w respectively.
var (
_flagDescribeEnv string
_flagDescribeWatch bool
)

// describeInit registers the describe command's flags (--env/-e and
// --watch/-w) and disables flag sorting on the command's flag set.
func describeInit() {
	flagSet := _describeCmd.Flags()
	flagSet.SortFlags = false

	flagSet.StringVarP(&_flagDescribeEnv, "env", "e", "", "environment to use")
	flagSet.BoolVarP(&_flagDescribeWatch, "watch", "w", false, "re-run the command every 2 seconds")
}

// _describeCmd implements `describe [API_NAME]`: it resolves the environment,
// emits a "cli.describe" telemetry event, and prints the describe table for
// the API through rerun (repeatedly when the watch flag is set).
var _describeCmd = &cobra.Command{
	Use:   "describe [API_NAME]",
	Short: "describe an api",
	Args:  cobra.ExactArgs(1),
	Run: func(cmd *cobra.Command, args []string) {
		apiName := args[0]

		// Start from the flag value and fall back to getEnvFromFlag("") only
		// when --env was not passed on the command line.
		envName := _flagDescribeEnv
		if !wasFlagProvided(cmd, "env") {
			var lookupErr error
			envName, lookupErr = getEnvFromFlag("")
			if lookupErr != nil {
				telemetry.Event("cli.describe")
				exit.Error(lookupErr)
			}
		}

		initialEnv, err := ReadOrConfigureEnv(envName)
		if err != nil {
			telemetry.Event("cli.describe")
			exit.Error(err)
		}
		telemetry.Event("cli.describe", map[string]interface{}{"env_name": initialEnv.Name})

		render := func() (string, error) {
			// The environment is read again on every invocation of the callback.
			currentEnv, err := ReadOrConfigureEnv(envName)
			if err != nil {
				exit.Error(err)
			}

			header, err := envStringIfNotSpecified(envName, cmd)
			if err != nil {
				return "", err
			}

			table, err := describeAPI(currentEnv, apiName)
			if err != nil {
				return "", err
			}

			return header + table, nil
		}

		rerun(_flagDescribeWatch, render)
	},
}

// describeAPI fetches the description of apiName from the operator configured
// for env and renders it as a table string.
//
// Realtime and async APIs are rendered by their respective table builders;
// any other kind produces an "unexpected" error. When the operator returns an
// empty list, the failure is reported through exit.Error.
func describeAPI(env cliconfig.Environment, apiName string) (string, error) {
	apisRes, err := cluster.DescribeAPI(MustGetOperatorConfig(env.Name), apiName)
	if err != nil {
		return "", err
	}

	if len(apisRes) == 0 {
		exit.Error(errors.ErrorUnexpected(fmt.Sprintf("unable to find api %s", apiName)))
	}

	apiRes := apisRes[0]
	kind := apiRes.Metadata.Kind

	switch kind {
	case userconfig.RealtimeAPIKind:
		return realtimeDescribeAPITable(apiRes, env)
	case userconfig.AsyncAPIKind:
		return asyncDescribeAPITable(apiRes, env)
	default:
		// Report the kind that was actually switched on and the requested API
		// name, instead of reading them from apiRes.Spec, which nothing else
		// in this function relies on.
		// NOTE(review): Spec may be unset in describe responses — confirm.
		return "", errors.ErrorUnexpected(fmt.Sprintf("encountered unexpected kind %s for api %s", kind, apiName))
	}
}
67 changes: 40 additions & 27 deletions cli/cmd/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,29 +35,28 @@ import (
libtime "github.com/cortexlabs/cortex/pkg/lib/time"
"github.com/cortexlabs/cortex/pkg/operator/schema"
"github.com/cortexlabs/cortex/pkg/types/userconfig"
"github.com/cortexlabs/yaml"
"github.com/spf13/cobra"
)

const (
_titleEnvironment = "env"
_titleRealtimeAPI = "realtime api"
_titleStatus = "status"
_titleAsyncAPI = "async api"
_titleLive = "live"
_titleUpToDate = "up-to-date"
_titleStale = "stale"
_titleRequested = "requested"
_titleFailed = "failed"
_titleLastupdated = "last update"
_titleLastUpdated = "last update"
)

var (
_flagGetEnv string
_flagWatch bool
_flagGetEnv string
_flagGetWatch bool
)

func getInit() {
_getCmd.Flags().SortFlags = false
_getCmd.Flags().StringVarP(&_flagGetEnv, "env", "e", "", "environment to use")
_getCmd.Flags().BoolVarP(&_flagWatch, "watch", "w", false, "re-run the command every 2 seconds")
_getCmd.Flags().BoolVarP(&_flagGetWatch, "watch", "w", false, "re-run the command every 2 seconds")
_getCmd.Flags().VarP(&_flagOutput, "output", "o", fmt.Sprintf("output format: one of %s", strings.Join(flags.OutputTypeStringsExcluding(flags.YAMLOutputType), "|")))
addVerboseFlag(_getCmd)
}
Expand Down Expand Up @@ -90,7 +89,7 @@ var _getCmd = &cobra.Command{
telemetry.Event("cli.get")
}

rerun(func() (string, error) {
rerun(_flagGetWatch, func() (string, error) {
if len(args) == 1 {
env, err := ReadOrConfigureEnv(envName)
if err != nil {
Expand All @@ -106,7 +105,7 @@ var _getCmd = &cobra.Command{
return "", err
}

if _flagOutput == flags.JSONOutputType {
if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType {
return apiTable, nil
}

Expand Down Expand Up @@ -136,7 +135,7 @@ var _getCmd = &cobra.Command{
if err != nil {
return "", err
}
if _flagOutput == flags.JSONOutputType {
if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType {
return jobTable, nil
}

Expand Down Expand Up @@ -166,7 +165,7 @@ var _getCmd = &cobra.Command{
return "", err
}

if _flagOutput == flags.JSONOutputType {
if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType {
return apiTable, nil
}

Expand Down Expand Up @@ -221,7 +220,7 @@ func getAPIsInAllEnvironments() (string, error) {

if err == nil {
for _, api := range apisRes {
switch api.Spec.Kind {
switch api.Metadata.Kind {
case userconfig.BatchAPIKind:
allBatchAPIEnvs = append(allBatchAPIEnvs, env.Name)
allBatchAPIs = append(allBatchAPIs, api)
Expand All @@ -247,12 +246,16 @@ func getAPIsInAllEnvironments() (string, error) {
allAPIsOutput = append(allAPIsOutput, apisOutput)
}

var bytes []byte
if _flagOutput == flags.JSONOutputType {
bytes, err := libjson.Marshal(allAPIsOutput)
if err != nil {
return "", err
}

bytes, err = libjson.Marshal(allAPIsOutput)
} else if _flagOutput == flags.YAMLOutputType {
bytes, err = yaml.Marshal(allAPIsOutput)
}
if err != nil {
return "", err
}
if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType {
return string(bytes), nil
}

Expand Down Expand Up @@ -337,11 +340,16 @@ func getAPIsByEnv(env cliconfig.Environment) (string, error) {
return "", err
}

var bytes []byte
if _flagOutput == flags.JSONOutputType {
bytes, err := libjson.Marshal(apisRes)
if err != nil {
return "", err
}
bytes, err = libjson.Marshal(apisRes)
} else if _flagOutput == flags.YAMLOutputType {
bytes, err = yaml.Marshal(apisRes)
}
if err != nil {
return "", err
}
if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType {
return string(bytes), nil
}

Expand Down Expand Up @@ -457,16 +465,21 @@ func getAPI(env cliconfig.Environment, apiName string) (string, error) {
return "", err
}

var bytes []byte
if _flagOutput == flags.JSONOutputType {
bytes, err := libjson.Marshal(apisRes)
if err != nil {
return "", err
}
bytes, err = libjson.Marshal(apisRes)
} else if _flagOutput == flags.YAMLOutputType {
bytes, err = yaml.Marshal(apisRes)
}
if err != nil {
return "", err
}
if _flagOutput == flags.JSONOutputType || _flagOutput == flags.YAMLOutputType {
return string(bytes), nil
}

if len(apisRes) == 0 {
exit.Error(errors.ErrorUnexpected(fmt.Sprintf("unable to find API %s", apiName)))
exit.Error(errors.ErrorUnexpected(fmt.Sprintf("unable to find api %s", apiName)))
}

apiRes := apisRes[0]
Expand Down
59 changes: 59 additions & 0 deletions cli/cmd/lib_apis.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
Copyright 2021 Cortex Labs, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cmd

import (
"github.com/cortexlabs/cortex/pkg/lib/table"
"github.com/cortexlabs/cortex/pkg/types/status"
)

// replicaCountTable builds a two-column table (replica status, replica count)
// with one row per entry of status.ReplicaCountTypes, in that order.
//
// Rows for the failed, killed, killed-OOM, image-pull-error, unknown and
// stalled count types are left out when their count is zero; every other
// count type is always listed.
func replicaCountTable(counts *status.ReplicaCounts) table.Table {
	var rows [][]interface{}

	for _, countType := range status.ReplicaCountTypes {
		count := counts.GetCountBy(countType)

		if count == 0 {
			// These count types are only shown when they are non-zero.
			switch countType {
			case status.ReplicaCountFailed,
				status.ReplicaCountKilled,
				status.ReplicaCountKilledOOM,
				status.ReplicaCountErrImagePull,
				status.ReplicaCountUnknown,
				status.ReplicaCountStalled:
				continue
			}
		}

		rows = append(rows, []interface{}{countType, count})
	}

	headers := []table.Header{
		{Title: _titleReplicaStatus, MinWidth: 32, MaxWidth: 32},
		{Title: _titleReplicaCount},
	}

	return table.Table{
		Headers: headers,
		Rows:    rows,
	}
}
Loading