Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix number of API replicas shown in cortex cluster info cmd #2140

Merged
merged 10 commits on
May 10, 2021
8 changes: 6 additions & 2 deletions cli/cmd/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -887,7 +887,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
numAPIInstances := len(infoResponse.NodeInfos)

var totalReplicas int
var doesClusterHaveGPUs, doesClusterHaveInfs bool
var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncAPIs bool
for _, nodeInfo := range infoResponse.NodeInfos {
totalReplicas += nodeInfo.NumReplicas
if nodeInfo.ComputeUserCapacity.GPU > 0 {
Expand All @@ -896,6 +896,9 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
if nodeInfo.ComputeUserCapacity.Inf > 0 {
doesClusterHaveInfs = true
}
if nodeInfo.NumAsyncGatewayReplicas > 0 {
doesClusterHaveAsyncAPIs = true
}
}

var pendingReplicasStr string
Expand All @@ -913,6 +916,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
{Title: "instance type"},
{Title: "lifecycle"},
{Title: "replicas"},
{Title: "async gateway replicas", Hidden: !doesClusterHaveAsyncAPIs},
{Title: "CPU (requested / total allocatable)"},
{Title: "memory (requested / total allocatable)"},
{Title: "GPU (requested / total allocatable)", Hidden: !doesClusterHaveGPUs},
Expand All @@ -930,7 +934,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
memStr := nodeInfo.ComputeUserRequested.Mem.String() + " / " + nodeInfo.ComputeUserCapacity.Mem.String()
gpuStr := s.Int64(nodeInfo.ComputeUserRequested.GPU) + " / " + s.Int64(nodeInfo.ComputeUserCapacity.GPU)
infStr := s.Int64(nodeInfo.ComputeUserRequested.Inf) + " / " + s.Int64(nodeInfo.ComputeUserCapacity.Inf)
rows = append(rows, []interface{}{nodeInfo.InstanceType, lifecycle, nodeInfo.NumReplicas, cpuStr, memStr, gpuStr, infStr})
rows = append(rows, []interface{}{nodeInfo.InstanceType, lifecycle, nodeInfo.NumReplicas, nodeInfo.NumAsyncGatewayReplicas, cpuStr, memStr, gpuStr, infStr})
}

t := table.Table{
Expand Down
8 changes: 7 additions & 1 deletion pkg/operator/endpoints/info.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) {
pod := pods[i]

_, isAPIPod := pod.Labels["apiName"]
asyncDeploymentType, isAsyncPod := pod.Labels["cortex.dev/async"]

if pod.Spec.NodeName == "" && isAPIPod {
numPendingReplicas++
Expand All @@ -118,7 +119,12 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) {
}

if isAPIPod {
node.NumReplicas++
if !isAsyncPod || asyncDeploymentType == "api" {
node.NumReplicas++
}
// Count a pod as an async gateway replica only when it carries the
// "cortex.dev/async" label with the value "gateway". The previous condition
// (!isAsyncPod || ...) was also true for every pod without the label, so those
// pods were tallied as gateway replicas and the "async gateway replicas"
// column was shown even when no async API was deployed.
if isAsyncPod && asyncDeploymentType == "gateway" {
	node.NumAsyncGatewayReplicas++
}
}

cpu, mem, gpu, inf := k8s.TotalPodCompute(&pod.Spec)
Expand Down
19 changes: 10 additions & 9 deletions pkg/operator/schema/schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,16 @@ type InfoResponse struct {
}

// NodeInfo is the per-node record carried in the cluster info response; each
// field is serialized under the JSON key given in its struct tag.
type NodeInfo struct {
Name string `json:"name"`
NodeGroupName string `json:"nodegroup_name"`
InstanceType string `json:"instance_type"`
IsSpot bool `json:"is_spot"`
Price float64 `json:"price"`
NumReplicas int `json:"num_replicas"`
ComputeUserCapacity userconfig.Compute `json:"compute_user_capacity"` // the total resources available to the user on a node
ComputeAvailable userconfig.Compute `json:"compute_available"` // unused resources on a node
ComputeUserRequested userconfig.Compute `json:"compute_user_requested"` // total resources requested by user on a node
Name string `json:"name"`
NodeGroupName string `json:"nodegroup_name"`
InstanceType string `json:"instance_type"`
IsSpot bool `json:"is_spot"`
Price float64 `json:"price"`
NumReplicas int `json:"num_replicas"`
NumAsyncGatewayReplicas int `json:"num_async_gateway_replicas"`
ComputeUserCapacity userconfig.Compute `json:"compute_user_capacity"` // the total resources available to the user on a node
ComputeAvailable userconfig.Compute `json:"compute_available"` // unused resources on a node
ComputeUserRequested userconfig.Compute `json:"compute_user_requested"` // total resources requested by user on a node
}

type DeployResult struct {
Expand Down