This repository has been archived by the owner on Jan 8, 2024. It is now read-only.

Papercuts in kubernetes server install #2412

Merged 5 commits on Oct 6, 2021
8 changes: 8 additions & 0 deletions .changelog/2412.txt
@@ -0,0 +1,8 @@
```release-note:improvement
cli/serverinstall/k8s: Fix a problem where healthy deployments would be marked as "Degraded".
```

```release-note:improvement
cli/serverinstall/k8s: Add new cluster role and binding to allow nodeport services to work
```

16 changes: 5 additions & 11 deletions builtin/k8s/platform.go
@@ -185,17 +185,11 @@ func (p *Platform) resourceDeploymentStatus(
}
}

var deployHealth sdk.StatusReport_Health
switch mostRecentCondition.Type {
case v1.DeploymentAvailable:
deployHealth = sdk.StatusReport_READY
case v1.DeploymentProgressing:
deployHealth = sdk.StatusReport_ALIVE
case v1.DeploymentReplicaFailure:
deployHealth = sdk.StatusReport_DOWN
default:
deployHealth = sdk.StatusReport_UNKNOWN
}
// The most recently updated condition isn't always the most pertinent - a healthy deployment
// can have a "Progressing" most recently updated condition at steady-state.
// If the deployment exists, we'll mark it as "Ready", and rely on our pod status checks
// to give more detailed status.
deployHealth := sdk.StatusReport_READY

// Redact env vars from containers - they can contain secrets
for i := 0; i < len(deployResp.Spec.Template.Spec.Containers); i++ {
14 changes: 9 additions & 5 deletions builtin/k8s/releaser.go
@@ -27,7 +27,7 @@ import (
"github.com/hashicorp/waypoint-plugin-sdk/terminal"
)

// The port that a service will forward to the pod(s)
// DefaultPort is the port that a service will forward to the pod(s)
const DefaultPort = 80

// Releaser is the ReleaseManager implementation for Kubernetes.
@@ -320,11 +320,15 @@ func (r *Releaser) resourceServiceCreate(
nodeclient := clientSet.CoreV1().Nodes()
nodes, err := nodeclient.List(ctx, metav1.ListOptions{})
if err != nil {
return err
// Rather than fail the whole release, report the error and then complete.
// Print in a standalone step so the output won't get overwritten if we add more step output in the future.
errStep := sg.Add("Cannot determine release URL for nodeport service due to failure to list nodes: %s", err)
errStep.Status(terminal.StatusError)
errStep.Done()
Contributor:
nit: should we use the included log here and log this error as well? I don't see log used much (or at all?) in this method, but if we're not returning the error, we may only be showing it in the UI in this case, right? Just a thought; as mentioned, I don't see log used much in this method anyway.

Contributor Author:
There are a bunch of places where I think UI messages would be super helpful in logs. Honestly, I've been meaning to solve this eventually with #2075, so I've been reticent to duplicate messages between the UI and the logs. If you think this is an especially important one though, I can see putting a log here.
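For illustration only (not part of this PR's diff): a minimal sketch of what the logging suggestion above could look like, assuming an hclog.Logger named log is in scope in this method as it is in other plugin entrypoints. The logger name and error text below are hypothetical.

```go
package main

import (
	"errors"

	"github.com/hashicorp/go-hclog"
)

func main() {
	// Hypothetical logger standing in for the `log` available to plugin
	// methods; the name is illustrative, not taken from this PR.
	log := hclog.New(&hclog.LoggerOptions{Name: "k8s-releaser"})

	// Simulated failure from listing nodes (e.g. missing RBAC permissions).
	err := errors.New(`nodes is forbidden: cannot list resource "nodes"`)

	// Mirror the UI step message into the structured log so the failure is
	// also visible outside the terminal output.
	log.Warn("cannot determine release URL for nodeport service", "err", err)
}
```

The step output would remain the user-facing message; a log line like this would only make the failure visible in the runner logs as well.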

} else {
nodeIP := nodes.Items[0].Status.Addresses[0].Address
result.Url = fmt.Sprintf("http://%s:%d", nodeIP, service.Spec.Ports[0].NodePort)
}

nodeIP := nodes.Items[0].Status.Addresses[0].Address
result.Url = fmt.Sprintf("http://%s:%d", nodeIP, service.Spec.Ports[0].NodePort)
} else {
result.Url = fmt.Sprintf("http://%s:%d", service.Spec.ClusterIP, service.Spec.Ports[0].Port)
}
2 changes: 1 addition & 1 deletion builtin/k8s/task.go
@@ -125,7 +125,7 @@ task {
return doc, nil
}

// TaskLauncher implements Configurable
// Config implements Configurable
func (p *TaskLauncher) Config() (interface{}, error) {
return &p.config, nil
}
2 changes: 1 addition & 1 deletion internal/cli/runner_agent.go
@@ -42,7 +42,7 @@ type RunnerAgentCommand struct {
flagODR bool
}

// This is how long a runner in ODR mode will wait for it's job assignment before
// This is how long a runner in ODR mode will wait for its job assignment before
// timing out.
var defaultRunnerODRAcceptTimeout = 60 * time.Second

70 changes: 69 additions & 1 deletion internal/serverinstall/k8s.go
@@ -11,6 +11,8 @@ import (
"time"

"github.com/ghodss/yaml"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
appsv1 "k8s.io/api/apps/v1"
apiv1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
@@ -309,8 +311,12 @@ func (i *K8sInstaller) Install(
// Ensure the service is ready to use before returning
_, err = net.DialTimeout("tcp", httpAddr, 1*time.Second)
if err != nil {
// Depending on the platform, this can take a long time. On EKS, it's by far the longest step. Adding an explicit message helps
s.Update("Service %q exists and is configured, but isn't yet accepting incoming connections. Waiting...", serviceName)
return false, nil
}

s.Update("Service %q is ready", serviceName)
log.Info("http server ready", "httpAddr", addr)

// Set our advertise address
@@ -1358,6 +1364,43 @@ func newServiceAccount(c k8sConfig) (*apiv1.ServiceAccount, error) {
}, nil
}

// newServiceAccountClusterRoleWithBinding creates the cluster role and binding necessary to create and verify
// a nodeport type service.
func newServiceAccountClusterRoleWithBinding(c k8sConfig) (*rbacv1.ClusterRole, *rbacv1.ClusterRoleBinding, error) {
roleName := "waypoint-runner"
return &rbacv1.ClusterRole{
ObjectMeta: metav1.ObjectMeta{
Name: roleName,
},
Rules: []rbacv1.PolicyRule{{
APIGroups: []string{""},
Resources: []string{"nodes"},
Verbs: []string{"get", "list"},
}},
}, &rbacv1.ClusterRoleBinding{
ObjectMeta: metav1.ObjectMeta{
Name: roleName,
},

// Bind the cluster role above, which only grants read access to
// nodes, to the on-demand runner service account so that nodeport
// release URLs can be determined.
RoleRef: rbacv1.RoleRef{
APIGroup: "",
Kind: "ClusterRole",
Name: roleName,
},

Subjects: []rbacv1.Subject{
{
Kind: "ServiceAccount",
Name: c.odrServiceAccount,
Namespace: c.namespace,
},
},
}, nil
}

// newServiceAccountRoleBinding creates the role binding necessary to
// map the ODR role to the service account.
func newServiceAccountRoleBinding(c k8sConfig) (*rbacv1.RoleBinding, error) {
@@ -1671,7 +1714,7 @@ func (i *K8sInstaller) initServiceAccount(
}

// Setup the role binding
s.Update("Initializing role binding for on-demand runner...")
s.Update("Initializing role bindings for on-demand runner...")
rbClient := clientset.RbacV1().RoleBindings(i.config.namespace)
rb, err := newServiceAccountRoleBinding(i.config)
if err != nil {
@@ -1690,6 +1733,31 @@
return err
}

cr, crb, err := newServiceAccountClusterRoleWithBinding(i.config)
if err != nil {
return status.Errorf(codes.Internal, "Failed to get definition for runner service account's cluster role and binding: %q", err)
}
if cr != nil {
crClient := clientset.RbacV1().ClusterRoles()
_, err = crClient.Get(ctx, cr.Name, metav1.GetOptions{})
if err != nil && !errors.IsNotFound(err) {
return status.Errorf(codes.Internal, "Failed to get cluster role %q: %q", cr.Name, err)
}
if _, err := crClient.Create(ctx, cr, metav1.CreateOptions{}); err != nil {
return status.Errorf(codes.Internal, "Failed to create cluster role %q: %q", cr.Name, err)
}
}
if crb != nil {
crbClient := clientset.RbacV1().ClusterRoleBindings()
_, err = crbClient.Get(ctx, crb.Name, metav1.GetOptions{})
if err != nil && !errors.IsNotFound(err) {
return status.Errorf(codes.Internal, "Failed to get cluster role binding %q: %q", crb.Name, err)
}
if _, err := crbClient.Create(ctx, crb, metav1.CreateOptions{}); err != nil {
return status.Errorf(codes.Internal, "Failed to create cluster role binding %q: %q", cr.Name, err)
}
}

s.Update("Service account for on-demand runner initialized!")
s.Done()
return nil