Improve monitoring and feedback for errors during workspace cluster selection #8485

Closed
wants to merge 3 commits
4 changes: 4 additions & 0 deletions components/gitpod-protocol/src/workspace-instance.ts
@@ -153,6 +153,10 @@ export interface WorkspaceInstanceConditions {

// stopped_by_request is true if the workspace was stopped using a StopWorkspace call
stoppedByRequest?: boolean;

// clusterSelectionFailed is true if the instance was never actually started because it could not be assigned to any workspace cluster.
// Different to most other conditions, this is set by `server` during UNKNOWN/PREPARING phase.
clusterSelectionFailed?: boolean;
}

// AdmissionLevel describes who can access a workspace instance and its ports.
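For illustration only, not part of this pull request: a consumer such as the dashboard could use the new condition to tell "no cluster was available" apart from other start failures. The pickStartErrorMessage helper below is hypothetical; only WorkspaceInstance and its conditions come from gitpod-protocol.

import { WorkspaceInstance } from "@gitpod/gitpod-protocol";

// Hypothetical helper: map the conditions of a failed instance start to a user-facing message.
function pickStartErrorMessage(instance: WorkspaceInstance): string | undefined {
    const conditions = instance.status.conditions;
    if (conditions.clusterSelectionFailed) {
        // set by `server` when no workspace cluster would admit the instance
        return "No workspace cluster was available to start your workspace. Please try again in a few minutes.";
    }
    if (conditions.failed) {
        return conditions.failed;
    }
    return undefined;
}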
22 changes: 22 additions & 0 deletions components/server/src/prometheus-metrics.ts
@@ -123,4 +123,26 @@ const gitpodVersionInfo = new prometheusClient.Gauge({

export function setGitpodVersion(gitpod_version: string){
gitpodVersionInfo.set({gitpod_version}, 1)
}

const instanceStartsSuccessTotal = new prometheusClient.Counter({
name: 'gitpod_server_instance_starts_success_total',
help: 'Total amount of successfully performed instance starts',
labelNames: ['retries'],
registers: [prometheusClient.register],
});

export function increaseSuccessfulInstanceStartCounter(retries: number = 0) {
instanceStartsSuccessTotal.inc({ retries });
}

const instanceStartsFailedTotal = new prometheusClient.Counter({
name: 'gitpod_server_instance_starts_failed_total',
help: 'Total amount of failed instance starts',
labelNames: ['reason'],
registers: [prometheusClient.register],
});

export function increaseFailedInstanceStartCounter(reason: "clusterSelectionFailed") {
instanceStartsFailedTotal.inc({ reason });
}
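A minimal sketch, not part of this pull request, of how the two new counters surface when the registry is scraped. It assumes the helpers are importable from this module and uses the prom-client default registry registered above; depending on the prom-client version, register.metrics() returns either a string or a Promise of a string, so the call is wrapped in Promise.resolve.

import * as prometheusClient from 'prom-client';
import { increaseSuccessfulInstanceStartCounter, increaseFailedInstanceStartCounter } from './prometheus-metrics';

// Record one start that succeeded after a single retry and one that failed
// because no cluster would take the workspace.
increaseSuccessfulInstanceStartCounter(1);
increaseFailedInstanceStartCounter("clusterSelectionFailed");

// The scrape output then contains series along the lines of:
//   gitpod_server_instance_starts_success_total{retries="1"} 1
//   gitpod_server_instance_starts_failed_total{reason="clusterSelectionFailed"} 1
Promise.resolve(prometheusClient.register.metrics()).then((text) => console.log(text));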
92 changes: 58 additions & 34 deletions components/server/src/workspace/workspace-starter.ts
@@ -36,13 +36,17 @@ import { WithReferrerContext } from "@gitpod/gitpod-protocol/lib/protocol";
import { IDEOption } from "@gitpod/gitpod-protocol/lib/ide-protocol";
import { Deferred } from "@gitpod/gitpod-protocol/lib/util/deferred";
import { ExtendedUser } from "@gitpod/ws-manager/lib/constraints";
import { increaseFailedInstanceStartCounter, increaseSuccessfulInstanceStartCounter } from "../prometheus-metrics";

export interface StartWorkspaceOptions {
rethrow?: boolean;
forceDefaultImage?: boolean;
excludeFeatureFlags?: NamedWorkspaceFeatureFlag[];
}

const MAX_INSTANCE_START_RETRIES = 2;
const INSTANCE_START_RETRY_INTERVAL_SECONDS = 2;

@injectable()
export class WorkspaceStarter {
@inject(WorkspaceManagerClientProvider) protected readonly clientProvider: WorkspaceManagerClientProvider;
@@ -151,6 +155,7 @@ export class WorkspaceStarter {
protected async actuallyStartWorkspace(ctx: TraceContext, instance: WorkspaceInstance, workspace: Workspace, user: User, mustHaveBackup: boolean, ideConfig: IDEConfig, userEnvVars: UserEnvVar[], projectEnvVars: ProjectEnvVar[], rethrow?: boolean, forceRebuild?: boolean): Promise<StartWorkspaceResult> {
const span = TraceContext.startSpan("actuallyStartWorkspace", ctx);

let clusterSelectionFailed = false;
try {
// build workspace image
instance = await this.buildWorkspaceImage({ span }, user, workspace, instance, forceRebuild, forceRebuild);
Expand Down Expand Up @@ -180,45 +185,25 @@ export class WorkspaceStarter {
const euser: ExtendedUser = {
...user,
getsMoreResources: await this.userService.userGetsMoreResources(user),
}

// tell the world we're starting this instance
let resp: StartWorkspaceResponse.AsObject | undefined;
let lastInstallation = "";
const clusters = await this.clientProvider.getStartClusterSets(euser, workspace, instance);
for await (let cluster of clusters) {
try {
// getStartManager will throw an exception if there's no cluster available and hence exit the loop
const { manager, installation } = cluster;
lastInstallation = installation;

instance.status.phase = "pending";
instance.region = installation;
await this.workspaceDb.trace({ span }).storeInstance(instance);
try {
await this.messageBus.notifyOnInstanceUpdate(workspace.ownerId, instance);
} catch (err) {
// if sending the notification fails that's no reason to stop the workspace creation.
// If the dashboard misses this event it will catch up at the next one.
span.log({ "notifyOnInstanceUpdate.error": err });
log.debug("cannot send instance update - this should be mostly inconsequential", err);
}
};

// start that thing
log.info({ instanceId: instance.id }, 'starting instance');
resp = (await manager.startWorkspace({ span }, startRequest)).toObject();
break;
} catch (err: any) {
if ('code' in err && err.code !== grpc.status.OK && lastInstallation !== "") {
log.error({ instanceId: instance.id }, "cannot start workspace on cluster, might retry", err, { cluster: lastInstallation });
} else {
throw err;
}
}
}

// choose a cluster and start the instance
let resp: StartWorkspaceResponse.AsObject | undefined = undefined;
let retries = 0;
for (; retries < MAX_INSTANCE_START_RETRIES; retries++) {
resp = await this.tryStartOnCluster({ span }, startRequest, euser, workspace, instance);
if (resp) {
break;
}

await new Promise((resolve) => setTimeout(resolve, INSTANCE_START_RETRY_INTERVAL_SECONDS * 1000));
}

if (!resp) {
clusterSelectionFailed = true;
increaseFailedInstanceStartCounter("clusterSelectionFailed");
throw new Error("cannot start a workspace because no workspace clusters are available");
}
increaseSuccessfulInstanceStartCounter(retries);

span.log({ "resp": resp });

@@ -245,7 +230,7 @@
return { instanceID: instance.id, workspaceURL: resp.url };
} catch (err) {
TraceContext.setError({ span }, err);
await this.failInstanceStart({ span }, err, workspace, instance);
await this.failInstanceStart({ span }, err, workspace, instance, clusterSelectionFailed);

if (rethrow) {
throw err;
@@ -259,6 +244,42 @@
}
}

protected async tryStartOnCluster(ctx: TraceContext, startRequest: StartWorkspaceRequest, euser: ExtendedUser, workspace: Workspace, instance: WorkspaceInstance): Promise<StartWorkspaceResponse.AsObject | undefined> {
let lastInstallation = "";
const clusters = await this.clientProvider.getStartClusterSets(euser, workspace, instance);
for await (let cluster of clusters) {
try {
// getStartManager will throw an exception if there's no cluster available and hence exit the loop
const { manager, installation } = cluster;
lastInstallation = installation;

instance.status.phase = "pending";
instance.region = installation;
await this.workspaceDb.trace(ctx).storeInstance(instance);
try {
await this.messageBus.notifyOnInstanceUpdate(workspace.ownerId, instance);
} catch (err) {
// if sending the notification fails that's no reason to stop the workspace creation.
// If the dashboard misses this event it will catch up at the next one.
ctx.span?.log({ "notifyOnInstanceUpdate.error": err });
log.debug("cannot send instance update - this should be mostly inconsequential", err);
}

// start that thing
log.info({ instanceId: instance.id }, 'starting instance');
return (await manager.startWorkspace(ctx, startRequest)).toObject();
} catch (err: any) {
if ('code' in err && err.code !== grpc.status.OK && lastInstallation !== "") {
log.error({ instanceId: instance.id }, "cannot start workspace on cluster, might retry", err, { cluster: lastInstallation });
} else {
throw err;
}
}
}

return undefined;
}

protected async notifyOnPrebuildQueued(ctx: TraceContext, workspaceId: string) {
const span = TraceContext.startSpan("notifyOnPrebuildQueued", ctx);
const prebuild = await this.workspaceDb.trace({ span }).findPrebuildByWorkspaceID(workspaceId);
@@ -274,7 +295,7 @@
* failInstanceStart properly fails a workspace instance if something goes wrong before the instance ever reaches
workspace manager. In this case we need to make sure we also fulfil the tasks of the bridge (e.g. for prebuilds).
*/
protected async failInstanceStart(ctx: TraceContext, err: Error, workspace: Workspace, instance: WorkspaceInstance) {
protected async failInstanceStart(ctx: TraceContext, err: Error, workspace: Workspace, instance: WorkspaceInstance, clusterSelectionFailed?: boolean) {
const span = TraceContext.startSpan("failInstanceStart", ctx);

try {
@@ -285,6 +306,9 @@
instance.stoppedTime = new Date().toISOString();

instance.status.conditions.failed = err.toString();
if (!!clusterSelectionFailed) {
instance.status.conditions.clusterSelectionFailed = true;
}
instance.status.message = `Workspace cannot be started: ${err}`;
await this.workspaceDb.trace({ span }).storeInstance(instance);
await this.messageBus.notifyOnInstanceUpdate(workspace.ownerId, instance);
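To summarize the control flow added above: the server now makes at most MAX_INSTANCE_START_RETRIES passes over the available cluster sets, waiting INSTANCE_START_RETRY_INTERVAL_SECONDS between passes, and only records a cluster-selection failure (condition plus counter) once every pass has come back empty. The standalone sketch below mirrors that pattern; startWithRetries and tryOnce are illustrative names, with tryOnce standing in for tryStartOnCluster.

// Illustrative sketch of the retry loop in actuallyStartWorkspace; tryOnce returns
// undefined when no cluster admits the workspace, just like tryStartOnCluster.
async function startWithRetries<T>(
    tryOnce: () => Promise<T | undefined>,
    maxRetries: number = 2,       // mirrors MAX_INSTANCE_START_RETRIES
    intervalSeconds: number = 2,  // mirrors INSTANCE_START_RETRY_INTERVAL_SECONDS
): Promise<{ result: T | undefined; retries: number }> {
    let result: T | undefined = undefined;
    let retries = 0;
    for (; retries < maxRetries; retries++) {
        result = await tryOnce();
        if (result) {
            break;
        }
        // wait before scanning the cluster sets again (this also runs after the final failed attempt)
        await new Promise((resolve) => setTimeout(resolve, intervalSeconds * 1000));
    }
    return { result, retries };
}

With the defaults above, a start that never finds a cluster gives up after two passes and roughly four seconds of waiting, at which point failInstanceStart marks the instance with clusterSelectionFailed and the failure counter is increased.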