From 569a822248c01ee511b37d4bd6a44ad47266a3bd Mon Sep 17 00:00:00 2001
From: powerkimhub
Date: Thu, 24 Oct 2024 15:39:50 +0900
Subject: [PATCH] Avoid infinite loop while waiting for 'DONE' operation status

---
 .../drivers/gcp/resources/VMHandler.go | 50 +++++++++++++------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/cloud-control-manager/cloud-driver/drivers/gcp/resources/VMHandler.go b/cloud-control-manager/cloud-driver/drivers/gcp/resources/VMHandler.go
index 673d1391..c109cae0 100644
--- a/cloud-control-manager/cloud-driver/drivers/gcp/resources/VMHandler.go
+++ b/cloud-control-manager/cloud-driver/drivers/gcp/resources/VMHandler.go
@@ -484,24 +484,46 @@ func (vmHandler *GCPVMHandler) StartVM(vmReqInfo irs.VMReqInfo) (irs.VMInfo, err
 
 	// check operation status, wait until operation is completed
 	// This process is required because some operations have not error message but failed.
+	timeoutDuration := 1 * time.Hour
+	timeout := time.After(timeoutDuration)
+
+	retryCount := 0
+	maxRetries := 3
+
 	for {
-		result, err := vmHandler.Client.ZoneOperations.Get(projectID, zone, op.Name).Context(context.Background()).Do()
-		if err != nil {
-			cblogger.Errorf("Failed to get operation: %v", err)
-		}
-		if result.Status == "DONE" {
-			if result.Error != nil {
-				var errorMessages []string
-				for _, err := range result.Error.Errors {
-					cblogger.Errorf("Operation error: %v", err.Message)
-					errorMessages = append(errorMessages, err.Message)
+		select {
+		case <-timeout:
+			return irs.VMInfo{}, fmt.Errorf("Operation %s in project %s, zone %s timed out after %v", op.Name, projectID, zone, timeoutDuration)
+		default:
+			result, err := vmHandler.Client.ZoneOperations.Get(projectID, zone, op.Name).Context(context.Background()).Do()
+			if err != nil {
+				retryCount++
+				if retryCount < maxRetries {
+					cblogger.Infof("Failed to get operation (retry %d/%d): %v. Retrying...", retryCount, maxRetries, err)
+					continue
+				} else {
+					return irs.VMInfo{}, fmt.Errorf("Failed to get operation %s in project %s, zone %s after %d retries: %v", op.Name, projectID, zone, maxRetries, err)
+				}
+			} else {
+				retryCount = 0
+			}
+
+			// result.Status Possible values: "DONE", "PENDING", "RUNNING"
+			if result.Status == "DONE" {
+				if result.Error != nil {
+					var errorMessages []string
+					for _, err := range result.Error.Errors {
+						cblogger.Errorf("Operation error: %v", err.Message)
+						errorMessages = append(errorMessages, err.Message)
+					}
+					combinedError := fmt.Errorf("Operation errors: %s", strings.Join(errorMessages, ", "))
+					return irs.VMInfo{}, combinedError
 				}
-				combinedError := fmt.Errorf("Operation errors: %s", strings.Join(errorMessages, ", "))
-				return irs.VMInfo{}, combinedError
+				break
 			}
-			break
+
+			time.Sleep(15 * time.Second)
 		}
-		time.Sleep(15 * time.Second)
 	}
 
 	/*