Skip to content

Commit e8357e7

Browse files
craig[bot]shailendra-patel
andcommitted
126925: roachprod: implement spot vm preemption detection for AWS. r=DarrylWong,renatolabs a=shailendra-patel Spot VM preemption detection was not implemented for AWS. This change implements it using ec2 describe-instances and CloudTrail events. Whenever a spot VM is evicted by AWS, the instance description contains information about instance-state, instance-lifecycle and state-reason-code. If describe-instances is called more than 1 hour after the instance was terminated, there is a fallback on CloudTrail events. Fixes: cockroachdb#126917 Epic: CRDB-10428 Release note: None Co-authored-by: Shailendra Patel <shailendra.patel@cockroachlabs.com>
2 parents 69d5767 + f4c1ce9 commit e8357e7

File tree

3 files changed

+73
-5
lines changed

3 files changed

+73
-5
lines changed

pkg/cmd/roachtest/spec/cluster_spec.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ func awsMachineSupportsSSD(machineType string) bool {
212212
}
213213

214214
func getAWSOpts(
215-
machineType string, zones []string, volumeSize, ebsThroughput int, localSSD bool,
215+
machineType string, zones []string, volumeSize, ebsThroughput int, localSSD bool, useSpotVMs bool,
216216
) vm.ProviderOpts {
217217
opts := aws.DefaultProviderOpts()
218218
if volumeSize != 0 {
@@ -232,6 +232,7 @@ func getAWSOpts(
232232
if len(zones) != 0 {
233233
opts.CreateZones = zones
234234
}
235+
opts.UseSpot = useSpotVMs
235236
return opts
236237
}
237238

@@ -492,9 +493,9 @@ func (s *ClusterSpec) RoachprodOpts(
492493
switch cloud {
493494
case AWS:
494495
providerOpts = getAWSOpts(machineType, zones, s.VolumeSize, s.AWS.VolumeThroughput,
495-
createVMOpts.SSDOpts.UseLocalSSD)
496+
createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs)
496497
workloadProviderOpts = getAWSOpts(workloadMachineType, zones, s.VolumeSize, s.AWS.VolumeThroughput,
497-
createVMOpts.SSDOpts.UseLocalSSD)
498+
createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs)
498499
case GCE:
499500
providerOpts = getGCEOpts(machineType, zones, s.VolumeSize, ssdCount,
500501
createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.TerminateOnMigration,

pkg/roachprod/vm/aws/aws.go

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"math/rand"
1919
"os"
2020
"os/exec"
21+
"regexp"
2122
"slices"
2223
"strconv"
2324
"strings"
@@ -267,13 +268,66 @@ type Provider struct {
267268
}
268269

269270
// SupportsSpotVMs reports whether this provider supports spot VMs.
// The diff shown here changes the returned value from false to true.
func (p *Provider) SupportsSpotVMs() bool {
270-
return false
271+
return true
271272
}
272273

273274
func (p *Provider) GetPreemptedSpotVMs(
274275
l *logger.Logger, vms vm.List, since time.Time,
275276
) ([]vm.PreemptedVM, error) {
276-
return nil, nil
277+
byRegion, err := regionMap(vms)
278+
if err != nil {
279+
return nil, err
280+
}
281+
282+
var preemptedVMs []vm.PreemptedVM
283+
for region, vmList := range byRegion {
284+
args := []string{
285+
"ec2", "describe-instances",
286+
"--region", region,
287+
"--instance-ids",
288+
}
289+
args = append(args, vmList.ProviderIDs()...)
290+
var describeInstancesResponse DescribeInstancesOutput
291+
err = p.runJSONCommand(l, args, &describeInstancesResponse)
292+
if err != nil {
293+
// if the describe-instances operation fails with the error InvalidInstanceID.NotFound,
294+
// we assume that the instance has been preempted and describe-instances operation is attempted one hour after the instance termination
295+
if strings.Contains(err.Error(), "InvalidInstanceID.NotFound") {
296+
l.Errorf("WARNING: received NotFound error when trying to find preemptions: %v", err)
297+
return vm.CreatePreemptedVMs(getInstanceIDsNotFound(err.Error())), nil
298+
}
299+
return nil, err
300+
}
301+
302+
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/finding-an-interrupted-Spot-Instance.html
303+
for _, r := range describeInstancesResponse.Reservations {
304+
for _, instance := range r.Instances {
305+
if instance.InstanceLifecycle == "spot" &&
306+
instance.State.Name == "terminated" &&
307+
instance.StateReason.Code == "Server.SpotInstanceTermination" {
308+
preemptedVMs = append(preemptedVMs, vm.PreemptedVM{Name: instance.InstanceID})
309+
}
310+
}
311+
}
312+
}
313+
314+
return preemptedVMs, nil
315+
}
316+
317+
// quotedInstanceIDsRE captures the text between the first pair of single
// quotes in a message. It is compiled once at package scope rather than on
// every call.
var quotedInstanceIDsRE = regexp.MustCompile(`'([^']*)'`)

// getInstanceIDsNotFound returns the instance IDs that were reported as not
// found by a describe-instances command.
//
// Sample error message:
//
//	An error occurred (InvalidInstanceID.NotFound) when calling the DescribeInstances operation: The instance IDs 'i-02e9adfac0e5fa18f, i-0bc7869fda0299caa' do not exist
//
// The IDs are taken from the first single-quoted section of errorMsg and are
// split on commas; surrounding whitespace is trimmed and empty entries are
// dropped. It returns nil when errorMsg has no quoted section or the quoted
// section contains no IDs.
func getInstanceIDsNotFound(errorMsg string) []string {
	matches := quotedInstanceIDsRE.FindStringSubmatch(errorMsg)
	if len(matches) < 2 {
		return nil
	}

	var instanceIDs []string
	for _, id := range strings.Split(matches[1], ",") {
		// Trim rather than splitting on ", " so that a change in spacing does
		// not glue whitespace onto an ID, and skip blanks so that an empty
		// quoted list does not produce an ID of "".
		if id = strings.TrimSpace(id); id != "" {
			instanceIDs = append(instanceIDs, id)
		}
	}
	return instanceIDs
}
278332

279333
func (p *Provider) GetHostErrorVMs(
@@ -982,6 +1036,10 @@ type DescribeInstancesOutput struct {
9821036
Code int
9831037
Name string
9841038
}
1039+
StateReason struct {
1040+
Code string `json:"Code"`
1041+
Message string `json:"Message"`
1042+
} `json:"StateReason"`
9851043
RootDeviceName string
9861044

9871045
BlockDeviceMappings []struct {

pkg/roachprod/vm/vm.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,15 @@ type PreemptedVM struct {
452452
PreemptedAt time.Time
453453
}
454454

455+
// CreatePreemptedVMs returns a list of PreemptedVM created from given list of vmNames
456+
func CreatePreemptedVMs(vmNames []string) []PreemptedVM {
457+
preemptedVMs := make([]PreemptedVM, len(vmNames))
458+
for i, name := range vmNames {
459+
preemptedVMs[i] = PreemptedVM{Name: name}
460+
}
461+
return preemptedVMs
462+
}
463+
455464
// ServiceAddress stores the IP and port of a service.
456465
type ServiceAddress struct {
457466
IP string

0 commit comments

Comments
 (0)