Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spot instance support #2095

Closed
wants to merge 5 commits into from
Closed
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
257 changes: 227 additions & 30 deletions builtin/providers/aws/resource_aws_instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"encoding/hex"
"fmt"
"log"
"strconv"
"strings"
"time"

Expand Down Expand Up @@ -68,6 +69,16 @@ func resourceAwsInstance() *schema.Resource {
Computed: true,
},

"spot_persist": &schema.Schema{
Type: schema.TypeBool,
Optional: true,
},

"spot_price": &schema.Schema{
Type: schema.TypeString,
Optional: true,
},

"subnet_id": &schema.Schema{
Type: schema.TypeString,
Optional: true,
Expand Down Expand Up @@ -121,6 +132,16 @@ func resourceAwsInstance() *schema.Resource {
},
},

"spot_request_id": &schema.Schema{
Type: schema.TypeString,
Computed: true,
},

"spot_request_status": &schema.Schema{
Type: schema.TypeString,
Computed: true,
},

"public_dns": &schema.Schema{
Type: schema.TypeString,
Computed: true,
Expand Down Expand Up @@ -514,44 +535,147 @@ func resourceAwsInstanceCreate(d *schema.ResourceData, meta interface{}) error {
runOpts.BlockDeviceMappings = blockDevices
}

// Create the instance
log.Printf("[DEBUG] Run configuration: %#v", runOpts)
var err error

var runResp *ec2.Reservation
for i := 0; i < 5; i++ {
runResp, err = conn.RunInstances(runOpts)
if awsErr, ok := err.(awserr.Error); ok {
// IAM profiles can take ~10 seconds to propagate in AWS:
// http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html#launch-instance-with-role-console
if awsErr.Code() == "InvalidParameterValue" && strings.Contains(awsErr.Message(), "Invalid IAM Instance Profile") {
log.Printf("[DEBUG] Invalid IAM Instance Profile referenced, retrying...")
time.Sleep(2 * time.Second)
continue
instanceID := ""
onDemand := true

if sp := d.Get("spot_price").(string); sp != "" {

// FIXME: Write tests for invalid instance type assert rejected
instance_type := d.Get("instance_type").(string)
if instance_type == "t2.micro" ||
instance_type == "t2.small" ||
instance_type == "t2.medium" {
return fmt.Errorf("Instance type (%s) cannot be used for a spot instance request", instance_type)
}

// FIXME: Write tests for negative bid assert rejected
// FIXME: Write tests for non decimal bid assert rejected
bid, err := strconv.ParseFloat(sp, 32)
if err == nil {
if bid < 0 {
return fmt.Errorf("Spot bid (%s) must be positive", sp)
}
} else {
return fmt.Errorf("Spot bid (%s) is not a valid decimal", sp)
}

onDemand = false

spotPlacement := &ec2.SpotPlacement{
AvailabilityZone: aws.String(d.Get("availability_zone").(string)),
GroupName: aws.String(d.Get("placement_group").(string)),
}

launchSpec := &ec2.RequestSpotLaunchSpecification{
Placement: spotPlacement,
BlockDeviceMappings: runOpts.BlockDeviceMappings,
ImageID: runOpts.ImageID,
InstanceType: runOpts.InstanceType,
UserData: runOpts.UserData,
EBSOptimized: runOpts.EBSOptimized,
IAMInstanceProfile: runOpts.IAMInstanceProfile,
NetworkInterfaces: runOpts.NetworkInterfaces,
}

if runOpts.SubnetID != nil {
launchSpec.SubnetID = runOpts.SubnetID
}

if runOpts.SecurityGroupIDs != nil {
launchSpec.SecurityGroupIDs = runOpts.SecurityGroupIDs
}

if runOpts.SecurityGroups != nil {
launchSpec.SecurityGroups = runOpts.SecurityGroups
}

if runOpts.KeyName != nil {
launchSpec.KeyName = runOpts.KeyName
}

spotType := "one-time"

if d.Get("spot_persist").(bool) {
spotType = "persistent"
}
// Build the spot instance struct
spotOpts := &ec2.RequestSpotInstancesInput{
SpotPrice: aws.String(sp),
Type: aws.String(spotType),
InstanceCount: aws.Long(1),
LaunchSpecification: launchSpec,
}

// Make the spot instance request
var spotResp *ec2.RequestSpotInstancesOutput
spotResp, err = conn.RequestSpotInstances(spotOpts)

request_id := *spotResp.SpotInstanceRequests[0].SpotInstanceRequestID
d.Set("spot_request_id", request_id)

spotStateConf := &resource.StateChangeConf{
// http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-bid-status.html
Pending: []string{"start", "pending-evaluation", "pending-fulfillment"},
Target: "fulfilled",
Refresh: SpotInstanceStateRefreshFunc(conn, request_id),
Timeout: 10 * time.Minute,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like the script may wait 10 minutes for a bid to complete. You can't run terraform apply in parallel so In the mean time, you will be unable to apply other changes to your infrastructure. How likely is it that you'll wait the full 10 minutes? Can this be shorter (e.g. 10-30s)? Or configurable?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This all comes down to how fast amazon resolves your bid. Typically it's about 5 minutes or less, but I've seen it take as long as half an hour in rare cases.

Making it configurable wouldn't hurt - to be honest though this is just copy pasta. If someone decides they want a spot instance, they should be aware that it will take some time for their bid to resolve

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do I actually need to wait for the bid to resolve? Or can I just post it and move on, and assume my instances may spin up later? When I read the docs on spot I got the impression that you can post a bid and walk away and AWS will boot the instances for you later if the price drops. Maybe that feature is optional?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cbednarski that's an interesting proposition.

If we don't wait for the bid to resolve, there's no guarantee that the resource will ever actually be provided, which kind of sucks. On the other hand, it would be nice to be able to put the bid in proceed irregardless of if the bid ever resolves.

What do you think about making this another switch? Something like "spot_wait" to toggle if you should wait for the instance to come up or not. I can see use cases for both.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is more Terraform-like to just wait for the bid for now. Terraform waits for things to be fully ready at a resource level. A spot instance isn't ready until it exists.

Let's get this merged and then we might want to entertain adding another flag to the resource that just tells Terraform to just wait for the bid and move on.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool, I'm working moving spot bids into their own resource right now - i'll make sure that instances wait for the bid to resolve (i had made this a flag)

Delay: 10 * time.Second,
MinTimeout: 3 * time.Second,
}

log.Printf("[DEBUG] waiting for spot bid to resolve... this may take several minutes.")
spotRequestRaw, err := spotStateConf.WaitForState()
spotRequest := spotRequestRaw.(*ec2.SpotInstanceRequest)

if err != nil {
return fmt.Errorf("Error while waiting for spot request (%s) to resolve: %s", request_id, err)
} else {
instanceID = *spotRequest.InstanceID
}
break
}
if err != nil {
return fmt.Errorf("Error launching source instance: %s", err)
}

instance := runResp.Instances[0]
log.Printf("[INFO] Instance ID: %s", *instance.InstanceID)
if onDemand {
// Create the instance
log.Printf("[DEBUG] Run configuration: %#v", runOpts)
var err error

var runResp *ec2.Reservation
for i := 0; i < 5; i++ {
runResp, err = conn.RunInstances(runOpts)
if awsErr, ok := err.(awserr.Error); ok {
// IAM profiles can take ~10 seconds to propagate in AWS:
// http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html#launch-instance-with-role-console
if awsErr.Code() == "InvalidParameterValue" && strings.Contains(awsErr.Message(), "Invalid IAM Instance Profile") {
log.Printf("[DEBUG] Invalid IAM Instance Profile referenced, retrying...")
time.Sleep(2 * time.Second)
continue
}
}
break
}
if err != nil {
return fmt.Errorf("Error launching source instance: %s", err)
}

// Store the resulting ID so we can look this up later
d.SetId(*instance.InstanceID)
instance := runResp.Instances[0]
log.Printf("[INFO] Instance ID: %s", *instance.InstanceID)

// Wait for the instance to become running so we can get some attributes
// that aren't available until later.
log.Printf(
"[DEBUG] Waiting for instance (%s) to become running",
*instance.InstanceID)
// Wait for the instance to become running so we can get some attributes
// that aren't available until later.
log.Printf(
"[DEBUG] Waiting for instance (%s) to become running",
*instance.InstanceID)

instanceID = *instance.InstanceID
}

// Store the resulting ID so we can look this up later
d.SetId(instanceID)

stateConf := &resource.StateChangeConf{
Pending: []string{"pending"},
Target: "running",
Refresh: InstanceStateRefreshFunc(conn, *instance.InstanceID),
Refresh: InstanceStateRefreshFunc(conn, instanceID),
Timeout: 10 * time.Minute,
Delay: 10 * time.Second,
MinTimeout: 3 * time.Second,
Expand All @@ -561,10 +685,10 @@ func resourceAwsInstanceCreate(d *schema.ResourceData, meta interface{}) error {
if err != nil {
return fmt.Errorf(
"Error waiting for instance (%s) to become ready: %s",
*instance.InstanceID, err)
instanceID, err)
}

instance = instanceRaw.(*ec2.Instance)
instance := instanceRaw.(*ec2.Instance)

// Initialize the connection info
if instance.PublicIPAddress != nil {
Expand All @@ -591,6 +715,25 @@ func resourceAwsInstanceCreate(d *schema.ResourceData, meta interface{}) error {
func resourceAwsInstanceRead(d *schema.ResourceData, meta interface{}) error {
conn := meta.(*AWSClient).ec2conn

if reqID := d.Get("spot_request_id").(string); reqID != "" {
resp, err := conn.DescribeSpotInstanceRequests(&ec2.DescribeSpotInstanceRequestsInput{
SpotInstanceRequestIDs: []*string{aws.String(reqID)},
})

if err != nil {
// If the instance was not found, return nil so that we can show
// that the instance is gone.
if ec2err, ok := err.(awserr.Error); ok && ec2err.Code() == "InvalidSpotInstanceRequestID.NotFound" {
d.Set("spot_request_id", "")
return nil
}

// Some other error, report it
return err
}
d.Set("spot_request_status", resp.SpotInstanceRequests[0].Status.Code)
}

resp, err := conn.DescribeInstances(&ec2.DescribeInstancesInput{
InstanceIDs: []*string{aws.String(d.Id())},
})
Expand Down Expand Up @@ -749,13 +892,36 @@ func resourceAwsInstanceUpdate(d *schema.ResourceData, meta interface{}) error {
return resourceAwsInstanceRead(d, meta)
}

// FIXME add test to ensure spot instances are deleted
func resourceAwsInstanceDelete(d *schema.ResourceData, meta interface{}) error {
conn := meta.(*AWSClient).ec2conn

if reqID := d.Get("spot_request_id").(string); reqID != "" {
log.Printf("[INFO] Cancelling spot request: %s", reqID)
output, err := conn.CancelSpotInstanceRequests(&ec2.CancelSpotInstanceRequestsInput{
SpotInstanceRequestIDs: []*string{aws.String(reqID)},
})

if err != nil {
return fmt.Errorf(
"Error cancelling spot reservation (%s): %s",
reqID, err)
}

if output != nil {
cancelled := output.CancelledSpotInstanceRequests
if len(cancelled) != 1 || *cancelled[0].SpotInstanceRequestID != reqID {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a little wary of this logic myself. It might cause the stack to fail to delete if the spot instance request wasn't fulfilled correctly (I'm not sure how cancel responds to requests that are already cancelled / in some other bad state)

One option I've considered is to check if the spot instance is actually active, and if it's not, just skip trying to cancel altogether. That seems cleanest / safest to me.

What do you guys think we should do here?

return fmt.Errorf("Error cancelling reservation: %s %s", reqID, len(cancelled))
}
}
d.Set("spot_request_id", "")
}

log.Printf("[INFO] Terminating instance: %s", d.Id())
req := &ec2.TerminateInstancesInput{
InstanceIDs: []*string{aws.String(d.Id())},
}

if _, err := conn.TerminateInstances(req); err != nil {
return fmt.Errorf("Error terminating instance: %s", err)
}
Expand Down Expand Up @@ -784,6 +950,37 @@ func resourceAwsInstanceDelete(d *schema.ResourceData, meta interface{}) error {
return nil
}

// SpotInstanceStateRefreshFunc returns a resource.StateRefreshFunc that is used to watch
// an EC2 spot instance request
func SpotInstanceStateRefreshFunc(conn *ec2.EC2, reqID string) resource.StateRefreshFunc {

return func() (interface{}, string, error) {
resp, err := conn.DescribeSpotInstanceRequests(&ec2.DescribeSpotInstanceRequestsInput{
SpotInstanceRequestIDs: []*string{aws.String(reqID)},
})

if err != nil {
// If the instance was not found, return nil so that we can show
// that the instance is gone.
if ec2err, ok := err.(awserr.Error); ok && ec2err.Code() == "InvalidSpotInstanceRequestID.NotFound" {
resp = nil
} else {
log.Printf("Error on InstanceStateRefresh: %s", err)
return nil, "", err
}
}

if resp == nil || len(resp.SpotInstanceRequests) == 0 {
// Sometimes AWS just has consistency issues and doesn't see
// our instance yet. Return an empty state.
return nil, "", nil
}

req := resp.SpotInstanceRequests[0]
return req, *req.Status.Code, nil
}
}

// InstanceStateRefreshFunc returns a resource.StateRefreshFunc that is used to watch
// an EC2 instance.
func InstanceStateRefreshFunc(conn *ec2.EC2, instanceID string) resource.StateRefreshFunc {
Expand Down