Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow different instance type per role #675

Merged
merged 21 commits into from
Mar 1, 2018
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
7b35c7c
different instance types per role
whoarethebritons Oct 20, 2017
f137ca5
verify instance type
whoarethebritons Oct 26, 2017
97e2fd6
cleanup instances if we fail to start
whoarethebritons Oct 26, 2017
e70b62a
cleanup keyname and merge functions that are too similar
whoarethebritons Oct 26, 2017
d6d36b8
Merge branch 'master' of https://github.com/AppScale/appscale-tools i…
whoarethebritons Nov 6, 2017
254cbc4
Merge branch 'master' of https://github.com/AppScale/appscale-tools i…
whoarethebritons Feb 7, 2018
ac72c54
Check instance type while adding an autoscaled instance to a scaleset
tmarballi Feb 7, 2018
400e885
Fix unit tests to allow instance types per role
tmarballi Feb 7, 2018
5ce242f
Add instance type parameter to Node Layout methods
tmarballi Feb 7, 2018
1a46bbb
Merge branch 'master' into allow-type-per-role
tmarballi Feb 8, 2018
ba02618
Merge branch 'master' into allow-type-per-role
tmarballi Feb 12, 2018
4a9e843
Make instance type mandatory only at the node/role layout level and n…
tmarballi Feb 14, 2018
8f48ff8
Merge branch 'master' into allow-type-per-role
tmarballi Feb 16, 2018
fa1783d
To only mandate instance type for cloud ASFs
tmarballi Feb 22, 2018
3694d13
Remove the distinction between nodes with disks and without
tmarballi Feb 23, 2018
945d05f
Allow checking of instance types for the nodes being reused after a d…
tmarballi Feb 23, 2018
667a85c
Fix broken unit tests for reattach nodes
tmarballi Feb 24, 2018
9e8aaeb
Merge branch 'master' into allow-type-per-role
tmarballi Feb 26, 2018
87fbf18
Addressing code review comments.
tmarballi Feb 27, 2018
35cb0ae
Clean up after the load balancer nodes in case one of them has a prob…
tmarballi Feb 28, 2018
331c164
Remove the incorrect looping of terminate instances.
tmarballi Mar 1, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion appscale/tools/agents/azure_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ class AzureAgent(BaseAgent):
PARAM_APP_SECRET,
PARAM_APP_ID,
PARAM_IMAGE_ID,
PARAM_INSTANCE_TYPE,
PARAM_KEYNAME,
PARAM_SUBSCRIBER_ID,
PARAM_TENANT_ID,
Expand Down Expand Up @@ -491,6 +490,7 @@ def add_instances_to_existing_ss(self, count, parameters):
credentials = self.open_connection(parameters)
subscription_id = str(parameters[self.PARAM_SUBSCRIBER_ID])
resource_group = parameters[self.PARAM_RESOURCE_GROUP]
instance_type = parameters[self.PARAM_INSTANCE_TYPE]
compute_client = ComputeManagementClient(credentials, subscription_id)

num_instances_added = 0
Expand All @@ -505,6 +505,9 @@ def add_instances_to_existing_ss(self, count, parameters):
if ss_instance_count >= self.MAX_VMSS_CAPACITY:
continue

if not vmss.sku.name == instance_type:
continue

scaleset = compute_client.virtual_machine_scale_sets.get(
resource_group, vmss.name)
ss_upgrade_policy = scaleset.upgrade_policy
Expand Down
1 change: 0 additions & 1 deletion appscale/tools/agents/ec2_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ class EC2Agent(BaseAgent):
PARAM_CREDENTIALS,
PARAM_GROUP,
PARAM_IMAGE_ID,
PARAM_INSTANCE_TYPE,
PARAM_KEYNAME,
PARAM_SPOT
)
Expand Down
1 change: 0 additions & 1 deletion appscale/tools/agents/gce_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ class GCEAgent(BaseAgent):
REQUIRED_CREDENTIALS = (
PARAM_GROUP,
PARAM_IMAGE_ID,
PARAM_INSTANCE_TYPE,
PARAM_KEYNAME,
PARAM_PROJECT,
PARAM_ZONE
Expand Down
27 changes: 25 additions & 2 deletions appscale/tools/node_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from agents.factory import InfrastructureAgentFactory
from appscale_logger import AppScaleLogger
from custom_exceptions import BadConfigurationException
from local_state import LocalState
from parse_args import ParseArgs


class NodeLayout():
Expand Down Expand Up @@ -134,6 +136,9 @@ def __init__(self, options):
self.replication = options.get('replication')
self.database_type = options.get('table', 'cassandra')
self.add_to_existing = options.get('add_to_existing')
self.default_instance_type = options.get('instance_type')
self.test = options.get('test')
self.force = options.get('force')

if 'login_host' in options and options['login_host'] is not None:
self.login_host = options['login_host']
Expand Down Expand Up @@ -261,6 +266,20 @@ def validate_node_layout(self):
for node, disk in zip(nodes, disks):
node.disk = disk

instance_type = node_set.get('instance_type', self.default_instance_type)

if not instance_type:
self.invalid("Must set a default instance type or specify instance "
"type per role.")
# Check if this is an allowed instance type.
if instance_type in ParseArgs.DISALLOWED_INSTANCE_TYPES and \
not (self.force or self.test):
reason = "the suggested 4GB of RAM"
if 'database' in roles:
reason += " to run Cassandra"
LocalState.confirm_or_abort("The {0} instance type does not have {1}."
"Please consider using a larger instance "
"type.".format(instance_type, reason))
# Assign master.
if 'master' in roles:
self.master = nodes[0]
Expand All @@ -271,6 +290,7 @@ def validate_node_layout(self):
node.add_role(role)
if role == 'login':
node.public_ip = self.login_host or node.public_ip
node.instance_type = instance_type
if not node.is_valid():
self.invalid(",".join(node.errors()))

Expand Down Expand Up @@ -572,7 +592,7 @@ class Node():

DUMMY_INSTANCE_ID = "i-APPSCALE"

def __init__(self, public_ip, cloud, roles=[], disk=None):
def __init__(self, public_ip, cloud, roles=[], disk=None, instance_type=None):
"""Creates a new Node, representing the given id in the specified cloud.


Expand All @@ -589,6 +609,7 @@ def __init__(self, public_ip, cloud, roles=[], disk=None):
self.cloud = cloud
self.roles = roles
self.disk = disk
self.instance_type = instance_type
self.expand_roles()


Expand Down Expand Up @@ -712,7 +733,8 @@ def to_json(self):
'private_ip': self.private_ip,
'instance_id': self.instance_id,
'jobs': self.roles,
'disk': self.disk
'disk': self.disk,
'instance_type' : self.instance_type
}


Expand All @@ -735,3 +757,4 @@ def from_json(self, node_dict):
self.instance_id = node_dict.get('instance_id')
self.roles = node_dict.get('jobs')
self.disk = node_dict.get('disk')
self.instance_type = node_dict.get('instance_type')
11 changes: 3 additions & 8 deletions appscale/tools/parse_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,16 +649,11 @@ def validate_infrastructure_flags(self):
raise BadConfigurationException("--disks must be a dict, but was a " \
"{0}".format(type(self.args.disks)))

if not self.args.instance_type:
raise BadConfigurationException("Cannot start a cloud instance without " \
"the instance type.")

if self.args.instance_type in self.DISALLOWED_INSTANCE_TYPES and \
not (self.args.force or self.args.test):
LocalState.confirm_or_abort("The {0} instance type does not have " \
"enough RAM to run Cassandra in a production setting. Please " \
"consider using a larger instance type.".format(
self.args.instance_type))
LocalState.confirm_or_abort("The {0} instance type does not have "
"the suggested 4GB of RAM. Please consider using a larger instance "
"type.".format(self.args.instance_type))

if self.args.infrastructure == 'azure':
if not self.args.azure_subscription_id:
Expand Down
150 changes: 102 additions & 48 deletions appscale/tools/remote_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import uuid
import yaml

from boto.exception import BotoServerError

# AppScale-specific imports
from agents.factory import InfrastructureAgentFactory
Expand All @@ -24,6 +25,7 @@
from custom_exceptions import BadConfigurationException
from custom_exceptions import ShellException
from custom_exceptions import TimeoutException
from agents.base_agent import AgentRuntimeException
from agents.gce_agent import CredentialTypes
from agents.gce_agent import GCEAgent
from local_state import APPSCALE_VERSION
Expand Down Expand Up @@ -151,31 +153,68 @@ def start_all_nodes(cls, options, node_layout):

agent.configure_instance_security(params)

load_balancer_nodes = node_layout.get_nodes('load_balancer', True)
instance_ids, public_ips, private_ips = cls.spawn_load_balancers_in_cloud(
options, agent, params,
len(load_balancer_nodes))
load_balancer_roles = {}

for node_index, node in enumerate(load_balancer_nodes):
index = node_layout.nodes.index(node)
node_layout.nodes[index].public_ip = public_ips[node_index]
node_layout.nodes[index].private_ip = private_ips[node_index]
node_layout.nodes[index].instance_id = instance_ids[node_index]
instance_type_roles = {'with_disks':{}, 'without_disks': {}}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm having trouble figuring out the reason for this separation.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cdonati azure cannot attach existing disks to scaleset machines so they would have to be created as regular machines, it was added in a little early with the expectation of managed disks.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay. That sounds like an implementation detail that belongs in the azure agent instead of here.


for node in node_layout.get_nodes('load_balancer', True):
load_balancer_roles.setdefault(node.instance_type, []).append(node)

for node in node_layout.get_nodes('load_balancer', False):
instance_type = instance_type_roles['with_disks'] if node.disk else \
instance_type_roles['without_disks']
instance_type.setdefault(node.instance_type, []).append(node)

spawned_instance_ids = []

for instance_type, load_balancer_nodes in load_balancer_roles.items():
# Copy parameters so we can modify the instance type.
instance_type_params = params.copy()
instance_type_params['instance_type'] = instance_type

instance_ids, public_ips, private_ips = cls.spawn_nodes_in_cloud(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need to clean up after itself if it encounters an exception?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it does, It left my mind that we could technically have different instance types for LBs and it would need clean up if the 2nd LB faced a problem after the 1st one was started successfully. I have added the same try except as for non load balancer nodes.

options, agent, instance_type_params, spawned_instance_ids,
count=len(load_balancer_nodes), load_balancer=True)

# Keep track of instances we have started.
spawned_instance_ids.extend(instance_ids)

for node_index, node in enumerate(load_balancer_nodes):
index = node_layout.nodes.index(node)
node_layout.nodes[index].public_ip = public_ips[node_index]
node_layout.nodes[index].private_ip = private_ips[node_index]
node_layout.nodes[index].instance_id = instance_ids[node_index]

if options.static_ip:
node = node_layout.head_node()
agent.associate_static_ip(params, node.instance_id,
options.static_ip)
node.public_ip = options.static_ip
AppScaleLogger.log("Static IP associated with head node.")

AppScaleLogger.log("\nPlease wait for AppScale to prepare your machines "
"for use. This can take few minutes.")

other_nodes = node_layout.get_nodes('load_balancer', False)
if len(other_nodes) > 0:
_instance_ids, _public_ips, _private_ips = cls.spawn_other_nodes_in_cloud(
agent, params,
len(other_nodes))
for _, nodes in instance_type_roles.items():
for instance_type, other_nodes in nodes.items():
if len(other_nodes) <= 0:
break
# Copy parameters so we can modify the instance type.
instance_type_params = params.copy()
instance_type_params['instance_type'] = instance_type

for node_index, node in enumerate(other_nodes):
index = node_layout.nodes.index(node)
node_layout.nodes[index].public_ip = _public_ips[node_index]
node_layout.nodes[index].private_ip = _private_ips[node_index]
node_layout.nodes[index].instance_id = _instance_ids[node_index]
_instance_ids, _public_ips, _private_ips =\
cls.spawn_nodes_in_cloud(options, agent, instance_type_params,
spawned_instance_ids, count=len(other_nodes))

# Keep track of instances we have started.
spawned_instance_ids.extend(_instance_ids)

for node_index, node in enumerate(other_nodes):
index = node_layout.nodes.index(node)
node_layout.nodes[index].public_ip = _public_ips[node_index]
node_layout.nodes[index].private_ip = _private_ips[node_index]
node_layout.nodes[index].instance_id = _instance_ids[node_index]

return node_layout

Expand Down Expand Up @@ -276,7 +315,8 @@ def start_head_node(cls, options, my_id, node_layout):


@classmethod
def spawn_load_balancers_in_cloud(cls, options, agent, params, count=1):
def spawn_nodes_in_cloud(cls, options, agent, params, spawned_instance_ids,
count=1, load_balancer=False):
"""Starts count number of virtual machines in a cloud infrastructure with
public ips.

Expand All @@ -288,40 +328,32 @@ def spawn_load_balancers_in_cloud(cls, options, agent, params, count=1):
agent: The agent to start VMs with, must be passed as an argument
because agents cannot be made twice.
params: The parameters to be sent to the agent.
spawned_instance_ids: Ids of instances that AppScale has started.
count: A int, the number of instances to start.
load_balancer: A boolean indicating whether the spawned instance should
have a public ip or not.
Returns:
The instance ID, public IP address, and private IP address of the machine
that was started.
"""
instance_ids, public_ips, private_ips = agent.run_instances(
count=count, parameters=params, security_configured=True,
public_ip_needed=True)

if options.static_ip:
agent.associate_static_ip(params, instance_ids[0], options.static_ip)
public_ips[0] = options.static_ip
AppScaleLogger.log("Static IP associated with head node.")
return instance_ids, public_ips, private_ips


@classmethod
def spawn_other_nodes_in_cloud(cls, agent, params, count=1):
"""Starts count number of virtual machines in a cloud infrastructure.

This method also prepares the virtual machine for use by the AppScale Tools.
try:
instance_ids, public_ips, private_ips = agent.run_instances(
count=count, parameters=params, security_configured=True,
public_ip_needed=load_balancer)
except (AgentRuntimeException, BotoServerError):
AppScaleLogger.warn("AppScale was unable to start the requested number "
"of instances, attempting to terminate those that "
"were started.")
if len(spawned_instance_ids) > 0:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can the caller do this?

AppScaleLogger.warn("Attempting to terminate those that were started.")
cls.terminate_spawned_instances(spawned_instance_ids, agent, params)

# Cleanup the keyname since it failed.
LocalState.cleanup_keyname(options.keyname)

# Re-raise the original exception.
raise

Args:
agent: The agent to start VMs with, must be passed as an argument
because agents cannot be made twice.
params: The parameters to be sent to the agent.
count: A int, the number of instances to start.
Returns:
The instance ID, public IP address, and private IP address of the machine
that was started.
"""
instance_ids, public_ips, private_ips = agent.run_instances(
count=count, parameters=params, security_configured=True,
public_ip_needed=False)
return instance_ids, public_ips, private_ips

@classmethod
Expand Down Expand Up @@ -850,6 +882,28 @@ def wait_for_machines_to_finish_loading(cls, host, keyname):
time.sleep(cls.WAIT_TIME)


@classmethod
def terminate_spawned_instances(cls, spawned_instance_ids, agent, params):
  """ Shuts down the specified instances. For use when AppScale has failed to
  start all the instances for the deployment, since we do not check or clean
  up any local files.

  Args:
    spawned_instance_ids: A list of instance ids we have started that
      should be terminated.
    agent: The agent to call terminate_instances with.
    params: Agent parameters.
  """
  terminate_params = params.copy()
  terminate_params[agent.PARAM_INSTANCE_IDS] = spawned_instance_ids
  try:
    # terminate_instances receives the full list of instance ids via
    # terminate_params, so a single call suffices -- no per-instance loop.
    # Passing terminate_params (not the original params) is essential:
    # the original params dict does not carry the instance id list.
    agent.terminate_instances(terminate_params)
  except (AgentRuntimeException, BotoServerError):
    # Best-effort cleanup: log and continue so the original startup
    # failure can still propagate to the caller.
    AppScaleLogger.warn("AppScale failed to terminate instance(s) with "
                        "id(s): {}".format(spawned_instance_ids))


@classmethod
def terminate_cloud_instance(cls, instance_id, options):
""" Powers off a single instance in the currently AppScale deployment and
Expand Down
Loading