Skip to content

Commit

Permalink
#63: agent self upgrade now using delta computation
Browse files Browse the repository at this point in the history
* introduced RecoverableAgentProxy to allow some slack in case agent is temporarily down
* agent self upgrade uses the 'normal' delta computation
* agent upgrade cleanup also uses 'normal' delta computation
  • Loading branch information
ypujante committed Jun 12, 2011
1 parent d4111e4 commit b1c8875
Show file tree
Hide file tree
Showing 17 changed files with 599 additions and 64 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ public interface Agent
running: [[to: 'stopped', action: 'stop']]
]

/**
 * Transition table used for the agent self upgrade: each key is a state name and its value is
 * the list of transitions leaving that state, every transition being a map of the form
 * <code>[to: target state, action: name of the action to run]</code>.
 */
def static SELF_UPGRADE_TRANSITIONS =
[
// from NONE the only transition is 'install'
NONE: [[to: 'installed', action: 'install']],
// once installed: either 'uninstall' (back to NONE) or 'prepare'
installed: [[to: 'NONE', action: 'uninstall'], [to: 'prepared', action: 'prepare']],
// once prepared: either 'commit' (to upgraded) or 'rollback' (back to installed)
prepared: [[to: 'upgraded', action: 'commit'], [to: 'installed', action: 'rollback']],
// once upgraded the only transition is 'uninstall' (back to NONE)
upgraded: [[to: 'NONE', action: 'uninstall']]
]

/********************************************************************
* Software management
********************************************************************/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.linkedin.glu.agent.impl.script

import org.linkedin.groovy.util.io.fs.FileSystemImpl
import org.linkedin.glu.agent.api.Agent

/**
* This is the script that will auto upgrade the agent.
Expand All @@ -36,13 +37,7 @@ import org.linkedin.groovy.util.io.fs.FileSystemImpl
*/
class AutoUpgradeScript
{
def static stateMachine =
[
NONE: [[to: 'installed', action: 'install']],
installed: [[to: 'NONE', action: 'uninstall'], [to: 'prepared', action: 'prepare']],
prepared: [[to: 'upgraded', action: 'commit'], [to: 'installed', action: 'rollback']],
upgraded: [[to: 'NONE', action: 'uninstall']]
]
def static stateMachine = Agent.SELF_UPGRADE_TRANSITIONS

def currentVersion
File agentRootDir
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -443,14 +443,22 @@ class AgentRestClient implements Agent
def representation = extractRepresentation(clientResource, clientResource.responseEntity)
if(representation instanceof Status)
{
throw new AgentException(representation.toString())
handleRecoverableError(representation)
}
else
{
throwAgentException(clientResource.status, RestException.fromJSON(representation))
}
}

/**
 * Translates the given status into an exception and throws it: a
 * {@link RecoverableAgentException} when the status reports itself as a recoverable error,
 * an {@link AgentException} built from the status text otherwise.
 * This method never returns normally.
 *
 * @param status the status to convert into an exception
 */
protected void handleRecoverableError(Status status)
{
  // anything which is not flagged as recoverable is reported as a plain agent exception
  if(!status.isRecoverableError())
  {
    throw new AgentException(status.toString())
  }

  throw new RecoverableAgentException(status)
}

/**
* This method will try to rebuild the full stack trace based on the rest exception recursively.
* Handles the case when the client does not know about an exception
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2011 Yan Pujante
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package org.linkedin.glu.agent.rest.client

import org.linkedin.glu.agent.api.AgentException
import org.restlet.data.Status

/**
 * An {@link AgentException} built from a {@link Status} which reports itself as a recoverable
 * error. The status is kept so that callers can inspect it.
 *
 * @author yan@pongasoft.com */
public class RecoverableAgentException extends AgentException
{
  private static final long serialVersionUID = 1L;

  /** the status this exception was created from */
  Status status

  /**
   * @param status the status describing the error (its text is used as the exception message)
   * @throws IllegalArgumentException if the status is not a recoverable error
   */
  RecoverableAgentException(Status status)
  {
    super(status.toString())

    if(status.isRecoverableError())
    {
      this.status = status
    }
    else
    {
      // this exception type is reserved for recoverable statuses only
      throw new IllegalArgumentException("${status} is not a recoverable error!")
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import org.linkedin.glu.agent.tracker.MountPointInfo
import org.linkedin.glu.orchestration.engine.fabric.Fabric
import java.security.AccessControlException
import org.linkedin.glu.orchestration.engine.agents.NoSuchAgentException
import org.linkedin.glu.provisioner.plan.api.IStep.Type

/**
* @author ypujante@linkedin.com
Expand All @@ -44,9 +45,7 @@ class AgentsController extends ControllerBase
def listVersions = {
def agents = agentsService.getAgentInfos(request.fabric)

def versions = agents.values().groupBy { agent ->
agent.agentProperties['org.linkedin.glu.agent.version']
}
def versions = agents.values().groupBy { it.version }

return [versions: versions]
}
Expand All @@ -68,37 +67,46 @@ class AgentsController extends ControllerBase
}

params.fabric = request.fabric
params.type = Type.PARALLEL

def plan = agentsService.createAgentsUpgradePlan(params)

session.delta = [plan]
def plans =
deploymentService.computeAgentsUpgradePlan(params,
[name: "Agent upgrade to version ${params.version}".toString()])

redirect(controller: 'plan', action: 'view', id: plan.id)
if(plans)
{
session.delta = plans
println plans[0].toXml()
redirect(controller: 'plan', action: 'view', id: plans[0].id)
}
else
{
flash.message = "No agent to upgrade"
redirect(action: 'listVersions')
}
}

/**
* cleanup
*/
def cleanup = {
if(!params.version)
params.name = "Agent upgrade cleanup"
params.system = request.system
params.type = Type.PARALLEL

def plans = deploymentService.computeAgentsCleanupUpgradePlan(params, null)

if(plans)
{
flash.error = "Missing version"
redirect(action: 'listVersions')
return
session.delta = plans
println plans[0].toXml()
redirect(controller: 'plan', action: 'view', id: plans[0].id)
}

if(params.agents instanceof String)
else
{
params.agents = [params.agents]
flash.message = "No agent to cleanup"
redirect(action: 'listVersions')
}

params.fabric = request.fabric

def plan = agentsService.createAgentsCleanupUpgradePlan(params)

session.delta = [plan]

redirect(controller: 'plan', action: 'view', id: plan.id)
}

/**
Expand All @@ -117,9 +125,7 @@ class AgentsController extends ControllerBase
params.name = title

def system = request.system
system = system?.filterBy {
it.agent == params.id
}
system = system?.filterBy("agent='${params.id}'".toString())

request.system = system
params.system = system
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@
<head>
<title>GLU Console - Agents</title>
<meta name="layout" content="main"/>
<script type="text/javascript" src="${resource(dir:'js',file:'console.js')}"/>
<script type="text/javascript" src="${resource(dir:'js',file:'console.js')}"></script>
<style type="text/css">
.separator {
border-top: 1px solid black;
padding-top: 1.5em;
}
</style>
</head>
<body>
<h1>Agent Upgrade</h1>
Expand All @@ -29,7 +35,6 @@
<li>Coordinates: <g:textField name="coordinates" size="100"/></li>
</ul>
<g:actionSubmit action="upgrade" value="Upgrade"/>
<g:actionSubmit action="cleanup" value="Cleanup"/>
<g:each in="${versions.keySet().sort()}" var="version">
<h2>${version}</h2>
<p>Quick Select:
Expand All @@ -50,5 +55,10 @@
</table>
</g:each>
</g:form>
<h2 class="separator">Agent Cleanup Upgrade</h2>
<g:form method="post" controller="agents" action="upgrade">
<g:actionSubmit action="cleanup" value="Cleanup"/>
<span class="example">Cleanup all previously leftover upgrade</span>
</g:form>
</body>
</html>
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ import org.linkedin.glu.agent.rest.client.EncryptionKeysProvider
import org.linkedin.util.clock.Timespan
import org.linkedin.glu.orchestration.engine.action.descriptor.ScriptLifecycleInstallActionDescriptor
import org.linkedin.glu.orchestration.engine.action.descriptor.ScriptLifecycleUninstallActionDescriptor
import org.linkedin.util.reflect.ObjectProxyBuilder
import org.linkedin.glu.orchestration.engine.action.execution.RecoverableAgent

/**
* This implementation uses a convention:
Expand All @@ -57,6 +59,22 @@ public class ActionExecutionFactoryImpl implements ActionExecutionFactory
@Initializable(required = false)
Timespan timeout = Timespan.parse('10s')

/**
* when a communication exception is detected with the agent, it will sleep for this time
* before trying again */
@Initializable(required = false)
Timespan agentRecoveryTimeout = Timespan.parse('5s')

// wait for 5s (default) for the agent to restart
@Initializable(required = false)
Timespan selfUpgradeWaitForRestartTimeout = Timespan.parse("5s")

/**
* when a communication exception is detected with the agent, it will retry a certain number of
* times */
@Initializable(required = false)
int agentRecoveryNumRetries = 10

/**
* For NoOpActionDescriptor: do nothing
*/
Expand All @@ -82,6 +100,17 @@ public class ActionExecutionFactoryImpl implements ActionExecutionFactory
Map actionArgs = computeActionArgs(ad)
agent.executeAction(mountPoint: mountPoint, action: ad.action, actionArgs: actionArgs)

// // TODO MED YP: this is somewhat hacky but it will do for now
if(mountPoint == "/self/upgrade")
{
if(ad.action == 'prepare' || ad.action == 'rollback')
{
if(log.isDebugEnabled())
log.debug("sleeping before waiting for state for ${ad.action}")
Thread.sleep(selfUpgradeWaitForRestartTimeout.durationInMilliseconds)
}
}

// 3. we wait for the action to be completed
def success = false
while(!success)
Expand All @@ -104,10 +133,17 @@ public class ActionExecutionFactoryImpl implements ActionExecutionFactory
*/
def ScriptLifecycleInstallActionDescriptor_execution = { ScriptLifecycleInstallActionDescriptor ad ->
withAgent(ad) { Agent agent ->
agent.installScript(mountPoint: ad.mountPoint,
scriptLocation: ad.script,
parent: ad.parent,
initParameters: ad.initParameters)
def args =
[
mountPoint: ad.mountPoint,
parent: ad.parent,
initParameters: ad.initParameters
]
if(ad.script instanceof Map)
args.putAll(ad.script)
else
args.scriptLocation = ad.script
agent.installScript(args)
}
}

Expand Down Expand Up @@ -146,7 +182,11 @@ public class ActionExecutionFactoryImpl implements ActionExecutionFactory
*/
private def withAgent(AgentActionDescriptor ad, Closure closure)
{
agentFactory.withRemoteAgent(agentURIProvider.getAgentURI(ad.fabric, ad.agent), closure)
agentFactory.withRemoteAgent(agentURIProvider.getAgentURI(ad.fabric, ad.agent)) { Agent agent ->
def agentProxy = new RecoverableAgent(agent, agentRecoveryNumRetries, agentRecoveryTimeout)
agent = ObjectProxyBuilder.createProxy(agentProxy, Agent.class)
closure(agent)
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ import org.linkedin.glu.orchestration.engine.action.descriptor.ActionDescriptor
* @author ypujante@linkedin.com */
interface DeploymentService
{
public static final String AGENT_SELF_UPGRADE_MOUNT_POINT = "/self/upgrade"

/**
* @param params.system the 'expected' system (with filters)
* @param params.name name of the plan created
Expand Down Expand Up @@ -62,6 +64,18 @@ interface DeploymentService
*/
Collection<Plan<ActionDescriptor>> computeRedeployPlans(params, def metadata)

/**
 * Computes the deployment plan for upgrading agents
 * @param params the parameters driving the computation (NOTE(review): the console caller
 *               provides <code>version</code>, <code>fabric</code> and <code>type</code>;
 *               confirm the full list against the implementation)
 * @param metadata any metadata to add to the plan(s)
 * @return the computed plan(s) (NOTE(review): the console caller treats a result which is
 *         false under Groovy truth as "No agent to upgrade"; confirm whether that is
 *         <code>null</code> or an empty collection)
 */
Collection<Plan<ActionDescriptor>> computeAgentsUpgradePlan(params, def metadata)

/**
 * Computes the deployment plan for cleaning any upgrade that failed
 * @param params the parameters driving the computation (NOTE(review): the console caller
 *               provides <code>name</code>, <code>system</code> and <code>type</code>;
 *               confirm the full list against the implementation)
 * @param metadata any metadata to add to the plan(s) (the console caller passes
 *                 <code>null</code>)
 * @return the computed plan(s) (NOTE(review): the console caller treats a result which is
 *         false under Groovy truth as "No agent to cleanup"; confirm whether that is
 *         <code>null</code> or an empty collection)
 */
Collection<Plan<ActionDescriptor>> computeAgentsCleanupUpgradePlan(params, def metadata)

/**
* Shortcut to group the plan by instance in both sequential and parallel types.
*/
Expand Down
Loading

0 comments on commit b1c8875

Please sign in to comment.