diff --git a/agent/org.linkedin.glu.agent-api/src/main/groovy/org/linkedin/glu/agent/api/Agent.groovy b/agent/org.linkedin.glu.agent-api/src/main/groovy/org/linkedin/glu/agent/api/Agent.groovy
index bedd2efd..6a099777 100644
--- a/agent/org.linkedin.glu.agent-api/src/main/groovy/org/linkedin/glu/agent/api/Agent.groovy
+++ b/agent/org.linkedin.glu.agent-api/src/main/groovy/org/linkedin/glu/agent/api/Agent.groovy
@@ -33,6 +33,14 @@ public interface Agent
     running: [[to: 'stopped', action: 'stop']]
   ]
 
+  def static SELF_UPGRADE_TRANSITIONS =
+  [
+      NONE: [[to: 'installed', action: 'install']],
+      installed: [[to: 'NONE', action: 'uninstall'], [to: 'prepared', action: 'prepare']],
+      prepared: [[to: 'upgraded', action: 'commit'], [to: 'installed', action: 'rollback']],
+      upgraded: [[to: 'NONE', action: 'uninstall']]
+  ]
+
   /********************************************************************
    * Software management
    ********************************************************************/
diff --git a/agent/org.linkedin.glu.agent-impl/src/main/groovy/org/linkedin/glu/agent/impl/script/AutoUpgradeScript.groovy b/agent/org.linkedin.glu.agent-impl/src/main/groovy/org/linkedin/glu/agent/impl/script/AutoUpgradeScript.groovy
index 62ba1a38..4fe23d00 100644
--- a/agent/org.linkedin.glu.agent-impl/src/main/groovy/org/linkedin/glu/agent/impl/script/AutoUpgradeScript.groovy
+++ b/agent/org.linkedin.glu.agent-impl/src/main/groovy/org/linkedin/glu/agent/impl/script/AutoUpgradeScript.groovy
@@ -18,6 +18,7 @@
 package org.linkedin.glu.agent.impl.script
 
 import org.linkedin.groovy.util.io.fs.FileSystemImpl
+import org.linkedin.glu.agent.api.Agent
 
 /**
  * This is the script that will auto upgrade the agent.
@@ -36,13 +37,7 @@ import org.linkedin.groovy.util.io.fs.FileSystemImpl
  */
 class AutoUpgradeScript
 {
-  def static stateMachine =
-  [
-      NONE: [[to: 'installed', action: 'install']],
-      installed: [[to: 'NONE', action: 'uninstall'], [to: 'prepared', action: 'prepare']],
-      prepared: [[to: 'upgraded', action: 'commit'], [to: 'installed', action: 'rollback']],
-      upgraded: [[to: 'NONE', action: 'uninstall']]
-  ]
+  def static stateMachine = Agent.SELF_UPGRADE_TRANSITIONS
 
   def currentVersion
   File agentRootDir
diff --git a/agent/org.linkedin.glu.agent-rest-client/src/main/groovy/org/linkedin/glu/agent/rest/client/AgentRestClient.groovy b/agent/org.linkedin.glu.agent-rest-client/src/main/groovy/org/linkedin/glu/agent/rest/client/AgentRestClient.groovy
index b1c8566a..7c38c720 100644
--- a/agent/org.linkedin.glu.agent-rest-client/src/main/groovy/org/linkedin/glu/agent/rest/client/AgentRestClient.groovy
+++ b/agent/org.linkedin.glu.agent-rest-client/src/main/groovy/org/linkedin/glu/agent/rest/client/AgentRestClient.groovy
@@ -443,7 +443,7 @@ class AgentRestClient implements Agent
     def representation = extractRepresentation(clientResource, clientResource.responseEntity)
     if(representation instanceof Status)
     {
-      throw new AgentException(representation.toString())
+      handleRecoverableError(representation)
     }
     else
     {
@@ -451,6 +451,14 @@ class AgentRestClient implements Agent
     }
   }
 
+  protected void handleRecoverableError(Status status)
+  {
+    // always throws: only a recoverable status gets the dedicated exception type
+    if(!status.isRecoverableError())
+      throw new AgentException(status.toString())
+    throw new RecoverableAgentException(status)
+  }
+
   /**
    * This method will try to rebuild the full stack trace based on the rest exception recursively.
    * Handles the case when the client does not know about an exception
diff --git a/agent/org.linkedin.glu.agent-rest-client/src/main/groovy/org/linkedin/glu/agent/rest/client/RecoverableAgentException.groovy b/agent/org.linkedin.glu.agent-rest-client/src/main/groovy/org/linkedin/glu/agent/rest/client/RecoverableAgentException.groovy
new file mode 100644
index 00000000..2641eb77
--- /dev/null
+++ b/agent/org.linkedin.glu.agent-rest-client/src/main/groovy/org/linkedin/glu/agent/rest/client/RecoverableAgentException.groovy
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2011 Yan Pujante
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.linkedin.glu.agent.rest.client
+
+import org.linkedin.glu.agent.api.AgentException
+import org.restlet.data.Status
+
+/** {@link AgentException} carrying the restlet {@link Status} of a recoverable error.
+ * @author yan@pongasoft.com */
+public class RecoverableAgentException extends AgentException
+{
+  private static final long serialVersionUID = 1L;
+
+  Status status // set by the constructor
+
+  RecoverableAgentException(Status status)
+  {
+    super(status.toString()) // the exception message is the textual form of the status
+    if(!status.isRecoverableError()) // guard: this type only wraps recoverable statuses
+      throw new IllegalArgumentException("${status} is not a recoverable error!")
+    this.status = status
+  }
+}
\ No newline at end of file
diff --git a/console/org.linkedin.glu.console-webapp/grails-app/controllers/org/linkedin/glu/console/controllers/AgentsController.groovy b/console/org.linkedin.glu.console-webapp/grails-app/controllers/org/linkedin/glu/console/controllers/AgentsController.groovy
index 2fb4fa91..faaa9f7d 100644
--- a/console/org.linkedin.glu.console-webapp/grails-app/controllers/org/linkedin/glu/console/controllers/AgentsController.groovy
+++ b/console/org.linkedin.glu.console-webapp/grails-app/controllers/org/linkedin/glu/console/controllers/AgentsController.groovy
@@ -24,6 +24,7 @@ import org.linkedin.glu.agent.tracker.MountPointInfo
 import org.linkedin.glu.orchestration.engine.fabric.Fabric
 import java.security.AccessControlException
 import org.linkedin.glu.orchestration.engine.agents.NoSuchAgentException
+import org.linkedin.glu.provisioner.plan.api.IStep.Type
 
 /**
  * @author ypujante@linkedin.com
@@ -44,9 +45,7 @@ class AgentsController extends ControllerBase
   def listVersions = {
     def agents = agentsService.getAgentInfos(request.fabric)
 
-    def versions = agents.values().groupBy { agent ->
-      agent.agentProperties['org.linkedin.glu.agent.version']
-    }
+    def versions = agents.values().groupBy { it.version }
 
     return [versions: versions]
   }
@@ -68,37 +67,46 @@ class AgentsController extends ControllerBase
     }
 
     params.fabric = request.fabric
+    params.type = Type.PARALLEL
 
-    def plan = agentsService.createAgentsUpgradePlan(params)
-
-    session.delta = [plan]
+    def plans =
+      deploymentService.computeAgentsUpgradePlan(params,
+                                                 [name: "Agent upgrade to version ${params.version}".toString()])
 
-    redirect(controller: 'plan', action: 'view', id: plan.id)
+    // no plan computed => nothing to view: report it on the versions page instead
+    if(plans)
+    {
+      session.delta = plans
+      redirect(controller: 'plan', action: 'view', id: plans[0].id)
+    }
+    else
+    {
+      flash.message = "No agent to upgrade"
+      redirect(action: 'listVersions')
+    }
   }
 
   /**
    * cleanup
    */
   def cleanup = {
-    if(!params.version)
+    params.name = "Agent upgrade cleanup"
+    params.system = request.system
+    params.type = Type.PARALLEL
+
+    def plans = deploymentService.computeAgentsCleanupUpgradePlan(params, null)
+
+    if(plans)
     {
-      flash.error = "Missing version"
-      redirect(action: 'listVersions')
-      return
+      session.delta = plans
+      // every computed plan is kept in the session; the first one is displayed
+      redirect(controller: 'plan', action: 'view', id: plans[0].id)
     }
-
-    if(params.agents instanceof String)
+    else
     {
-      params.agents = [params.agents]
+      flash.message = "No agent to cleanup"
+      redirect(action: 'listVersions')
     }
-
-    params.fabric = request.fabric
-    
-    def plan = agentsService.createAgentsCleanupUpgradePlan(params)
-
-    session.delta = [plan]
-
-    redirect(controller: 'plan', action: 'view', id: plan.id)
   }
 
   /**
@@ -117,9 +125,7 @@ class AgentsController extends ControllerBase
       params.name = title
 
       def system = request.system
-      system = system?.filterBy {
-        it.agent == params.id
-      }
+      system = system?.filterBy("agent='${params.id}'".toString())
 
       request.system = system
       params.system = system
diff --git a/console/org.linkedin.glu.console-webapp/grails-app/views/agents/listVersions.gsp b/console/org.linkedin.glu.console-webapp/grails-app/views/agents/listVersions.gsp
index ec7d07fe..c432acfe 100644
--- a/console/org.linkedin.glu.console-webapp/grails-app/views/agents/listVersions.gsp
+++ b/console/org.linkedin.glu.console-webapp/grails-app/views/agents/listVersions.gsp
@@ -19,7 +19,13 @@
 <head>
   <title>GLU Console - Agents</title>
   <meta name="layout" content="main"/>
-  <script type="text/javascript" src="${resource(dir:'js',file:'console.js')}"/>
+  <script type="text/javascript" src="${resource(dir:'js',file:'console.js')}"></script>
+  <style type="text/css">
+    .separator {
+      border-top: 1px solid black;
+      padding-top: 1.5em;
+    }
+  </style>
 </head>
 <body>
 <h1>Agent Upgrade</h1>
@@ -29,7 +35,6 @@
   <li>Coordinates: <g:textField name="coordinates" size="100"/></li>
   </ul>
   <g:actionSubmit action="upgrade" value="Upgrade"/>
-  <g:actionSubmit action="cleanup" value="Cleanup"/>
 <g:each in="${versions.keySet().sort()}" var="version">
   <h2>${version}</h2>
   <p>Quick Select:
@@ -50,5 +55,10 @@
   </table>
 </g:each>
 </g:form>
+<h2 class="separator">Agent Cleanup Upgrade</h2>
+<g:form method="post" controller="agents" action="upgrade">
+  <g:actionSubmit action="cleanup" value="Cleanup"/>
+  <span class="example">Cleanup all previously leftover upgrade</span>
+</g:form>
 </body>
 </html>
\ No newline at end of file
diff --git a/orchestration/org.linkedin.glu.orchestration-engine/src/main/groovy/org/linkedin/glu/orchestration/engine/core/action/execution/ActionExecutionFactoryImpl.groovy b/orchestration/org.linkedin.glu.orchestration-engine/src/main/groovy/org/linkedin/glu/orchestration/engine/core/action/execution/ActionExecutionFactoryImpl.groovy
index f6d1276a..006ca5c5 100644
--- a/orchestration/org.linkedin.glu.orchestration-engine/src/main/groovy/org/linkedin/glu/orchestration/engine/core/action/execution/ActionExecutionFactoryImpl.groovy
+++ b/orchestration/org.linkedin.glu.orchestration-engine/src/main/groovy/org/linkedin/glu/orchestration/engine/core/action/execution/ActionExecutionFactoryImpl.groovy
@@ -32,6 +32,8 @@ import org.linkedin.glu.agent.rest.client.EncryptionKeysProvider
 import org.linkedin.util.clock.Timespan
 import org.linkedin.glu.orchestration.engine.action.descriptor.ScriptLifecycleInstallActionDescriptor
 import org.linkedin.glu.orchestration.engine.action.descriptor.ScriptLifecycleUninstallActionDescriptor
+import org.linkedin.util.reflect.ObjectProxyBuilder
+import org.linkedin.glu.orchestration.engine.action.execution.RecoverableAgent
 
 /**
  * This implementation uses a convention:
@@ -57,6 +59,22 @@ public class ActionExecutionFactoryImpl implements ActionExecutionFactory
   @Initializable(required = false)
   Timespan timeout = Timespan.parse('10s')
 
+  /**
+   * when a communication exception is detected with the agent, it will sleep for this time
+   * before trying again */
+  @Initializable(required = false)
+  Timespan agentRecoveryTimeout = Timespan.parse('5s')
+
+  // wait for 5s (default) for the agent to restart
+  @Initializable(required = false)
+  Timespan selfUpgradeWaitForRestartTimeout = Timespan.parse("5s")
+
+  /**
+   * when a communication exception is detected with the agent, it will retry a certain number of
+   * times */
+  @Initializable(required = false)
+  int agentRecoveryNumRetries = 10
+
   /**
    * For NoOpActionDescriptor: do nothing
    */
@@ -82,6 +100,17 @@ public class ActionExecutionFactoryImpl implements ActionExecutionFactory
       Map actionArgs = computeActionArgs(ad)
       agent.executeAction(mountPoint: mountPoint, action: ad.action, actionArgs: actionArgs)
 
+      // TODO MED YP: this is somewhat hacky but it will do for now
+      if(mountPoint == "/self/upgrade")
+      {
+        if(ad.action == 'prepare' || ad.action == 'rollback')
+        {
+          if(log.isDebugEnabled())
+            log.debug("sleeping before waiting for state for ${ad.action}")
+          Thread.sleep(selfUpgradeWaitForRestartTimeout.durationInMilliseconds)
+        }
+      }
+
       // 3. we wait for the action to be completed
       def success = false
       while(!success)
@@ -104,10 +133,17 @@ public class ActionExecutionFactoryImpl implements ActionExecutionFactory
    */
   def ScriptLifecycleInstallActionDescriptor_execution = { ScriptLifecycleInstallActionDescriptor ad ->
     withAgent(ad) { Agent agent ->
-      agent.installScript(mountPoint: ad.mountPoint,
-                          scriptLocation: ad.script,
-                          parent: ad.parent,
-                          initParameters: ad.initParameters)
+      def args =
+      [
+        mountPoint: ad.mountPoint,
+        parent: ad.parent,
+        initParameters: ad.initParameters
+      ]
+      if(ad.script instanceof Map)
+        args.putAll(ad.script)
+      else
+        args.scriptLocation = ad.script
+      agent.installScript(args)
     }
   }
 
@@ -146,7 +182,11 @@ public class ActionExecutionFactoryImpl implements ActionExecutionFactory
    */
   private def withAgent(AgentActionDescriptor ad, Closure closure)
   {
-    agentFactory.withRemoteAgent(agentURIProvider.getAgentURI(ad.fabric, ad.agent), closure)
+    agentFactory.withRemoteAgent(agentURIProvider.getAgentURI(ad.fabric, ad.agent)) { Agent agent ->
+      def agentProxy = new RecoverableAgent(agent, agentRecoveryNumRetries, agentRecoveryTimeout)
+      agent = ObjectProxyBuilder.createProxy(agentProxy, Agent.class)
+      closure(agent)
+    }
   }
 
   @Override
diff --git a/orchestration/org.linkedin.glu.orchestration-engine/src/main/groovy/org/linkedin/glu/orchestration/engine/deployment/DeploymentService.groovy b/orchestration/org.linkedin.glu.orchestration-engine/src/main/groovy/org/linkedin/glu/orchestration/engine/deployment/DeploymentService.groovy
index fc809f39..740337aa 100644
--- a/orchestration/org.linkedin.glu.orchestration-engine/src/main/groovy/org/linkedin/glu/orchestration/engine/deployment/DeploymentService.groovy
+++ b/orchestration/org.linkedin.glu.orchestration-engine/src/main/groovy/org/linkedin/glu/orchestration/engine/deployment/DeploymentService.groovy
@@ -29,6 +29,8 @@ import org.linkedin.glu.orchestration.engine.action.descriptor.ActionDescriptor
  * @author ypujante@linkedin.com */
 interface DeploymentService
 {
+  public static final String AGENT_SELF_UPGRADE_MOUNT_POINT = "/self/upgrade"
+
   /**
    * @param params.system the 'expected' system (with filters)
    * @param params.name name of the plan created
@@ -62,6 +64,18 @@ interface DeploymentService
    */
   Collection<Plan<ActionDescriptor>> computeRedeployPlans(params, def metadata)
 
+  /**
+   * Computes the deployment plan for upgrading agents
+   * @param metadata any metadata to add to the plan(s)
+   */
+  Collection<Plan<ActionDescriptor>> computeAgentsUpgradePlan(params, def metadata)
+
+  /**
+   * Computes the deployment plan for cleaning any upgrade that failed
+   * @param metadata any metadata to add to the plan(s)
+   */
+  Collection<Plan<ActionDescriptor>> computeAgentsCleanupUpgradePlan(params, def metadata)
+
   /**
    * Shortcut to group the plan by instance in both sequential and parallel types.
    */
diff --git a/orchestration/org.linkedin.glu.orchestration-engine/src/main/groovy/org/linkedin/glu/orchestration/engine/deployment/DeploymentServiceImpl.groovy b/orchestration/org.linkedin.glu.orchestration-engine/src/main/groovy/org/linkedin/glu/orchestration/engine/deployment/DeploymentServiceImpl.groovy
index d1c0e4fc..0f1ffede 100644
--- a/orchestration/org.linkedin.glu.orchestration-engine/src/main/groovy/org/linkedin/glu/orchestration/engine/deployment/DeploymentServiceImpl.groovy
+++ b/orchestration/org.linkedin.glu.orchestration-engine/src/main/groovy/org/linkedin/glu/orchestration/engine/deployment/DeploymentServiceImpl.groovy
@@ -16,7 +16,6 @@
 
 package org.linkedin.glu.orchestration.engine.deployment
 
-import org.linkedin.glu.provisioner.core.environment.Environment
 import org.linkedin.glu.provisioner.core.model.SystemModel
 import org.linkedin.glu.provisioner.impl.agent.DefaultDescriptionProvider
 import org.linkedin.glu.provisioner.plan.api.IPlanExecutionProgressTracker
@@ -37,6 +36,7 @@ import org.linkedin.glu.orchestration.engine.delta.SystemModelDelta
 import org.linkedin.glu.orchestration.engine.planner.Planner
 import org.linkedin.glu.provisioner.plan.api.IStep.Type
 import org.linkedin.glu.orchestration.engine.action.descriptor.ActionDescriptor
+import org.linkedin.glu.provisioner.core.model.SystemEntry
 
 /**
  * System service.
@@ -115,6 +115,25 @@ class DeploymentServiceImpl implements DeploymentService
 
     SystemModel currentModel = agentsService.getCurrentSystemModel(fabric)
 
+    computeDeploymentPlans(params, expectedModel, currentModel, metadata, closure)
+  }
+
+  /**
+   * Compute deployment plans by doing the following:
+   * <ol>
+   *   <li>compute delta between expected model and current model (computed)
+   *   <li>compute the deployment plan(s) (closure callback) (use params.type if a given type only
+   *       is required)
+   *   <li>set various metadata on the plan(s) as well as the name
+   * </ol>
+   * @return a collection of plans (<code>null</code> if no expected model) which may be empty
+   */
+  private Collection<Plan<ActionDescriptor>> computeDeploymentPlans(params,
+                                                                    SystemModel expectedModel,
+                                                                    SystemModel currentModel,
+                                                                    def metadata,
+                                                                    Closure closure)
+  {
     // 1. compute delta between expectedModel and currentModel
     SystemModelDelta delta = deltaMgr.computeDelta(expectedModel, currentModel)
 
@@ -185,8 +204,19 @@ class DeploymentServiceImpl implements DeploymentService
    */
   Collection<Plan<ActionDescriptor>> computeBouncePlans(params, def metadata)
   {
+    SystemModel expectedModel = params.system
+
+    if(!expectedModel)
+      return null
+
+    // we filter by entries where the 'expectedState' should be 'running'!
+    expectedModel = expectedModel.filterBy { SystemEntry entry ->
+      entry.entryState == 'running'
+    }
+    params.system = expectedModel
+
     computeDeploymentPlans(params, metadata) { Type type, SystemModelDelta delta ->
-      planner.computeTransitionPlan(type, delta, ['stopped', '<expected>'])
+      planner.computeTransitionPlan(type, delta, ['stopped', 'running'])
     }
   }
 
@@ -212,16 +242,65 @@ class DeploymentServiceImpl implements DeploymentService
     }
   }
 
-  public Plan createPlan(String name,
-                         Environment currentEnvironment,
-                         Environment expectedEnvironment,
-                         Closure closure)
+  /**
+   * Computes the deployment plan for upgrading the agents listed in params.agents (fabric: params.fabric)
+   * @param metadata any metadata to add to the plan(s)
+   */
+  @Override
+  Collection<Plan<ActionDescriptor>> computeAgentsUpgradePlan(params, def metadata)
+  {
+    SystemModel currentModel = agentsService.getCurrentSystemModel(params.fabric)
+    def agents = (params.agents ?: []) as Set // no selection => empty set => nothing survives the filter
+    currentModel = currentModel.filterBy { SystemEntry entry ->
+      agents.contains(entry.agent)
+    }
+
+    // we keep only the agents that are part of the current model!
+    agents = new HashSet()
+    currentModel.each { SystemEntry entry ->
+      agents << entry.agent
+    }
+    currentModel = currentModel.filterBy { SystemEntry entry -> // current side: self upgrade entries only
+      entry.mountPoint == DeploymentService.AGENT_SELF_UPGRADE_MOUNT_POINT
+    }
+
+    SystemModel expectedModel = new SystemModel(fabric: currentModel.fabric)
+    agents.each { String agent -> // expected side: one 'upgraded' self upgrade entry per remaining agent
+      SystemEntry entry = new SystemEntry(agent: agent,
+                                          mountPoint: DeploymentService.AGENT_SELF_UPGRADE_MOUNT_POINT,
+                                          entryState: 'upgraded')
+      entry.script = [scriptClassName: "org.linkedin.glu.agent.impl.script.AutoUpgradeScript"]
+      entry.initParameters = [
+        newVersion: params.version,
+        agentTar: params.coordinates,
+      ]
+      expectedModel.addEntry(entry)
+    }
+
+    computeDeploymentPlans(params, expectedModel, currentModel, metadata) { Type type, SystemModelDelta delta ->
+      planner.computeTransitionPlan(type, delta, ['<expected>', null]) // NOTE(review): null presumably means "then remove the entry" -- confirm
+    }
+  }
+
+  /**
+   * Computes the deployment plan for cleaning any upgrade that failed
+   * @param metadata any metadata to add to the plan(s)
+   */
+  @Override
+  Collection<Plan<ActionDescriptor>> computeAgentsCleanupUpgradePlan(params, def metadata)
   {
-    deploymentMgr.createPlan(name,
-                             currentEnvironment,
-                             expectedEnvironment,
-                             descriptionProvider,
-                             closure)
+    SystemModel expectedModel = params.system
+
+    if(!expectedModel)
+      return null
+
+    // we filter by entries with only self upgrade mountpoint
+    expectedModel = expectedModel.filterBy { SystemEntry entry ->
+      entry.mountPoint == DeploymentService.AGENT_SELF_UPGRADE_MOUNT_POINT
+    }
+    params.system = expectedModel
+
+    computeDeployPlans(params, metadata)
   }
 
   /**
diff --git a/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/action/execution/RecoverableAgent.java b/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/action/execution/RecoverableAgent.java
new file mode 100644
index 00000000..93a1a8c3
--- /dev/null
+++ b/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/action/execution/RecoverableAgent.java
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2011 Yan Pujante
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.linkedin.glu.orchestration.engine.action.execution;
+
+import org.linkedin.glu.agent.api.Agent;
+import org.linkedin.glu.agent.rest.client.RecoverableAgentException;
+import org.linkedin.util.clock.Timespan;
+import org.linkedin.util.reflect.ObjectProxyInvocationHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.reflect.Method;
+
+/**
+ * Invocation handler placed in front of an {@link Agent}: a call failing with a
+ * {@link RecoverableAgentException} is attempted again after a pause, up to a maximum number of
+ * attempts.
+ *
+ * @author yan@pongasoft.com
+ */
+public class RecoverableAgent extends ObjectProxyInvocationHandler<Agent>
+{
+  public static final String MODULE = RecoverableAgent.class.getName();
+  public static final Logger log = LoggerFactory.getLogger(MODULE);
+
+  private final int _numRetries;
+  private final Timespan _agentRecoveryTimeout;
+
+  /**
+   * Constructor
+   *
+   * @param agent the agent every call is delegated to
+   * @param numRetries maximum number of attempts made for a single call
+   * @param agentRecoveryTimeout how long to sleep between two attempts
+   */
+  public RecoverableAgent(Agent agent, int numRetries, Timespan agentRecoveryTimeout)
+  {
+    super(agent);
+    _numRetries = numRetries;
+    _agentRecoveryTimeout = agentRecoveryTimeout;
+  }
+
+  /**
+   * Delegates the call, attempting it again each time it fails with a
+   * {@link RecoverableAgentException}.
+   *
+   * @return whatever the delegated call returns
+   * @throws TooManyRetriesAgentException when no attempt succeeded; the last recoverable error
+   *         (if any) is attached as its cause
+   * @throws Throwable any other failure of the delegated call (not retried)
+   */
+  @Override
+  public Object invoke(Object proxy, Method method, Object[] args) throws Throwable
+  {
+    RecoverableAgentException lastError = null;
+
+    for(int i = 0; i < _numRetries; i++)
+    {
+      try
+      {
+        return super.invoke(proxy, method, args);
+      }
+      catch(RecoverableAgentException e)
+      {
+        lastError = e;
+        log.warn("#" + i + ": detected recoverable error while talking to the agent [ignored]: " + e.getMessage());
+        if(log.isDebugEnabled())
+          log.debug("Detected recoverable error while talking to the agent [ignored]", e);
+
+        // sleeping after the final attempt would only delay the failure
+        if(i < _numRetries - 1)
+          Thread.sleep(_agentRecoveryTimeout.getDurationInMilliseconds());
+      }
+    }
+
+    TooManyRetriesAgentException failure =
+      new TooManyRetriesAgentException("too many retries (" + _numRetries + ") for " + method.getName());
+    // keep the underlying error visible in the stack trace
+    if(lastError != null)
+      failure.initCause(lastError);
+    throw failure;
+  }
+}
diff --git a/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/action/execution/TooManyRetriesAgentException.java b/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/action/execution/TooManyRetriesAgentException.java
new file mode 100644
index 00000000..1a840db3
--- /dev/null
+++ b/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/action/execution/TooManyRetriesAgentException.java
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2011 Yan Pujante
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.linkedin.glu.orchestration.engine.action.execution;
+
+import org.linkedin.glu.agent.api.AgentException;
+
+/** Signals that an agent call was abandoned after too many retries.
+ * @author yan@pongasoft.com
+ */
+public class TooManyRetriesAgentException extends AgentException
+{
+  private static final long serialVersionUID = 1L;
+
+  public TooManyRetriesAgentException()
+  {
+  }
+
+  public TooManyRetriesAgentException(String s)
+  {
+    super(s);
+  }
+}
diff --git a/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/delta/impl/SystemEntryDeltaImpl.java b/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/delta/impl/SystemEntryDeltaImpl.java
index 49f7f5f3..e57f87cd 100644
--- a/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/delta/impl/SystemEntryDeltaImpl.java
+++ b/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/delta/impl/SystemEntryDeltaImpl.java
@@ -21,6 +21,7 @@
 import org.linkedin.glu.orchestration.engine.delta.SystemEntryValue;
 import org.linkedin.glu.orchestration.engine.delta.SystemEntryValueNoDelta;
 import org.linkedin.glu.orchestration.engine.delta.SystemEntryValueWithDelta;
+import org.linkedin.glu.orchestration.engine.deployment.DeploymentService;
 import org.linkedin.glu.provisioner.core.model.SystemEntry;
 import org.linkedin.groovy.util.state.StateMachine;
 import org.linkedin.groovy.util.state.StateMachineImpl;
@@ -39,12 +40,18 @@
 public class SystemEntryDeltaImpl implements InternalSystemEntryDelta
 {
   public static final StateMachine DEFAULT_STATE_MACHINE;
+  public static final StateMachine SELF_UPGRADE_STATE_MACHINE;
+
 
   static
   {
     Map<String, Object> args = new HashMap<String, Object>();
     args.put("transitions", Agent.DEFAULT_TRANSITIONS);
     DEFAULT_STATE_MACHINE = new StateMachineImpl(args);
+
+    args = new HashMap<String, Object>();
+    args.put("transitions", Agent.SELF_UPGRADE_TRANSITIONS);
+    SELF_UPGRADE_STATE_MACHINE = new StateMachineImpl(args);
   }
 
   private final SystemEntry _expectedEntry;
@@ -403,6 +410,12 @@ public boolean isEmptyAgent()
   @Override
   public StateMachine getStateMachine()
   {
-    return DEFAULT_STATE_MACHINE;
+    String mountPoint = getMountPoint();
+    if(mountPoint == null)
+      return null;
+    else
+      return mountPoint.startsWith(DeploymentService.AGENT_SELF_UPGRADE_MOUNT_POINT) ?
+        SELF_UPGRADE_STATE_MACHINE :
+        DEFAULT_STATE_MACHINE;
   }
 }
diff --git a/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/DescriptionProvider.java b/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/DescriptionProvider.java
new file mode 100644
index 00000000..6e7540fc
--- /dev/null
+++ b/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/DescriptionProvider.java
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2011 Yan Pujante
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.linkedin.glu.orchestration.engine.planner.impl;
+
+import org.linkedin.glu.orchestration.engine.delta.impl.InternalSystemEntryDelta;
+import org.linkedin.glu.orchestration.engine.delta.impl.InternalSystemModelDelta;
+
+/** Computes the description string for a transition (presumably the plan step text -- TODO confirm).
+ * @author yan@pongasoft.com
+ */
+public interface DescriptionProvider
+{
+  String computeDescription(InternalSystemModelDelta modelDelta,
+                            InternalSystemEntryDelta entryDelta,
+                            Transition transition);
+}
\ No newline at end of file
diff --git a/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/PlannerImpl.java b/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/PlannerImpl.java
index 0bdad5e2..93f672a3 100644
--- a/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/PlannerImpl.java
+++ b/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/PlannerImpl.java
@@ -70,11 +70,11 @@ public Plan<ActionDescriptor> computeDeploymentPlan(IStep.Type type,
     if(systemModelDelta == null)
       return null;
 
-    Transitions transitions = new Transitions((InternalSystemModelDelta) systemModelDelta);
+    TransitionPlan transitionPlan = new TransitionPlan((InternalSystemModelDelta) systemModelDelta);
 
-    transitions.computeTransitionsToFixDelta();
+    transitionPlan.computeTransitionsToFixDelta();
 
-    return transitions.buildPlan(type);
+    return transitionPlan.buildPlan(type);
   }
 
   @Override
@@ -85,11 +85,11 @@ public Plan<ActionDescriptor> computeTransitionPlan(IStep.Type type,
     if(systemModelDelta == null)
       return null;
 
-    Transitions transitions = new Transitions((InternalSystemModelDelta) systemModelDelta);
+    TransitionPlan transitionPlan = new TransitionPlan((InternalSystemModelDelta) systemModelDelta);
 
-    transitions.computeTransitions(toStates);
+    transitionPlan.computeTransitions(toStates);
 
-    return transitions.buildPlan(type);
+    return transitionPlan.buildPlan(type);
   }
 
   // TODO HIGH YP:  add no step handling
diff --git a/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/Transitions.java b/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/TransitionPlan.java
similarity index 99%
rename from orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/Transitions.java
rename to orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/TransitionPlan.java
index e21e1e31..78b77b0c 100644
--- a/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/Transitions.java
+++ b/orchestration/org.linkedin.glu.orchestration-engine/src/main/java/org/linkedin/glu/orchestration/engine/planner/impl/TransitionPlan.java
@@ -41,7 +41,7 @@
 /**
  * @author yan@pongasoft.com
  */
-public class Transitions
+public class TransitionPlan
 {
   public enum ActionFromStatus
   {
@@ -62,7 +62,7 @@ public enum ActionFromStatus
   /**
    * Constructor
    */
-  public Transitions(InternalSystemModelDelta systemModelDelta)
+  public TransitionPlan(InternalSystemModelDelta systemModelDelta)
   {
     _systemModelDelta = systemModelDelta;
   }
diff --git a/orchestration/org.linkedin.glu.orchestration-engine/src/test/groovy/test/orchestration/engine/deployment/TestDeploymentService.groovy b/orchestration/org.linkedin.glu.orchestration-engine/src/test/groovy/test/orchestration/engine/deployment/TestDeploymentService.groovy
new file mode 100644
index 00000000..449ae17d
--- /dev/null
+++ b/orchestration/org.linkedin.glu.orchestration-engine/src/test/groovy/test/orchestration/engine/deployment/TestDeploymentService.groovy
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2011 Yan Pujante
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package test.orchestration.engine.deployment
+
+import org.linkedin.glu.orchestration.engine.deployment.DeploymentServiceImpl
+import org.linkedin.glu.orchestration.engine.delta.impl.DeltaMgrImpl
+import org.linkedin.glu.orchestration.engine.planner.impl.PlannerImpl
+import org.linkedin.glu.provisioner.core.model.SystemModel
+import org.linkedin.glu.provisioner.plan.api.IStep.Type
+import org.linkedin.glu.provisioner.plan.api.Plan
+import org.linkedin.glu.orchestration.engine.action.descriptor.ActionDescriptor
+import org.linkedin.glu.provisioner.core.model.SystemEntry
+import org.linkedin.glu.orchestration.engine.fabric.FabricService
+import org.linkedin.glu.orchestration.engine.fabric.Fabric
+import org.linkedin.glu.orchestration.engine.agents.AgentsService
+
+/**
+ * @author yan@pongasoft.com */
+public class TestDeploymentService extends GroovyTestCase
+{
+  PlannerImpl planner = new PlannerImpl() // real implementation (not stubbed)
+  DeltaMgrImpl deltaMgr = new DeltaMgrImpl() // real implementation (not stubbed)
+
+  FabricService fabricService = [ // stub: only findFabric is provided
+    findFabric: { String fabricName -> new Fabric(name: fabricName)} // any name => new Fabric with that name
+  ] as FabricService
+
+  Map<String, SystemModel> currentModels = [:] // fabric name => model returned as the current one
+  AgentsService agentService = [ // stub: only getCurrentSystemModel is provided
+    getCurrentSystemModel: { Fabric fabric -> currentModels[fabric.name] }
+  ] as AgentsService
+
+  DeploymentServiceImpl deploymentService= new DeploymentServiceImpl(planner: planner, // service under test
+                                                                     deltaMgr: deltaMgr,
+                                                                     fabricService: fabricService,
+                                                                     agentsService: agentService)
+  /**
+   * Self upgrade requested against an empty model: there is no agent to upgrade.
+   */
+  public void testAgentSelfUpgradeNoAgent()
+  {
+    // the current model does not contain any entry at all
+    SystemModel emptyModel = m()
+    List<String> requestedAgents = ['a1', 'a2', 'a3']
+
+    Plan<ActionDescriptor> upgrade = upgradePlan(emptyModel, requestedAgents, Type.PARALLEL)
+
+    // no agent to upgrade! => no plan
+    assertNull(upgrade)
+  }
+
+  /**
+   * a1 is an empty agent, a2 has a regular entry, a3 already has a self upgrade entry ('prepared')
+   */
+  public void testAgentSelfUpgrade()
+  {
+    SystemModel currentSystemModel =
+      m([agent: 'a1', metadata: [emptyAgent: true, currentState: 'NA']],
+        [agent: 'a2', mountPoint: '/m1', script: 's1'],
+        [agent: 'a3', mountPoint: '/self/upgrade',
+         script: [scriptClassName: "org.linkedin.glu.agent.impl.script.AutoUpgradeScript"],
+         initParameters: [newVersion: 'v0', agentTar: 'tar0'],
+         entryState: 'prepared'])
+
+    Plan<ActionDescriptor> plan = upgradePlan(currentSystemModel,
+                                              ['a1', 'a2', 'a3'],
+                                              Type.PARALLEL)
+    assertNotNull("no self upgrade plan computed", plan) // plan is dereferenced below: fail, don't NPE
+    // TODO HIGH YP:  the plan generated is incorrect due to the 'bug' with transitions
+    println plan.toXml()
+  }
+
+  /**
+   * Both the expected and the current models are empty: there is nothing to clean up.
+   */
+  public void testAgentCleanupSelfUpgradeNoAgent()
+  {
+    SystemModel expected = m()
+    SystemModel current = m()
+    Plan<ActionDescriptor> cleanup = cleanupPlan(expected, current, Type.PARALLEL)
+
+    // no self upgrade entry to clean up => no plan
+    assertNull(cleanup)
+  }
+
+  /**
+   * a3 is in 'prepared' state and a4 in 'upgraded' state (a1 is empty, a2 has no self upgrade entry)
+   */
+  public void testAgentCleanupSelfUpgrade()
+  {
+    SystemModel expected = m()
+    SystemModel current =
+      m([agent: 'a1', metadata: [emptyAgent: true, currentState: 'NA']],
+        [agent: 'a2', mountPoint: '/m1', script: 's1'],
+        [agent: 'a3', mountPoint: '/self/upgrade',
+         script: [scriptClassName: "org.linkedin.glu.agent.impl.script.AutoUpgradeScript"],
+         initParameters: [newVersion: 'v0', agentTar: 'tar0'],
+         entryState: 'prepared'],
+        [agent: 'a4', mountPoint: '/self/upgrade',
+         script: [scriptClassName: "org.linkedin.glu.agent.impl.script.AutoUpgradeScript"],
+         initParameters: [newVersion: 'v0', agentTar: 'tar0'],
+         entryState: 'upgraded'])
+
+    Plan<ActionDescriptor> cleanup = cleanupPlan(expected, current, Type.PARALLEL)
+    String expectedXml = """<?xml version="1.0"?>
+<plan fabric="f1" name=" - PARALLEL">
+  <parallel>
+    <sequential agent="a3" mountPoint="/self/upgrade">
+      <leaf agent="a3" fabric="f1" mountPoint="/self/upgrade" name="TODO script action: rollback" scriptTransition="rollback" />
+      <leaf agent="a3" fabric="f1" mountPoint="/self/upgrade" name="TODO script action: uninstall" scriptTransition="uninstall" />
+      <leaf agent="a3" fabric="f1" mountPoint="/self/upgrade" name="TODO script lifecycle: uninstallScript" scriptLifecycle="uninstallScript" />
+    </sequential>
+    <sequential agent="a4" mountPoint="/self/upgrade">
+      <leaf agent="a4" fabric="f1" mountPoint="/self/upgrade" name="TODO script action: uninstall" scriptTransition="uninstall" />
+      <leaf agent="a4" fabric="f1" mountPoint="/self/upgrade" name="TODO script lifecycle: uninstallScript" scriptLifecycle="uninstallScript" />
+    </sequential>
+  </parallel>
+</plan>
+"""
+    assertEquals(expectedXml, cleanup.toXml())
+    assertEquals(5, cleanup.leafStepsCount) // 3 leaves for a3 + 2 leaves for a4
+  }
+
+  /**
+   * Stores <code>currentSystemModel</code> as the current model of its fabric, then computes the
+   * self upgrade plans (version 'v1', coordinates 'tar1') for <code>agents</code>.
+   *
+   * @return the first computed plan or <code>null</code> when no plan was computed
+   */
+  private Plan<ActionDescriptor> upgradePlan(SystemModel currentSystemModel,
+                                             Collection<String> agents,
+                                             Type type)
+  {
+    def fabricName = currentSystemModel.fabric
+    currentModels.put(fabricName, currentSystemModel)
+
+    def fabric = fabricService.findFabric(fabricName)
+    def upgradeParams =
+      [version: 'v1', coordinates: 'tar1', type: type, agents: agents, fabric: fabric]
+
+    Collection<Plan<ActionDescriptor>> computed =
+      deploymentService.computeAgentsUpgradePlan(upgradeParams, [name: 'self upgrade'])
+
+    return computed.isEmpty() ? null : computed[0]
+  }
+
+  /**
+   * Stores <code>currentSystemModel</code> as the current model of its fabric, then computes the
+   * cleanup plans against <code>expectedSystemModel</code>.
+   * @return the first computed plan or <code>null</code> when no plan was computed
+   */
+  private Plan<ActionDescriptor> cleanupPlan(SystemModel expectedSystemModel,
+                                             SystemModel currentSystemModel, Type type)
+  {
+    currentModels.put(currentSystemModel.fabric, currentSystemModel)
+
+    def cleanupParams = [system: expectedSystemModel, type: type]
+    Collection<Plan<ActionDescriptor>> computed =
+      deploymentService.computeAgentsCleanupUpgradePlan(cleanupParams, null)
+
+    return computed.isEmpty() ? null : computed[0]
+  }
+
+  /**
+   * Builds a model for fabric "f1" made of the given entries (external representation maps).
+   */
+  private SystemModel m(Map... entries)
+  {
+    SystemModel systemModel = new SystemModel(fabric: "f1")
+    for(entry in entries)
+      systemModel.addEntry(SystemEntry.fromExternalRepresentation(entry))
+
+    return systemModel
+  }
+}
\ No newline at end of file
diff --git a/orchestration/org.linkedin.glu.orchestration-engine/src/test/groovy/test/orchestration/engine/planner/TestPlannerImpl.groovy b/orchestration/org.linkedin.glu.orchestration-engine/src/test/groovy/test/orchestration/engine/planner/TestPlannerImpl.groovy
index 7d16ab7b..38073dc3 100644
--- a/orchestration/org.linkedin.glu.orchestration-engine/src/test/groovy/test/orchestration/engine/planner/TestPlannerImpl.groovy
+++ b/orchestration/org.linkedin.glu.orchestration-engine/src/test/groovy/test/orchestration/engine/planner/TestPlannerImpl.groovy
@@ -26,7 +26,7 @@ import org.linkedin.glu.orchestration.engine.delta.SystemModelDelta
 import org.linkedin.glu.orchestration.engine.delta.DeltaMgr
 import org.linkedin.glu.orchestration.engine.delta.impl.DeltaMgrImpl
 import org.linkedin.groovy.util.json.JsonUtils
-import org.linkedin.glu.orchestration.engine.planner.impl.Transitions
+import org.linkedin.glu.orchestration.engine.planner.impl.TransitionPlan
 import org.linkedin.glu.orchestration.engine.planner.impl.Transition
 
 /**
@@ -761,7 +761,7 @@ public class TestPlannerImpl extends GroovyTestCase
    * Computes the digraph of the transitions
    * (to render with <code>dot -Tpdf < out of this method</code>)
    */
-  private static String digraph(Transitions transitions)
+  private static String digraph(TransitionPlan transitions)
   {
     String graph = new TreeMap(transitions.transitions).values().collect { Transition t ->
       t.executeBefore.sort().collect { String key ->
@@ -772,14 +772,14 @@ public class TestPlannerImpl extends GroovyTestCase
     "digraph delta {\n${graph}\n}"
   }
 
-  private static String toStringAfter(Transitions transitions)
+  private static String toStringAfter(TransitionPlan transitions)
   {
     JsonUtils.toJSON(new TreeMap(transitions.transitions).values().collect { Transition t ->
       "${t.key} -> ${t.executeAfter.sort()}"
     }).toString(2)
   }
 
-  private static String toStringBefore(Transitions transitions)
+  private static String toStringBefore(TransitionPlan transitions)
   {
     JsonUtils.toJSON(new TreeMap(transitions.transitions).values().collect { Transition t ->
       "${t.key} -> ${t.executeBefore.sort()}"