Skip to content
This repository has been archived by the owner on May 6, 2020. It is now read-only.

Commit

Permalink
Handle cases with namespace quotas limits sets (#1182)
Browse files Browse the repository at this point in the history
* feat(resourcequota): Handle cases with namespace quotas limits sets

Handle 2 cases:
* User did not set quota for applications or set it incorrectly
* User tries to scale the application when limits are already exceeded (overuse)

* Don't raise RuntimeError if no events in namespace

* Add test for _handle_not_ready_pods function

* Fix indents required by flake8

* Use replicaset events for handle quota exceed cases

* Don't try to wait for pods started if we have failed events in ReplicaSet
  • Loading branch information
rvadim authored and Matthew Fisher committed Feb 8, 2017
1 parent 8c8b5ab commit 9c2d584
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 4 deletions.
2 changes: 1 addition & 1 deletion rootfs/scheduler/mock.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,7 @@ def filter_data(filters, path):
continue

# check if item has labels
if 'labels' not in item['metadata']:
if 'labels' not in item['metadata'] and item['kind'] != 'Event':
continue

# Do extra filtering based on labelSelector
Expand Down
42 changes: 42 additions & 0 deletions rootfs/scheduler/resources/deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,12 @@ def in_progress(self, namespace, name, timeout, batches, replicas, tags):
self.log(namespace, 'Deploy operation for Deployment {} in has expired. Rolling back to last good known release'.format(name), level='DEBUG') # noqa
return False, True

try:
self._check_for_failed_events(namespace, labels=labels)
except KubeException as e:
self.log(namespace, e)
return False, True

return True, False

def are_replicas_ready(self, namespace, name):
Expand Down Expand Up @@ -326,6 +332,9 @@ def wait_until_ready(self, namespace, name, **kwargs):
timeout = len(batches) * deploy_timeout
self.log(namespace, 'This deployments overall timeout is {}s - batch timout is {}s and there are {} batches to deploy with a total of {} pods'.format(timeout, deploy_timeout, len(batches), replicas)) # noqa

# check for failed events(when quota exceeded for example)
self._check_for_failed_events(namespace, labels=labels)

waited = 0
while waited < timeout:
ready, availablePods = self.are_replicas_ready(namespace, name)
Expand All @@ -352,6 +361,39 @@ def wait_until_ready(self, namespace, name, **kwargs):
if not ready:
self.pod._handle_not_ready_pods(namespace, labels)

def _check_for_failed_events(self, namespace, labels):
    """
    Fetch the Deployment's ReplicaSet (selected by labels) and inspect the
    events involving that ReplicaSet for creation failures.

    Args:
        namespace: namespace the Deployment lives in
        labels: label selector used to look up the Deployment's ReplicaSet

    Raises:
        KubeException: when the ReplicaSet has at least one event with
            reason 'FailedCreate' (for example when a namespace resource
            quota is exceeded and pods cannot be created)
    """
    response = self.rs.get(namespace, labels=labels)
    replicasets = response.json().get('items') or []
    if not replicasets:
        # No ReplicaSet yet (e.g. controller has not caught up) -
        # nothing to inspect, so nothing can have failed
        return

    # Narrow the event query to the one ReplicaSet instance via its
    # name and uid so unrelated events in the namespace are excluded
    metadata = replicasets[0]['metadata']
    fields = {
        'involvedObject.kind': 'ReplicaSet',
        'involvedObject.name': metadata['name'],
        'involvedObject.namespace': namespace,
        'involvedObject.uid': metadata['uid'],
    }
    events = self.ns.events(namespace, fields=fields).json().get('items') or []
    if any(event['reason'] == 'FailedCreate' for event in events):
        # Log and surface every event message for context, not just
        # the one that triggered the failure
        log = self._get_formatted_messages(events)
        self.log(namespace, log)
        raise KubeException(log)

@staticmethod
def _get_formatted_messages(events):
"""
Format each event by string and join all events to one string
"""
message_format = 'Message:{message}, lastTimestamp:{lastTimestamp}, reason: {reason}, count: {count}' # noqa
output = []
for event in events:
output.append(message_format.format(**event))
return '\n'.join(output)

def _get_deploy_steps(self, batches, tags):
# if there is no batch information available default to available nodes for app
if not batches:
Expand Down
44 changes: 44 additions & 0 deletions rootfs/scheduler/resources/events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from scheduler.exceptions import KubeHTTPException
from scheduler.resources import Resource
from datetime import datetime
import uuid

DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'


class Events(Resource):
    """
    Events resource.
    Warning! Used ONLY for testing purposes - lets tests inject Event
    objects into the scheduler mock.
    """
    short_name = 'ev'

    def create(self, namespace, name, message, **kwargs):
        """
        Create an Event in the given namespace.

        Args:
            namespace: namespace to create the Event in
            name: Event object name
            message: human-readable event message
            **kwargs: optional 'count', 'resourceVersion', 'type',
                'reason', 'component' and 'involvedObject' fields

        Returns:
            the HTTP response of the POST

        Raises:
            KubeHTTPException: if the API did not answer 201 Created
        """
        url = self.api('/namespaces/{}/events'.format(namespace))
        # Compute the timestamp once so creationTimestamp, firstTimestamp
        # and lastTimestamp agree; use UTC to match the trailing 'Z'
        # (Zulu time) in DATETIME_FORMAT
        now = datetime.utcnow().strftime(DATETIME_FORMAT)
        data = {
            'kind': 'Event',
            'apiVersion': 'v1',
            'count': kwargs.get('count', 1),
            'metadata': {
                'creationTimestamp': now,
                'namespace': namespace,
                'name': name,
                'resourceVersion': kwargs.get('resourceVersion', ''),
                'uid': str(uuid.uuid4()),
            },
            'message': message,
            'type': kwargs.get('type', 'Normal'),
            'firstTimestamp': now,
            'lastTimestamp': now,
            'reason': kwargs.get('reason', ''),
            'source': {
                'component': kwargs.get('component', ''),
            },
            'involvedObject': kwargs.get('involvedObject', {})
        }

        response = self.http_post(url, json=data)
        if response.status_code != 201:
            raise KubeHTTPException(response, 'create Event for namespace {}'.format(namespace))  # noqa

        return response
2 changes: 0 additions & 2 deletions rootfs/scheduler/resources/pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,8 +719,6 @@ def _handle_not_ready_pods(self, namespace, labels):
message = "\n".join([x.strip() for x in event['message'].split("\n")])
raise KubeException(message)

return None

def deploy_probe_timeout(self, timeout, namespace, labels, containers):
"""
Added in additional timeouts based on readiness and liveness probe
Expand Down
25 changes: 24 additions & 1 deletion rootfs/scheduler/tests/test_deployments.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Run the tests with './manage.py test scheduler'
"""
from scheduler import KubeHTTPException
from scheduler import KubeHTTPException, KubeException
from scheduler.tests import TestCase
from scheduler.utils import generate_random_name

Expand Down Expand Up @@ -240,3 +240,26 @@ def test_get_deployment_replicaset(self):
data['metadata']['labels'],
data
)

def test_check_for_failed_events(self):
    """
    A 'FailedCreate' event attached to a Deployment's ReplicaSet should
    make _check_for_failed_events raise KubeException carrying the
    event message.
    """
    deploy_name = self.create(self.namespace)
    deployment = self.scheduler.deployment.get(self.namespace, deploy_name).json()
    labels = deployment['metadata']['labels']
    response = self.scheduler.rs.get(self.namespace, labels=labels)
    rs = response.json()
    # Target the Deployment's ReplicaSet so the field selector used by
    # _check_for_failed_events matches this event
    involved_object = {
        'involvedObject.kind': 'ReplicaSet',
        'involvedObject.name': rs['items'][0]['metadata']['name'],
        'involvedObject.namespace': self.namespace,
        'involvedObject.uid': rs['items'][0]['metadata']['uid'],
    }
    message = 'Quota exceeded'
    # Events.create reads the 'involvedObject' kwarg (the previous
    # 'involved_object' spelling was silently ignored)
    self.scheduler.ev.create(self.namespace,
                             '{}'.format(generate_random_name()),
                             message,
                             type='Warning',
                             involvedObject=involved_object,
                             reason='FailedCreate')
    with self.assertRaisesRegex(KubeException,
                                'Message:{}.*'.format(message)):
        self.scheduler.deployment._check_for_failed_events(self.namespace,
                                                           labels=labels)  # noqa

0 comments on commit 9c2d584

Please sign in to comment.