Skip to content

Commit

Permalink
Refine cache stats: distinguish legit misses from miss errors
Browse files Browse the repository at this point in the history
Cache misses due to failures turn out to be fairly frequent, at least judging from the logs.
Distinguishing legitimate misses from errors would help us quantify this and take appropriate
action.

This PR attaches a cause to each miss.

Testing Done:
https://travis-ci.org/peiyuwang/pants/builds/93991253

Bugs closed: 2638

Reviewed at https://rbcommons.com/s/twitter/r/3190/
  • Loading branch information
peiyuwang authored and ericzundel committed Dec 2, 2015
1 parent 6285e4f commit fb30d2d
Show file tree
Hide file tree
Showing 9 changed files with 169 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ def check_cache(vts):
"""
if not self.artifact_cache_reads_enabled():
return False
cached_vts, uncached_vts = self.check_artifact_cache([vts])
cached_vts, _, _ = self.check_artifact_cache([vts])
if not cached_vts:
self.context.log.debug('Missed cache during double check for {}'
.format(vts.target.address.spec))
Expand Down
1 change: 1 addition & 0 deletions src/python/pants/cache/restful_artifact_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def use_cached_files(self, cache_key, results_dir=None):
return self._localcache.store_and_use_artifact(cache_key, byte_iter, results_dir)
except Exception as e:
logger.warn('\nError while reading from remote artifact cache: {0}\n'.format(e))
# TODO(peiyu): clean up partially downloaded local file if any
return UnreadableArtifact(cache_key, e)

return False
Expand Down
2 changes: 1 addition & 1 deletion src/python/pants/goal/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ python_library(
]
)

# why is this in goal?
# this is in goal because of run_tracker
python_library(
name = 'artifact_cache_stats',
sources = ['artifact_cache_stats.py'],
Expand Down
30 changes: 21 additions & 9 deletions src/python/pants/goal/artifact_cache_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import os
from collections import defaultdict, namedtuple

from pants.cache.artifact_cache import UnreadableArtifact
from pants.util.dirutil import safe_mkdir


Expand All @@ -28,11 +29,11 @@ def init_stat():
safe_mkdir(self._dir)

def add_hits(self, cache_name, targets):
self._add_stat(0, cache_name, targets)
self._add_stat(0, cache_name, targets, None)

# any cache misses, whether legit or due to error.
def add_misses(self, cache_name, targets):
self._add_stat(1, cache_name, targets)
# any cache misses, each target is paired with its cause for the miss.
def add_misses(self, cache_name, targets, causes):
self._add_stat(1, cache_name, targets, causes)

def get_all(self):
"""Returns the cache stats as a list of dicts."""
Expand All @@ -48,12 +49,23 @@ def get_all(self):
return ret

# hit_or_miss is the appropriate index in CacheStat, i.e., 0 for hit, 1 for miss.
def _add_stat(self, hit_or_miss, cache_name, targets):
for tgt in targets:
self.stats_per_cache[cache_name][hit_or_miss].append(tgt.address.reference())
def _add_stat(self, hit_or_miss, cache_name, targets, causes):
def format_vts(tgt, cause):
"""Format into (target, cause) tuple."""
target_address = tgt.address.reference()
if isinstance(cause, UnreadableArtifact):
return (target_address, cause.err.message)
elif cause == False:
return (target_address, 'uncached')
else:
return (target_address, '')

causes = causes or [True] * len(targets)
target_with_causes = [format_vts(tgt, cause) for tgt, cause in zip(targets, causes)]
self.stats_per_cache[cache_name][hit_or_miss].extend(target_with_causes)
suffix = 'misses' if hit_or_miss else 'hits'
if self._dir and os.path.exists(self._dir): # Check existence in case of a clean-all.
suffix = 'misses' if hit_or_miss else 'hits'
with open(os.path.join(self._dir, '{}.{}'.format(cache_name, suffix)), 'a') as f:
f.write('\n'.join([tgt.address.reference() for tgt in targets]))
f.write('\n'.join([' '.join(target_with_cause).strip()
for target_with_cause in target_with_causes]))
f.write('\n')
7 changes: 5 additions & 2 deletions src/python/pants/reporting/html_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,12 +285,15 @@ def fix_detail_id(e, _id):

msg_elements = []
for cache_name, stat in artifact_cache_stats.stats_per_cache.items():
# TODO consider display causes for hit/miss targets
hit_targets = [tgt for tgt, cause in stat.hit_targets]
miss_targets = [tgt for tgt, cause in stat.miss_targets]
msg_elements.extend([
cache_name + ' artifact cache: ',
# Explicitly set the detail ids, so their displayed/hidden state survives a refresh.
fix_detail_id(items_to_report_element(stat.hit_targets, 'hit'), 'cache-hit-details'),
fix_detail_id(items_to_report_element(hit_targets, 'hit'), 'cache-hit-details'),
', ',
fix_detail_id(items_to_report_element(stat.miss_targets, 'miss'), 'cache-miss-details'),
fix_detail_id(items_to_report_element(miss_targets, 'miss'), 'cache-miss-details'),
'.'
])
if not msg_elements:
Expand Down
51 changes: 27 additions & 24 deletions src/python/pants/task/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@
from __future__ import (absolute_import, division, generators, nested_scopes, print_function,
unicode_literals, with_statement)

import itertools
import os
import sys
from abc import abstractmethod
from contextlib import contextmanager
from hashlib import sha1

from twitter.common.collections.orderedset import OrderedSet
from itertools import chain, repeat

from pants.base.exceptions import TaskError
from pants.base.fingerprint_strategy import TaskIdentityFingerprintStrategy
Expand Down Expand Up @@ -349,7 +347,7 @@ def invalidated(self,

if invalidation_check.invalid_vts and self.artifact_cache_reads_enabled():
with self.context.new_workunit('cache'):
cached_vts, uncached_vts = \
cached_vts, uncached_vts, uncached_causes = \
self.check_artifact_cache(self.check_artifact_cache_for(invalidation_check))
if cached_vts:
cached_targets = [vt.target for vt in cached_vts]
Expand All @@ -360,7 +358,8 @@ def invalidated(self,
if uncached_vts:
uncached_targets = [vt.target for vt in uncached_vts]
self.context.run_tracker.artifact_cache_stats.add_misses(cache_manager.task_name,
uncached_targets)
uncached_targets,
uncached_causes)
if not silent:
self._report_targets('No cached artifacts for ', uncached_targets, '.')
# Now that we've checked the cache, re-partition whatever is still invalid.
Expand Down Expand Up @@ -435,8 +434,10 @@ def check_artifact_cache_for(self, invalidation_check):
def check_artifact_cache(self, vts):
"""Checks the artifact cache for the specified list of VersionedTargetSets.
Returns a pair (cached, uncached) of VersionedTargets that were
satisfied/unsatisfied from the cache.
Returns a tuple (cached, uncached, uncached_causes) of VersionedTargets that were
satisfied/unsatisfied from the cache. Uncached VTS are also attached with their
causes for the miss: `False` indicates a legit miss while `UnreadableArtifact`
is due to either local or remote cache failures.
"""
return self.do_check_artifact_cache(vts)

Expand All @@ -447,36 +448,38 @@ def do_check_artifact_cache(self, vts, post_process_cached_vts=None):
satisfied/unsatisfied from the cache.
"""
if not vts:
return [], []

cached_vts = []
uncached_vts = OrderedSet(vts)
return [], [], []

read_cache = self._cache_factory.get_read_cache()
items = [(read_cache, vt.cache_key, vt.results_dir if vt.has_results_dir else None)
for vt in vts]

res = self.context.subproc_map(call_use_cached_files, items)

for vt, was_in_cache in zip(vts, res):
if was_in_cache:
cached_vts.append(vt)
uncached_vts.discard(vt)
elif isinstance(was_in_cache, UnreadableArtifact):
self._cache_key_errors.add(was_in_cache.key)

self._maybe_create_results_dirs(vts)

cached_vts = []
uncached_vts = []
uncached_causes = []

# Note that while the input vts may represent multiple targets (for tasks that overrride
# check_artifact_cache_for), the ones we return must represent single targets.
def flatten(vts):
return list(itertools.chain.from_iterable([vt.versioned_targets for vt in vts]))
all_cached_vts, all_uncached_vts = flatten(cached_vts), flatten(uncached_vts)
# Once flattened, cached/uncached vts are in separate lists. Each uncached vts is paired
# with why it is missed for stat reporting purpose.
for vt, was_in_cache in zip(vts, res):
if was_in_cache:
cached_vts.extend(vt.versioned_targets)
else:
uncached_vts.extend(vt.versioned_targets)
uncached_causes.extend(repeat(was_in_cache, len(vt.versioned_targets)))
if isinstance(was_in_cache, UnreadableArtifact):
self._cache_key_errors.update(was_in_cache.key)

if post_process_cached_vts:
post_process_cached_vts(all_cached_vts)
for vt in all_cached_vts:
post_process_cached_vts(cached_vts)
for vt in cached_vts:
vt.update()
return all_cached_vts, all_uncached_vts
return cached_vts, uncached_vts, uncached_causes

def update_artifact_cache(self, vts_artifactfiles_pairs):
"""Write to the artifact cache, if we're configured to.
Expand Down
2 changes: 1 addition & 1 deletion tests/python/pants_test/base/context_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class DummyRunTracker(object):
class DummyArtifactCacheStats(object):
def add_hits(self, cache_name, targets): pass

def add_misses(self, cache_name, targets): pass
def add_misses(self, cache_name, targets, causes): pass

artifact_cache_stats = DummyArtifactCacheStats()

Expand Down
30 changes: 27 additions & 3 deletions tests/python/pants_test/goal/BUILD
Original file line number Diff line number Diff line change
@@ -1,9 +1,33 @@
# Copyright 2015 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

target(
name = 'goal',
dependencies = [
':artifact_cache_stats',
':other',
]
)

python_tests(
name='goal',
sources=globs('*.py'),
name='artifact_cache_stats',
sources= ['test_artifact_cache_stats.py'],
dependencies=[
'src/python/pants/cache',
'src/python/pants/goal:artifact_cache_stats',
'src/python/pants/util:contextutil',
'tests/python/pants_test:base_test',
]
)

python_tests(
name='other',
sources=[
'test_context.py',
'test_products.py',
'test_run_tracker.py',
'test_union_products.py',
],
dependencies=[
'3rdparty/python/twitter/commons:twitter.common.collections',
'src/python/pants/build_graph',
Expand All @@ -13,4 +37,4 @@ python_tests(
'src/python/pants/util:dirutil',
'tests/python/pants_test:base_test',
]
)
)
85 changes: 85 additions & 0 deletions tests/python/pants_test/goal/test_artifact_cache_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# coding=utf-8
# Copyright 2015 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

from __future__ import (absolute_import, division, generators, nested_scopes, print_function,
unicode_literals, with_statement)

import os
from contextlib import contextmanager

import requests

from pants.cache.artifact import ArtifactError
from pants.cache.artifact_cache import UnreadableArtifact
from pants.goal.artifact_cache_stats import ArtifactCacheStats
from pants.util.contextutil import temporary_dir
from pants_test.base_test import BaseTest


class ArtifactCacheStatsTest(BaseTest):
  """Tests ArtifactCacheStats hit/miss accounting, including per-miss causes.

  Misses may carry a cause: an UnreadableArtifact wrapping either a local
  error (e.g. a CRC failure) or a remote one (e.g. a connection error).
  """

  TEST_CACHE_NAME_1 = 'ZincCompile'
  TEST_CACHE_NAME_2 = 'Checkstyle_test_checkstyle'
  TEST_LOCAL_ERROR = UnreadableArtifact('foo', ArtifactError('CRC check failed'))
  TEST_REMOTE_ERROR = UnreadableArtifact('bar', requests.exceptions.ConnectionError('Read time out'))
  TEST_SPEC_A = 'src/scala/a'
  TEST_SPEC_B = 'src/scala/b'
  TEST_SPEC_C = 'src/java/c'

  def setUp(self):
    super(ArtifactCacheStatsTest, self).setUp()

    self.target_a = self.make_target(spec=self.TEST_SPEC_A)
    self.target_b = self.make_target(spec=self.TEST_SPEC_B)
    self.target_c = self.make_target(spec=self.TEST_SPEC_C)

  def test_add_hits(self):
    # NOTE: despite the name, this exercises both hits and misses, verifying
    # that each miss is recorded together with its cause's error message.
    expected_stats = [
      {
        'cache_name': self.TEST_CACHE_NAME_2,
        'num_hits': 0,
        'num_misses': 1,
        'hits': [],
        'misses': [(self.TEST_SPEC_A, self.TEST_LOCAL_ERROR.err.message)]
      },
      {
        'cache_name': self.TEST_CACHE_NAME_1,
        'num_hits': 1,
        'num_misses': 1,
        'hits': [(self.TEST_SPEC_B, '')],
        'misses': [(self.TEST_SPEC_C, self.TEST_REMOTE_ERROR.err.message)]
      },
    ]

    expected_hit_or_miss_files = {
      '{}.misses'.format(self.TEST_CACHE_NAME_2):
        '{} {}\n'.format(self.TEST_SPEC_A, self.TEST_LOCAL_ERROR.err.message),
      '{}.hits'.format(self.TEST_CACHE_NAME_1):
        '{}\n'.format(self.TEST_SPEC_B),
      '{}.misses'.format(self.TEST_CACHE_NAME_1):
        '{} {}\n'.format(self.TEST_SPEC_C, self.TEST_REMOTE_ERROR.err.message),
    }

    with self.mock_artifact_cache_stats(expected_stats,
                                        expected_hit_or_miss_files=expected_hit_or_miss_files)\
        as artifact_cache_stats:
      artifact_cache_stats.add_hits(self.TEST_CACHE_NAME_1, [self.target_b])
      artifact_cache_stats.add_misses(self.TEST_CACHE_NAME_1, [self.target_c],
                                      [self.TEST_REMOTE_ERROR])
      artifact_cache_stats.add_misses(self.TEST_CACHE_NAME_2, [self.target_a],
                                      [self.TEST_LOCAL_ERROR])

  @contextmanager
  def mock_artifact_cache_stats(self,
                                expected_stats,
                                expected_hit_or_miss_files=None):
    """Yield an ArtifactCacheStats backed by a temp dir; verify expectations on exit.

    :param expected_stats: stats expected from get_all() after the caller's updates.
    :param expected_hit_or_miss_files: optional map of file name -> expected file
      contents. When omitted, the on-disk hit/miss files are not checked.
    """
    with temporary_dir() as tmp_dir:
      artifact_cache_stats = ArtifactCacheStats(tmp_dir)
      yield artifact_cache_stats
      self.assertEquals(expected_stats, artifact_cache_stats.get_all())

      # Bug fix: the original dereferenced expected_hit_or_miss_files
      # unconditionally, so passing the declared default of None raised
      # AttributeError. Only check files when expectations were supplied.
      if expected_hit_or_miss_files is not None:
        self.assertEquals(sorted(expected_hit_or_miss_files),
                          sorted(os.listdir(tmp_dir)))
        for hit_or_miss_file, expected_content in expected_hit_or_miss_files.items():
          with open(os.path.join(tmp_dir, hit_or_miss_file)) as hit_or_miss_saved:
            self.assertEquals(expected_content, hit_or_miss_saved.read())

0 comments on commit fb30d2d

Please sign in to comment.