From 4dfb3c43968eddc5612b895700eca7bba326d0a4 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Sun, 1 Mar 2020 16:10:58 +0100 Subject: [PATCH 01/26] Revert "[NFC][Python: black] Reformatted the benchmark Python sources using utils/python_format.py." --- .flake8 | 4 +- benchmark/scripts/Benchmark_DTrace.in | 73 +- benchmark/scripts/Benchmark_Driver | 719 +++++-------- benchmark/scripts/Benchmark_GuardMalloc.in | 23 +- benchmark/scripts/Benchmark_QuickCheck.in | 68 +- .../scripts/Benchmark_RuntimeLeaksRunner.in | 100 +- benchmark/scripts/build_linux.py | 50 +- benchmark/scripts/build_script_helper.py | 50 +- benchmark/scripts/compare_perf_tests.py | 506 ++++----- benchmark/scripts/create_benchmark.py | 74 +- .../generate_harness/generate_harness.py | 19 +- .../perf_test_driver/perf_test_driver.py | 78 +- benchmark/scripts/run_smoke_bench | 249 ++--- benchmark/scripts/test_Benchmark_Driver.py | 980 +++++++----------- benchmark/scripts/test_compare_perf_tests.py | 842 +++++++-------- benchmark/scripts/test_utils.py | 17 +- benchmark/utils/convertToJSON.py | 14 +- 17 files changed, 1587 insertions(+), 2279 deletions(-) diff --git a/.flake8 b/.flake8 index 549c9aa559695..39188cae5c115 100644 --- a/.flake8 +++ b/.flake8 @@ -6,12 +6,11 @@ filename = ./benchmark/scripts/Benchmark_Driver, ./benchmark/scripts/Benchmark_DTrace.in, ./benchmark/scripts/Benchmark_GuardMalloc.in, - ./benchmark/scripts/Benchmark_QuickCheck.in, ./benchmark/scripts/Benchmark_RuntimeLeaksRunner.in, - ./benchmark/scripts/run_smoke_bench, ./docs/scripts/ns-html2rst, + ./test/Driver/Inputs/fake-toolchain/clang++, ./test/Driver/Inputs/fake-toolchain/ld, ./utils/80+-check, @@ -31,6 +30,7 @@ filename = ./utils/recursive-lipo, ./utils/round-trip-syntax-test, ./utils/rth, + ./utils/run-remote, ./utils/run-test, ./utils/scale-test, ./utils/submit-benchmark-results, diff --git a/benchmark/scripts/Benchmark_DTrace.in b/benchmark/scripts/Benchmark_DTrace.in index 300291813b96d..273c538cd650f 100644 --- a/benchmark/scripts/Benchmark_DTrace.in +++ b/benchmark/scripts/Benchmark_DTrace.in @@ -19,17 +19,20 @@ import sys DRIVER_LIBRARY_PATH = "@PATH_TO_DRIVER_LIBRARY@" sys.path.append(DRIVER_LIBRARY_PATH) -DTRACE_PATH = os.path.join(DRIVER_LIBRARY_PATH, "swift_stats.d") +DTRACE_PATH = os.path.join(DRIVER_LIBRARY_PATH, 'swift_stats.d') import perf_test_driver # noqa (E402 module level import not at top of file) # Regexes for the XFAIL_LIST. 
Matches against '([Onone|O|Osize],TestName)' -XFAIL_LIST = [] +XFAIL_LIST = [ +] class DTraceResult(perf_test_driver.Result): + def __init__(self, name, status, output, csv_output): - perf_test_driver.Result.__init__(self, name, status, output, XFAIL_LIST) + perf_test_driver.Result.__init__( + self, name, status, output, XFAIL_LIST) self.csv_output = csv_output def is_failure(self): @@ -37,38 +40,40 @@ class DTraceResult(perf_test_driver.Result): @classmethod def data_headers(cls): - return ["Name", "Result", "Total RR Opts", "Total RR Opts/Iter"] + return [ + 'Name', 'Result', 'Total RR Opts', 'Total RR Opts/Iter'] @classmethod def data_format(cls, max_test_len): non_name_headers = DTraceResult.data_headers()[1:] - fmt = ("{:<%d}" % (max_test_len + 5)) + "".join( - ["{:<%d}" % (len(h) + 2) for h in non_name_headers] - ) + fmt = ('{:<%d}' % (max_test_len + 5)) + \ + ''.join(['{:<%d}' % (len(h) + 2) for h in non_name_headers]) return fmt @classmethod def print_data_header(cls, max_test_len, csv_output): headers = cls.data_headers() if csv_output: - print(",".join(headers)) + print(','.join(headers)) return print(cls.data_format(max_test_len).format(*headers)) def print_data(self, max_test_len): result = [self.get_name(), self.get_result()] + map(str, self.output) if self.csv_output: - print(",".join(result)) + print(','.join(result)) return print(DTraceResult.data_format(max_test_len).format(*result)) class DTraceBenchmarkDriver(perf_test_driver.BenchmarkDriver): + def __init__(self, binary, xfail_list, csv_output): perf_test_driver.BenchmarkDriver.__init__( - self, binary, xfail_list, enable_parallel=True, opt_levels=["O"] - ) + self, binary, xfail_list, + enable_parallel=True, + opt_levels=['O']) self.csv_output = csv_output def print_data_header(self, max_test_len): @@ -78,37 +83,23 @@ class DTraceBenchmarkDriver(perf_test_driver.BenchmarkDriver): return {} def process_input(self, data): - test_name = "({}_{})".format(data["opt"], data["test_name"]) + test_name = '({}_{})'.format(data['opt'], data['test_name']) print("Running {}...".format(test_name)) sys.stdout.flush() def get_results_with_iters(iters): e = os.environ - e["SWIFT_DETERMINISTIC_HASHING"] = "1" - p = subprocess.Popen( - [ - "sudo", - "dtrace", - "-s", - DTRACE_PATH, - "-c", - "%s %s %s %s" - % ( - data["path"], - data["test_name"], - "--num-iters=%d" % iters, - "--num-samples=2", - ), - ], - stdout=subprocess.PIPE, - stderr=open("/dev/null", "w"), - env=e, - ) + e['SWIFT_DETERMINISTIC_HASHING'] = '1' + p = subprocess.Popen([ + 'sudo', 'dtrace', '-s', DTRACE_PATH, + '-c', '%s %s %s %s' % (data['path'], data['test_name'], + '--num-iters=%d' % iters, + '--num-samples=2') + ], stdout=subprocess.PIPE, stderr=open('/dev/null', 'w'), env=e) results = [x for x in p.communicate()[0].split("\n") if len(x) > 0] return [ - x.split(",")[1] for x in results[results.index("DTRACE RESULTS") + 1 :] - ] - + x.split(',')[1] for x in + results[results.index('DTRACE RESULTS') + 1:]] iter_2_results = get_results_with_iters(2) iter_3_results = get_results_with_iters(3) iter_5_results = get_results_with_iters(5) @@ -145,18 +136,16 @@ SWIFT_BIN_DIR = os.path.dirname(os.path.abspath(__file__)) def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( - "-filter", + '-filter', type=str, default=None, - help="Filter out any test that does not match the given regex", - ) + help='Filter out any test that does not match the given regex') parser.add_argument( - "--emit-csv", + '--emit-csv', default=False, - action="store_true", + 
action='store_true', help="Emit csv output", - dest="csv_output", - ) + dest='csv_output') return parser.parse_args() diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index 1e84738562bfe..31808852bcf22 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -61,22 +61,23 @@ class BenchmarkDriver(object): self.results = {} # Set a constant hash seed. Some tests are currently sensitive to # fluctuations in the number of hash collisions. - os.environ["SWIFT_DETERMINISTIC_HASHING"] = "1" + os.environ['SWIFT_DETERMINISTIC_HASHING'] = '1' def _invoke(self, cmd): - return self._subprocess.check_output(cmd, stderr=self._subprocess.STDOUT) + return self._subprocess.check_output( + cmd, stderr=self._subprocess.STDOUT) @property def test_harness(self): """Full path to test harness binary.""" - suffix = self.args.optimization if hasattr(self.args, "optimization") else "O" + suffix = (self.args.optimization if hasattr(self.args, 'optimization') + else 'O') return os.path.join(self.args.tests, "Benchmark_" + suffix) def _git(self, cmd): """Execute the Git command in the `swift-repo`.""" return self._invoke( - ("git -C {0} ".format(self.args.swift_repo) + cmd).split() - ).strip() + ('git -C {0} '.format(self.args.swift_repo) + cmd).split()).strip() @property def log_file(self): @@ -88,28 +89,27 @@ class BenchmarkDriver(object): return None log_dir = self.args.output_dir harness_name = os.path.basename(self.test_harness) - suffix = "-" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + suffix = '-' + time.strftime('%Y%m%d%H%M%S', time.localtime()) if self.args.swift_repo: log_dir = os.path.join( - log_dir, self._git("rev-parse --abbrev-ref HEAD") - ) # branch - suffix += "-" + self._git("rev-parse --short HEAD") # revision - return os.path.join(log_dir, harness_name + suffix + ".log") + log_dir, self._git('rev-parse --abbrev-ref HEAD')) # branch + suffix += '-' + self._git('rev-parse --short HEAD') # revision + return os.path.join(log_dir, harness_name + suffix + '.log') @property def _cmd_list_benchmarks(self): # Use tab delimiter for easier parsing to override the default comma. # (The third 'column' is always comma-separated list of tags in square # brackets -- currently unused here.) 
- return [self.test_harness, "--list", "--delim=\t"] + ( - ["--skip-tags="] if (self.args.benchmarks or self.args.filters) else [] - ) + return [self.test_harness, '--list', '--delim=\t'] + ( + ['--skip-tags='] if (self.args.benchmarks or + self.args.filters) else []) def _get_tests(self): """Return a list of performance tests to run.""" number_name_pairs = [ - line.split("\t")[:2] - for line in self._invoke(self._cmd_list_benchmarks).split("\n")[1:-1] + line.split('\t')[:2] for line in + self._invoke(self._cmd_list_benchmarks).split('\n')[1:-1] ] # unzip list of pairs into 2 lists test_numbers, self.all_tests = map(list, zip(*number_name_pairs)) @@ -122,79 +122,55 @@ class BenchmarkDriver(object): def _tests_matching_patterns(self): regexes = [re.compile(pattern) for pattern in self.args.filters] - return sorted( - list( - set( - [ - name - for pattern in regexes - for name in self.all_tests - if pattern.match(name) - ] - ) - ) - ) + return sorted(list(set([name for pattern in regexes + for name in self.all_tests + if pattern.match(name)]))) def _tests_by_name_or_number(self, test_numbers): benchmarks = set(self.args.benchmarks) number_to_name = dict(zip(test_numbers, self.all_tests)) - tests_by_number = [ - number_to_name[i] for i in benchmarks.intersection(set(test_numbers)) - ] - return sorted( - list(benchmarks.intersection(set(self.all_tests)).union(tests_by_number)) - ) - - def run( - self, - test=None, - num_samples=None, - num_iters=None, - sample_time=None, - verbose=None, - measure_memory=False, - quantile=None, - ): + tests_by_number = [number_to_name[i] + for i in benchmarks.intersection(set(test_numbers))] + return sorted(list(benchmarks + .intersection(set(self.all_tests)) + .union(tests_by_number))) + + def run(self, test=None, num_samples=None, num_iters=None, + sample_time=None, verbose=None, measure_memory=False, + quantile=None): """Execute benchmark and gather results.""" num_samples = num_samples or 0 num_iters = num_iters or 0 # automatically determine N to run for 1s sample_time = sample_time or 0 # default is 1s cmd = self._cmd_run( - test, num_samples, num_iters, sample_time, verbose, measure_memory, quantile - ) + test, num_samples, num_iters, sample_time, + verbose, measure_memory, quantile) output = self._invoke(cmd) results = self.parser.results_from_string(output) return results.items()[0][1] if test else results - def _cmd_run( - self, - test, - num_samples, - num_iters, - sample_time, - verbose, - measure_memory, - quantile, - ): + def _cmd_run(self, test, num_samples, num_iters, sample_time, + verbose, measure_memory, quantile): cmd = [self.test_harness] if test: cmd.append(test) else: - cmd.extend([self.test_number.get(name, name) for name in self.tests]) + cmd.extend([self.test_number.get(name, name) + for name in self.tests]) if num_samples > 0: - cmd.append("--num-samples={0}".format(num_samples)) + cmd.append('--num-samples={0}'.format(num_samples)) if num_iters > 0: - cmd.append("--num-iters={0}".format(num_iters)) + cmd.append('--num-iters={0}'.format(num_iters)) if sample_time > 0: - cmd.append("--sample-time={0}".format(sample_time)) + cmd.append('--sample-time={0}'.format(sample_time)) if verbose: - cmd.append("--verbose") + cmd.append('--verbose') if measure_memory: - cmd.append("--memory") + cmd.append('--memory') if quantile: - cmd.append("--quantile={0}".format(quantile)) - cmd.append("--delta") + cmd.append('--quantile={0}'.format(quantile)) + cmd.append('--delta') return cmd def run_independent_samples(self, test): @@ -202,18 +178,14 @@ 
class BenchmarkDriver(object): Returns the aggregated result of independent benchmark invocations. """ - def merge_results(a, b): a.merge(b) return a - return reduce( - merge_results, - [ - self.run(test, measure_memory=True, num_iters=1, quantile=20) - for _ in range(self.args.independent_samples) - ], - ) + return reduce(merge_results, + [self.run(test, measure_memory=True, + num_iters=1, quantile=20) + for _ in range(self.args.independent_samples)]) def log_results(self, output, log_file=None): """Log output to `log_file`. @@ -224,11 +196,11 @@ class BenchmarkDriver(object): dir = os.path.dirname(log_file) if not os.path.exists(dir): os.makedirs(dir) - print("Logging results to: %s" % log_file) - with open(log_file, "w") as f: + print('Logging results to: %s' % log_file) + with open(log_file, 'w') as f: f.write(output) - RESULT = "{:>3} {:<40} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}" + RESULT = '{:>3} {:<40} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}' def run_and_log(self, csv_console=True): """Run benchmarks and continuously log results to the console. @@ -240,41 +212,19 @@ class BenchmarkDriver(object): format is justified columns. """ format = ( - (lambda values: ",".join(values)) - if csv_console - else (lambda values: self.RESULT.format(*values)) - ) # justified columns + (lambda values: ','.join(values)) if csv_console else + (lambda values: self.RESULT.format(*values))) # justified columns def console_log(values): print(format(values)) def result_values(r): - return map( - str, - [ - r.test_num, - r.name, - r.num_samples, - r.min, - r.samples.q1, - r.median, - r.samples.q3, - r.max, - r.max_rss, - ], - ) - - header = [ - "#", - "TEST", - "SAMPLES", - "MIN(μs)", - "Q1(μs)", - "MEDIAN(μs)", - "Q3(μs)", - "MAX(μs)", - "MAX_RSS(B)", - ] + return map(str, [r.test_num, r.name, r.num_samples, r.min, + r.samples.q1, r.median, r.samples.q3, r.max, + r.max_rss]) + + header = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'Q1(μs)', 'MEDIAN(μs)', + 'Q3(μs)', 'MAX(μs)', 'MAX_RSS(B)'] console_log(header) results = [header] for test in self.tests: @@ -282,10 +232,10 @@ class BenchmarkDriver(object): console_log(result) results.append(result) - print("\nTotal performance tests executed: {0}".format(len(self.tests))) - return ( - None if csv_console else ("\n".join([",".join(r) for r in results]) + "\n") - ) # csv_log + print( + '\nTotal performance tests executed: {0}'.format(len(self.tests))) + return (None if csv_console else + ('\n'.join([','.join(r) for r in results]) + '\n')) # csv_log @staticmethod def run_benchmarks(args): @@ -305,31 +255,22 @@ class LoggingReportFormatter(logging.Formatter): """ import logging as log - - colors = { - log.DEBUG: "9", - log.INFO: "2", - log.WARNING: "3", - log.ERROR: "1", - log.CRITICAL: "5", - } + colors = {log.DEBUG: '9', log.INFO: '2', log.WARNING: '3', log.ERROR: '1', + log.CRITICAL: '5'} def __init__(self, use_color=False): """Specify if report should use colors; defaults to False.""" - super(LoggingReportFormatter, self).__init__("%(message)s") + super(LoggingReportFormatter, self).__init__('%(message)s') self.use_color = use_color def format(self, record): """Format the log record with level and category.""" msg = super(LoggingReportFormatter, self).format(record) - category = (record.name.split(".")[-1] + ": ") if "." 
in record.name else "" - return ( - "\033[1;3{0}m{1}{2}\033[1;0m".format( - self.colors[record.levelno], category, msg - ) - if self.use_color - else "{0} {1}{2}".format(record.levelname, category, msg) - ) + category = ((record.name.split('.')[-1] + ': ') if '.' in record.name + else '') + return ('\033[1;3{0}m{1}{2}\033[1;0m'.format( + self.colors[record.levelno], category, msg) if self.use_color else + '{0} {1}{2}'.format(record.levelname, category, msg)) class MarkdownReportHandler(logging.StreamHandler): @@ -343,34 +284,27 @@ class MarkdownReportHandler(logging.StreamHandler): """Initialize the handler and write a Markdown table header.""" super(MarkdownReportHandler, self).__init__(stream) self.setLevel(logging.INFO) - self.stream.write("\n✅ | Benchmark Check Report\n---|---") + self.stream.write('\n✅ | Benchmark Check Report\n---|---') self.stream.flush() - levels = { - logging.WARNING: "\n⚠️", - logging.ERROR: "\n⛔️", - logging.INFO: "
", - } - categories = {"naming": "🔤", "runtime": "⏱", "memory": "Ⓜ️"} + levels = {logging.WARNING: '\n⚠️', logging.ERROR: '\n⛔️', + logging.INFO: '
'} + categories = {'naming': '🔤', 'runtime': '⏱', 'memory': 'Ⓜ️'} quotes_re = re.compile("'") def format(self, record): msg = super(MarkdownReportHandler, self).format(record) - return ( - self.levels.get(record.levelno, "") - + ( - "" - if record.levelno == logging.INFO - else self.categories.get(record.name.split(".")[-1], "") + " | " - ) - + self.quotes_re.sub("`", msg) - ) + return (self.levels.get(record.levelno, '') + + ('' if record.levelno == logging.INFO else + self.categories.get(record.name.split('.')[-1], '') + ' | ') + + self.quotes_re.sub('`', msg)) def emit(self, record): msg = self.format(record) stream = self.stream try: - if isinstance(msg, unicode) and getattr(stream, "encoding", None): + if (isinstance(msg, unicode) and + getattr(stream, 'encoding', None)): stream.write(msg.encode(stream.encoding)) else: stream.write(msg) @@ -379,7 +313,7 @@ class MarkdownReportHandler(logging.StreamHandler): self.flush() def close(self): - self.stream.write("\n\n") + self.stream.write('\n\n') self.stream.flush() super(MarkdownReportHandler, self).close() @@ -394,10 +328,10 @@ class BenchmarkDoctor(object): consumption). """ - log = logging.getLogger("BenchmarkDoctor") - log_naming = log.getChild("naming") - log_runtime = log.getChild("runtime") - log_memory = log.getChild("memory") + log = logging.getLogger('BenchmarkDoctor') + log_naming = log.getChild('naming') + log_runtime = log.getChild('runtime') + log_memory = log.getChild('memory') log.setLevel(logging.DEBUG) def __init__(self, args, driver=None): @@ -409,25 +343,23 @@ class BenchmarkDoctor(object): self.driver = driver or BenchmarkDriver(args) self.results = {} - if hasattr(args, "markdown") and args.markdown: + if hasattr(args, 'markdown') and args.markdown: self.console_handler = MarkdownReportHandler(sys.stdout) else: self.console_handler = logging.StreamHandler(sys.stdout) self.console_handler.setFormatter( - LoggingReportFormatter(use_color=sys.stdout.isatty()) - ) - self.console_handler.setLevel( - logging.DEBUG if args.verbose else logging.INFO - ) + LoggingReportFormatter(use_color=sys.stdout.isatty())) + self.console_handler.setLevel(logging.DEBUG if args.verbose else + logging.INFO) self.log.addHandler(self.console_handler) - self.log.debug("Checking tests: %s", ", ".join(self.driver.tests)) + self.log.debug('Checking tests: %s', ', '.join(self.driver.tests)) self.requirements = [ self._name_matches_benchmark_naming_convention, self._name_is_at_most_40_chars_long, self._no_setup_overhead, self._reasonable_setup_time, self._optimized_runtime_in_range, - self._constant_memory_use, + self._constant_memory_use ] def __del__(self): @@ -436,122 +368,95 @@ class BenchmarkDoctor(object): handler.close() self.log.removeHandler(self.console_handler) - benchmark_naming_convention_re = re.compile(r"[A-Z][a-zA-Z0-9\-.!?]+") - camel_humps_re = re.compile(r"[a-z][A-Z]") + benchmark_naming_convention_re = re.compile(r'[A-Z][a-zA-Z0-9\-.!?]+') + camel_humps_re = re.compile(r'[a-z][A-Z]') @staticmethod def _name_matches_benchmark_naming_convention(measurements): - name = measurements["name"] + name = measurements['name'] match = BenchmarkDoctor.benchmark_naming_convention_re.match(name) - matched = match.group(0) if match else "" + matched = match.group(0) if match else '' composite_words = len(BenchmarkDoctor.camel_humps_re.findall(name)) + 1 if name != matched: BenchmarkDoctor.log_naming.error( - "'%s' name doesn't conform to benchmark naming convention.", name - ) - BenchmarkDoctor.log_naming.info("See 
http://bit.ly/BenchmarkNaming") + "'%s' name doesn't conform to benchmark naming convention.", + name) + BenchmarkDoctor.log_naming.info( + 'See http://bit.ly/BenchmarkNaming') if composite_words > 4: BenchmarkDoctor.log_naming.warning( - "'%s' name is composed of %d words.", name, composite_words - ) + "'%s' name is composed of %d words.", name, composite_words) BenchmarkDoctor.log_naming.info( "Split '%s' name into dot-separated groups and variants. " - "See http://bit.ly/BenchmarkNaming", - name, - ) + "See http://bit.ly/BenchmarkNaming", name) @staticmethod def _name_is_at_most_40_chars_long(measurements): - name = measurements["name"] + name = measurements['name'] if len(name) > 40: BenchmarkDoctor.log_naming.error( - "'%s' name is %d characters long.", name, len(name) - ) + "'%s' name is %d characters long.", name, len(name)) BenchmarkDoctor.log_naming.info( - "Benchmark name should not be longer than 40 characters." - ) + 'Benchmark name should not be longer than 40 characters.') @staticmethod - def _select(measurements, num_iters=None, opt_level="O"): - prefix = measurements["name"] + " " + opt_level - prefix += "" if num_iters is None else (" i" + str(num_iters)) - return [ - series for name, series in measurements.items() if name.startswith(prefix) - ] + def _select(measurements, num_iters=None, opt_level='O'): + prefix = measurements['name'] + ' ' + opt_level + prefix += '' if num_iters is None else (' i' + str(num_iters)) + return [series for name, series in measurements.items() + if name.startswith(prefix)] @staticmethod def _optimized_runtime_in_range(measurements): - name = measurements["name"] + name = measurements['name'] setup, ratio = BenchmarkDoctor._setup_overhead(measurements) setup = 0 if ratio < 0.05 else setup runtime = min( - [ - (result.samples.min - correction) - for i_series in [ - BenchmarkDoctor._select(measurements, num_iters=i) - for correction in [(setup / i) for i in [1, 2]] - ] - for result in i_series - ] - ) + [(result.samples.min - correction) for i_series in + [BenchmarkDoctor._select(measurements, num_iters=i) + for correction in [(setup / i) for i in [1, 2]] + ] for result in i_series]) threshold = 1000 if threshold < runtime: - log = ( - BenchmarkDoctor.log_runtime.warning - if runtime < 10000 - else BenchmarkDoctor.log_runtime.error - ) - caveat = "" if setup == 0 else " (excluding the setup overhead)" + log = (BenchmarkDoctor.log_runtime.warning if runtime < 10000 else + BenchmarkDoctor.log_runtime.error) + caveat = '' if setup == 0 else ' (excluding the setup overhead)' log("'%s' execution took at least %d μs%s.", name, runtime, caveat) def factor(base): # suitable divisior that's integer power of base - return int( - pow(base, math.ceil(math.log(runtime / float(threshold), base))) - ) + return int(pow(base, math.ceil( + math.log(runtime / float(threshold), base)))) BenchmarkDoctor.log_runtime.info( "Decrease the workload of '%s' by a factor of %d (%d), to be " - "less than %d μs.", - name, - factor(2), - factor(10), - threshold, - ) + "less than %d μs.", name, factor(2), factor(10), threshold) threshold = 20 if runtime < threshold: - log = ( - BenchmarkDoctor.log_runtime.error - if runtime == 0 - else BenchmarkDoctor.log_runtime.warning - ) + log = (BenchmarkDoctor.log_runtime.error if runtime == 0 else + BenchmarkDoctor.log_runtime.warning) log("'%s' execution took %d μs.", name, runtime) BenchmarkDoctor.log_runtime.info( "Ensure the workload of '%s' has a properly measurable size" " (runtime > %d μs) and is not eliminated by the compiler 
(use" - " `blackHole` function if necessary)." - if runtime == 0 - else "Increase the workload of '%s' to be more than %d μs.", - name, - threshold, - ) + " `blackHole` function if necessary)." if runtime == 0 else + "Increase the workload of '%s' to be more than %d μs.", + name, threshold) @staticmethod def _setup_overhead(measurements): select = BenchmarkDoctor._select - ti1, ti2 = [ - float(min(mins)) - for mins in [ - [result.samples.min for result in i_series] - for i_series in [select(measurements, num_iters=i) for i in [1, 2]] - ] - ] - setup = int(round(2.0 * (ti1 - ti2))) if ti2 > 20 else 0 # limit of accuracy + ti1, ti2 = [float(min(mins)) for mins in + [[result.samples.min for result in i_series] + for i_series in + [select(measurements, num_iters=i) for i in [1, 2]]]] + setup = (int(round(2.0 * (ti1 - ti2))) if ti2 > 20 # limit of accuracy + else 0) ratio = (setup / ti1) if ti1 > 0 else 0 return (setup, ratio) @@ -561,63 +466,52 @@ class BenchmarkDoctor(object): if ratio > 0.05: BenchmarkDoctor.log_runtime.error( "'%s' has setup overhead of %d μs (%.1f%%).", - measurements["name"], - setup, - round((100 * ratio), 1), - ) + measurements['name'], setup, round((100 * ratio), 1)) BenchmarkDoctor.log_runtime.info( - "Move initialization of benchmark data to the `setUpFunction` " - "registered in `BenchmarkInfo`." - ) + 'Move initialization of benchmark data to the `setUpFunction` ' + 'registered in `BenchmarkInfo`.') @staticmethod def _reasonable_setup_time(measurements): - setup = min([result.setup for result in BenchmarkDoctor._select(measurements)]) + setup = min([result.setup + for result in BenchmarkDoctor._select(measurements)]) if 200000 < setup: # 200 ms BenchmarkDoctor.log_runtime.error( - "'%s' setup took at least %d μs.", measurements["name"], setup - ) + "'%s' setup took at least %d μs.", + measurements['name'], setup) BenchmarkDoctor.log_runtime.info( - "The `setUpFunction` should take no more than 200 ms." 
- ) + 'The `setUpFunction` should take no more than 200 ms.') @staticmethod def _constant_memory_use(measurements): select = BenchmarkDoctor._select (min_i1, max_i1), (min_i2, max_i2) = [ - (min(memory_use), max(memory_use)) - for memory_use in [ - [r.mem_pages for r in i_series] - for i_series in [select(measurements, num_iters=i) for i in [1, 2]] - ] - ] + (min(memory_use), max(memory_use)) for memory_use in + [[r.mem_pages for r in i_series] for i_series in + [select(measurements, num_iters=i) for i in + [1, 2]]]] range_i1, range_i2 = max_i1 - min_i1, max_i2 - min_i2 normal_range = 15 # pages - name = measurements["name"] + name = measurements['name'] more_info = False if abs(min_i1 - min_i2) > max(range_i1, range_i2, normal_range): more_info = True BenchmarkDoctor.log_memory.error( "'%s' varies the memory footprint of the base " - "workload depending on the `num-iters`.", - name, - ) + "workload depending on the `num-iters`.", name) if max(range_i1, range_i2) > normal_range: more_info = True BenchmarkDoctor.log_memory.warning( "'%s' has very wide range of memory used between " - "independent, repeated measurements.", - name, - ) + "independent, repeated measurements.", name) if more_info: BenchmarkDoctor.log_memory.info( "'%s' mem_pages [i1, i2]: min=[%d, %d] 𝚫=%d R=[%d, %d]", name, - *[min_i1, min_i2, abs(min_i1 - min_i2), range_i1, range_i2] - ) + *[min_i1, min_i2, abs(min_i1 - min_i2), range_i1, range_i2]) @staticmethod def _adjusted_1s_samples(runtime): @@ -636,52 +530,38 @@ class BenchmarkDoctor(object): Returns a dictionary with benchmark name and `PerformanceTestResult`s. """ - self.log.debug("Calibrating num-samples for {0}:".format(benchmark)) - r = self.driver.run( - benchmark, num_samples=3, num_iters=1, verbose=True - ) # calibrate + self.log.debug('Calibrating num-samples for {0}:'.format(benchmark)) + r = self.driver.run(benchmark, num_samples=3, num_iters=1, + verbose=True) # calibrate num_samples = self._adjusted_1s_samples(r.samples.min) def capped(s): return min(s, 200) - run_args = [(capped(num_samples), 1), (capped(num_samples / 2), 2)] opts = self.driver.args.optimization opts = opts if isinstance(opts, list) else [opts] self.log.debug( - "Runtime {0} μs yields {1} adjusted samples per second.".format( - r.samples.min, num_samples - ) - ) + 'Runtime {0} μs yields {1} adjusted samples per second.'.format( + r.samples.min, num_samples)) self.log.debug( - "Measuring {0}, 5 x i1 ({1} samples), 5 x i2 ({2} samples)".format( - benchmark, run_args[0][0], run_args[1][0] - ) - ) + 'Measuring {0}, 5 x i1 ({1} samples), 5 x i2 ({2} samples)'.format( + benchmark, run_args[0][0], run_args[1][0])) measurements = dict( - [ - ( - "{0} {1} i{2}{3}".format(benchmark, o, i, suffix), - self.driver.run( - benchmark, - num_samples=s, - num_iters=i, - verbose=True, - measure_memory=True, - ), - ) - for o in opts - for s, i in run_args - for suffix in list("abcde") - ] + [('{0} {1} i{2}{3}'.format(benchmark, o, i, suffix), + self.driver.run(benchmark, num_samples=s, num_iters=i, + verbose=True, measure_memory=True)) + for o in opts + for s, i in run_args + for suffix in list('abcde') + ] ) - measurements["name"] = benchmark + measurements['name'] = benchmark return measurements def analyze(self, benchmark_measurements): """Analyze whether benchmark fullfills all requirtements.""" - self.log.debug("Analyzing %s", benchmark_measurements["name"]) + self.log.debug('Analyzing %s', benchmark_measurements['name']) for rule in self.requirements: rule(benchmark_measurements) @@ -702,137 +582,93 @@ 
class BenchmarkDoctor(object): def format_name(log_path): """Return the filename and directory for a log file.""" - return "/".join(log_path.split("/")[-2:]) + return '/'.join(log_path.split('/')[-2:]) def compare_logs(compare_script, new_log, old_log, log_dir, opt): """Return diff of log files at paths `new_log` and `old_log`.""" - print("Comparing %s %s ..." % (format_name(old_log), format_name(new_log))) - subprocess.call( - [ - compare_script, - "--old-file", - old_log, - "--new-file", - new_log, - "--format", - "markdown", - "--output", - os.path.join(log_dir, "latest_compare_{0}.md".format(opt)), - ] - ) + print('Comparing %s %s ...' % (format_name(old_log), format_name(new_log))) + subprocess.call([compare_script, '--old-file', old_log, + '--new-file', new_log, '--format', 'markdown', + '--output', os.path.join(log_dir, 'latest_compare_{0}.md' + .format(opt))]) def compare(args): log_dir = args.log_dir compare_script = args.compare_script baseline_branch = args.baseline_branch - current_branch = BenchmarkDriver(args, tests=[""])._git( - "rev-parse --abbrev-ref HEAD" - ) + current_branch = \ + BenchmarkDriver(args, tests=[''])._git('rev-parse --abbrev-ref HEAD') current_branch_dir = os.path.join(log_dir, current_branch) baseline_branch_dir = os.path.join(log_dir, baseline_branch) - if current_branch != baseline_branch and not os.path.isdir(baseline_branch_dir): - print( - ( - "Unable to find benchmark logs for {baseline_branch} branch. " - + "Set a baseline benchmark log by passing --benchmark to " - + "build-script while on {baseline_branch} branch." - ).format(baseline_branch=baseline_branch) - ) + if current_branch != baseline_branch and \ + not os.path.isdir(baseline_branch_dir): + print(('Unable to find benchmark logs for {baseline_branch} branch. 
' + + 'Set a baseline benchmark log by passing --benchmark to ' + + 'build-script while on {baseline_branch} branch.') + .format(baseline_branch=baseline_branch)) return 1 recent_logs = {} for branch_dir in [current_branch_dir, baseline_branch_dir]: - for opt in ["O", "Onone"]: - recent_logs[os.path.basename(branch_dir) + "_" + opt] = sorted( - glob.glob(os.path.join(branch_dir, "Benchmark_" + opt + "-*.log")), - key=os.path.getctime, - reverse=True, - ) + for opt in ['O', 'Onone']: + recent_logs[os.path.basename(branch_dir) + '_' + opt] = sorted( + glob.glob(os.path.join( + branch_dir, 'Benchmark_' + opt + '-*.log')), + key=os.path.getctime, reverse=True) if current_branch == baseline_branch: - if ( - len(recent_logs[baseline_branch + "_O"]) > 1 - and len(recent_logs[baseline_branch + "_Onone"]) > 1 - ): - compare_logs( - compare_script, - recent_logs[baseline_branch + "_O"][0], - recent_logs[baseline_branch + "_O"][1], - log_dir, - "O", - ) - compare_logs( - compare_script, - recent_logs[baseline_branch + "_Onone"][0], - recent_logs[baseline_branch + "_Onone"][1], - log_dir, - "Onone", - ) + if len(recent_logs[baseline_branch + '_O']) > 1 and \ + len(recent_logs[baseline_branch + '_Onone']) > 1: + compare_logs(compare_script, + recent_logs[baseline_branch + '_O'][0], + recent_logs[baseline_branch + '_O'][1], + log_dir, 'O') + compare_logs(compare_script, + recent_logs[baseline_branch + '_Onone'][0], + recent_logs[baseline_branch + '_Onone'][1], + log_dir, 'Onone') else: - print( - ( - "{baseline_branch}/{baseline_branch} comparison " - + "skipped: no previous {baseline_branch} logs" - ).format(baseline_branch=baseline_branch) - ) + print(('{baseline_branch}/{baseline_branch} comparison ' + + 'skipped: no previous {baseline_branch} logs') + .format(baseline_branch=baseline_branch)) else: # TODO: Check for outdated baseline branch log - if ( - len(recent_logs[current_branch + "_O"]) == 0 - or len(recent_logs[current_branch + "_Onone"]) == 0 - ): - print("branch sanity failure: missing branch logs") + if len(recent_logs[current_branch + '_O']) == 0 or \ + len(recent_logs[current_branch + '_Onone']) == 0: + print('branch sanity failure: missing branch logs') return 1 - if ( - len(recent_logs[current_branch + "_O"]) == 1 - or len(recent_logs[current_branch + "_Onone"]) == 1 - ): - print("branch/branch comparison skipped: no previous branch logs") + if len(recent_logs[current_branch + '_O']) == 1 or \ + len(recent_logs[current_branch + '_Onone']) == 1: + print('branch/branch comparison skipped: no previous branch logs') else: - compare_logs( - compare_script, - recent_logs[current_branch + "_O"][0], - recent_logs[current_branch + "_O"][1], - log_dir, - "O", - ) - compare_logs( - compare_script, - recent_logs[current_branch + "_Onone"][0], - recent_logs[current_branch + "_Onone"][1], - log_dir, - "Onone", - ) - - if ( - len(recent_logs[baseline_branch + "_O"]) == 0 - or len(recent_logs[baseline_branch + "_Onone"]) == 0 - ): - print( - ( - "branch/{baseline_branch} failure: no {baseline_branch} " + "logs" - ).format(baseline_branch=baseline_branch) - ) + compare_logs(compare_script, + recent_logs[current_branch + '_O'][0], + recent_logs[current_branch + '_O'][1], + log_dir, 'O') + compare_logs(compare_script, + recent_logs[current_branch + '_Onone'][0], + recent_logs[current_branch + '_Onone'][1], + log_dir, 'Onone') + + if len(recent_logs[baseline_branch + '_O']) == 0 or \ + len(recent_logs[baseline_branch + '_Onone']) == 0: + print(('branch/{baseline_branch} failure: no {baseline_branch} ' + + 
'logs') + .format(baseline_branch=baseline_branch)) return 1 else: - compare_logs( - compare_script, - recent_logs[current_branch + "_O"][0], - recent_logs[baseline_branch + "_O"][0], - log_dir, - "O", - ) - compare_logs( - compare_script, - recent_logs[current_branch + "_Onone"][0], - recent_logs[baseline_branch + "_Onone"][0], - log_dir, - "Onone", - ) + compare_logs(compare_script, + recent_logs[current_branch + '_O'][0], + recent_logs[baseline_branch + '_O'][0], + log_dir, 'O') + compare_logs(compare_script, + recent_logs[current_branch + '_Onone'][0], + recent_logs[baseline_branch + '_Onone'][0], + log_dir, 'Onone') # TODO: Fail on large regressions @@ -850,100 +686,79 @@ def positive_int(value): def parse_args(args): """Parse command line arguments and set default values.""" parser = argparse.ArgumentParser( - epilog="Example: ./Benchmark_Driver run -i 5 -f Prefix -f .*Suffix.*" + epilog='Example: ./Benchmark_Driver run -i 5 -f Prefix -f .*Suffix.*' ) subparsers = parser.add_subparsers( - title="Swift benchmark driver commands", - help="See COMMAND -h for additional arguments", - metavar="COMMAND", - ) + title='Swift benchmark driver commands', + help='See COMMAND -h for additional arguments', metavar='COMMAND') shared_benchmarks_parser = argparse.ArgumentParser(add_help=False) benchmarks_group = shared_benchmarks_parser.add_mutually_exclusive_group() benchmarks_group.add_argument( - "benchmarks", + 'benchmarks', default=[], - help="benchmark to run (default: all)", - nargs="*", - metavar="BENCHMARK", - ) + help='benchmark to run (default: all)', nargs='*', metavar="BENCHMARK") benchmarks_group.add_argument( - "-f", - "--filter", - dest="filters", - action="append", - help="run all tests whose name match regular expression PATTERN, " - + "multiple filters are supported", - metavar="PATTERN", - ) + '-f', '--filter', dest='filters', action='append', + help='run all tests whose name match regular expression PATTERN, ' + + 'multiple filters are supported', metavar="PATTERN") shared_benchmarks_parser.add_argument( - "-t", - "--tests", - help="directory containing Benchmark_O{,none,size} " + "(default: DRIVER_DIR)", - default=DRIVER_DIR, - ) + '-t', '--tests', + help='directory containing Benchmark_O{,none,size} ' + + '(default: DRIVER_DIR)', + default=DRIVER_DIR) shared_benchmarks_parser.add_argument( - "-o", - "--optimization", - metavar="OPT", - choices=["O", "Onone", "Osize"], - help="optimization level to use: {O,Onone,Osize}, (default: O)", - default="O", - ) + '-o', '--optimization', + metavar='OPT', + choices=['O', 'Onone', 'Osize'], + help='optimization level to use: {O,Onone,Osize}, (default: O)', + default='O') run_parser = subparsers.add_parser( - "run", - help="Run benchmarks and output results to stdout", - parents=[shared_benchmarks_parser], - ) + 'run', + help='Run benchmarks and output results to stdout', + parents=[shared_benchmarks_parser]) run_parser.add_argument( - "-i", - "--independent-samples", - help="number of times to run each test (default: 1)", - type=positive_int, - default=1, - ) + '-i', '--independent-samples', + help='number of times to run each test (default: 1)', + type=positive_int, default=1) run_parser.add_argument( - "--output-dir", help="log results to directory (default: no logging)" - ) + '--output-dir', + help='log results to directory (default: no logging)') run_parser.add_argument( - "--swift-repo", help="absolute path to the Swift source repository" - ) + '--swift-repo', + help='absolute path to the Swift source repository') 
run_parser.set_defaults(func=BenchmarkDriver.run_benchmarks) check_parser = subparsers.add_parser( - "check", help="", parents=[shared_benchmarks_parser] - ) + 'check', + help='', + parents=[shared_benchmarks_parser]) check_group = check_parser.add_mutually_exclusive_group() check_group.add_argument( - "-v", - "--verbose", - action="store_true", - help="show more details during benchmark analysis", - ) + '-v', '--verbose', action='store_true', + help='show more details during benchmark analysis') check_group.add_argument( - "-md", "--markdown", action="store_true", help="format report as Markdown table" - ) + '-md', '--markdown', action='store_true', + help='format report as Markdown table') check_parser.set_defaults(func=BenchmarkDoctor.run_check) - compare_parser = subparsers.add_parser("compare", help="Compare benchmark results") + compare_parser = subparsers.add_parser( + 'compare', + help='Compare benchmark results') compare_parser.add_argument( - "--log-dir", required=True, help="directory containing benchmark logs" - ) + '--log-dir', required=True, + help='directory containing benchmark logs') compare_parser.add_argument( - "--swift-repo", - required=True, - help="absolute path to the Swift source repository", - ) + '--swift-repo', required=True, + help='absolute path to the Swift source repository') compare_parser.add_argument( - "--compare-script", required=True, help="absolute path to compare script" - ) + '--compare-script', required=True, + help='absolute path to compare script') compare_parser.add_argument( - "--baseline-branch", - default="master", - help="attempt to compare results to baseline results for specified " - "branch (default: master)", - ) + '--baseline-branch', default='master', + help='attempt to compare results to baseline results for specified ' + 'branch (default: master)') compare_parser.set_defaults(func=compare) return parser.parse_args(args) @@ -955,5 +770,5 @@ def main(): return args.func(args) -if __name__ == "__main__": +if __name__ == '__main__': exit(main()) diff --git a/benchmark/scripts/Benchmark_GuardMalloc.in b/benchmark/scripts/Benchmark_GuardMalloc.in index 872179e1d28de..e7d001d4bfa1d 100644 --- a/benchmark/scripts/Benchmark_GuardMalloc.in +++ b/benchmark/scripts/Benchmark_GuardMalloc.in @@ -21,36 +21,37 @@ sys.path.append("@PATH_TO_DRIVER_LIBRARY@") import perf_test_driver # noqa (E402 module level import not at top of file) # Regexes for the XFAIL_LIST. 
Matches against '([Onone|O|Osize],TestName)' -XFAIL_LIST = [] +XFAIL_LIST = [ +] class GuardMallocResult(perf_test_driver.Result): + def __init__(self, name, status): perf_test_driver.Result.__init__(self, name, status, "", XFAIL_LIST) class GuardMallocBenchmarkDriver(perf_test_driver.BenchmarkDriver): + def __init__(self, binary, xfail_list): perf_test_driver.BenchmarkDriver.__init__( - self, binary, xfail_list, enable_parallel=True - ) + self, binary, xfail_list, + enable_parallel=True) self.new_env = os.environ.copy() - self.new_env["DYLD_INSERT_LIBRARIES"] = "/usr/lib/libgmalloc.dylib" + self.new_env['DYLD_INSERT_LIBRARIES'] = '/usr/lib/libgmalloc.dylib' def prepare_input(self, name): - return {"env": self.new_env} + return {'env': self.new_env} def process_input(self, data): - test_name = "({},{})".format(data["opt"], data["test_name"]) + test_name = '({},{})'.format(data['opt'], data['test_name']) print("Running {}...".format(test_name)) sys.stdout.flush() p = subprocess.Popen( - [data["path"], data["test_name"], "--num-iters=2"], - env=data["env"], - stderr=open("/dev/null", "w"), - stdout=open("/dev/null", "w"), - ) + [data['path'], data['test_name'], '--num-iters=2'], + env=data['env'], stderr=open('/dev/null', 'w'), + stdout=open('/dev/null', 'w')) status = p.wait() return GuardMallocResult(test_name, status) diff --git a/benchmark/scripts/Benchmark_QuickCheck.in b/benchmark/scripts/Benchmark_QuickCheck.in index a2cc257476240..0599d9eb2c8d5 100644 --- a/benchmark/scripts/Benchmark_QuickCheck.in +++ b/benchmark/scripts/Benchmark_QuickCheck.in @@ -12,6 +12,7 @@ # # ===---------------------------------------------------------------------===// +import json import os import subprocess import sys @@ -22,48 +23,47 @@ import perf_test_driver # noqa (E402 module level import not at top of file) # This is a hacked up XFAIL list. It should really be a json file, but it will # work for now. Add in the exact name of the pass to XFAIL. -XFAIL_LIST = [] +XFAIL_LIST = [ +] class QuickCheckResult(perf_test_driver.Result): + def __init__(self, name, success): - assert isinstance(success, bool) + assert(isinstance(success, bool)) did_fail = not success perf_test_driver.Result.__init__(self, name, did_fail, "", XFAIL_LIST) def print_data(self, max_test_len): - fmt = "{:<%d}{:<10}" % (max_test_len + 5) + fmt = '{:<%d}{:<10}' % (max_test_len + 5) print(fmt.format(self.get_name(), self.get_result())) class QuickCheckBenchmarkDriver(perf_test_driver.BenchmarkDriver): + def __init__(self, binary, xfail_list, num_iters, opt_levels): perf_test_driver.BenchmarkDriver.__init__( - self, binary, xfail_list, enable_parallel=True, opt_levels=opt_levels - ) + self, binary, xfail_list, + enable_parallel=True, + opt_levels=opt_levels) self.num_iters = num_iters def print_data_header(self, max_test_len): - fmt = "{:<%d}{:<10}" % (max_test_len + 5) - print(fmt.format("Name", "Result")) + fmt = '{:<%d}{:<10}' % (max_test_len + 5) + print(fmt.format('Name', 'Result')) # Propagate any data from this class that is needed for individual # tests. The reason this is needed is to avoid issues with attempting to # access a value in a different process. 
def prepare_input(self, name): - return {"num_samples": 1, "num_iters": self.num_iters} + return {'num_samples': 1, 'num_iters': self.num_iters} def run_test_inner(self, data, num_iters): - p = subprocess.Popen( - [ - data["path"], - "--num-samples={}".format(data["num_samples"]), - "--num-iters={}".format(num_iters), - data["test_name"], - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) + p = subprocess.Popen([ + data['path'], + "--num-samples={}".format(data['num_samples']), + "--num-iters={}".format(num_iters), data['test_name']], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) error_out = p.communicate()[1].split("\n") result = p.returncode if result is None: @@ -75,21 +75,20 @@ class QuickCheckBenchmarkDriver(perf_test_driver.BenchmarkDriver): def run_test(self, data, num_iters): try: args = [data, num_iters] - perf_test_driver.run_with_timeout(self.run_test_inner, args) + result = perf_test_driver.run_with_timeout(self.run_test_inner, + args) except Exception, e: - sys.stderr.write( - "Child Process Failed! (%s,%s). Error: %s\n" - % (data["path"], data["test_name"], e) - ) + sys.stderr.write("Child Process Failed! (%s,%s). Error: %s\n" % ( + data['path'], data['test_name'], e)) sys.stderr.flush() return None return True def process_input(self, data): - test_name = "({},{})".format(data["opt"], data["test_name"]) + test_name = '({},{})'.format(data['opt'], data['test_name']) print("Running {}...".format(test_name)) sys.stdout.flush() - if self.run_test(data, data["num_iters"]) is None: + if self.run_test(data, data['num_iters']) is None: return QuickCheckResult(test_name, success=False) return QuickCheckResult(test_name, success=True) @@ -99,17 +98,13 @@ SWIFT_BIN_DIR = os.path.dirname(os.path.abspath(__file__)) def parse_args(): import argparse - parser = argparse.ArgumentParser() parser.add_argument( - "--filter", - type=str, - default=None, - help="Filter out any test that does not match the given regex", - ) - parser.add_argument("--num-iters", type=int, default=2) + '--filter', type=str, default=None, + help='Filter out any test that does not match the given regex') + parser.add_argument('--num-iters', type=int, default=2) default_opt_levels = perf_test_driver.BenchmarkDriver_OptLevels - parser.add_argument("--opt-level", choices=default_opt_levels) + parser.add_argument('--opt-level', choices=default_opt_levels) return parser.parse_args() @@ -118,10 +113,9 @@ if __name__ == "__main__": opt_levels = perf_test_driver.BenchmarkDriver_OptLevels if args.opt_level is not None: opt_levels = [args.opt_level] - driver = QuickCheckBenchmarkDriver( - SWIFT_BIN_DIR, XFAIL_LIST, args.num_iters, opt_levels - ) - if driver.run(args.filter): + l = QuickCheckBenchmarkDriver(SWIFT_BIN_DIR, XFAIL_LIST, args.num_iters, + opt_levels) + if l.run(args.filter): sys.exit(0) else: sys.exit(-1) diff --git a/benchmark/scripts/Benchmark_RuntimeLeaksRunner.in b/benchmark/scripts/Benchmark_RuntimeLeaksRunner.in index 756af2348c6b5..2a7dd0d81d986 100644 --- a/benchmark/scripts/Benchmark_RuntimeLeaksRunner.in +++ b/benchmark/scripts/Benchmark_RuntimeLeaksRunner.in @@ -23,26 +23,26 @@ import perf_test_driver # noqa (E402 module level import not at top of file) # This is a hacked up XFAIL list. It should really be a json file, but it will # work for now. Add in the exact name of the pass to XFAIL. -XFAIL_LIST = [] +XFAIL_LIST = [ +] # Global Objective-C classes created by various frameworks. We do not care # about these. 
-IGNORABLE_GLOBAL_OBJC_CLASSES = set( - [ - "__NSPlaceholderDate", - "NSCache", - "__NSPlaceholderTimeZone", - "NSPlaceholderNumber", - "NSPlaceholderString", - "__NSPlaceholderArray", - "__NSPlaceholderDictionary", - "_NSPlaceholderData", - "_NSJSONReader", - ] -) +IGNORABLE_GLOBAL_OBJC_CLASSES = set([ + '__NSPlaceholderDate', + 'NSCache', + '__NSPlaceholderTimeZone', + 'NSPlaceholderNumber', + 'NSPlaceholderString', + '__NSPlaceholderArray', + '__NSPlaceholderDictionary', + '_NSPlaceholderData', + '_NSJSONReader' +]) class LeaksRunnerResult(perf_test_driver.Result): + def __init__(self, name, count=None): # True = 1, False = 0. # @@ -57,39 +57,36 @@ class LeaksRunnerResult(perf_test_driver.Result): return "N/A" def print_data(self, max_test_len): - fmt = "{:<%d}{:<10}{:}" % (max_test_len + 5) - print(fmt.format(self.get_name(), self.get_result(), self.get_count())) + fmt = '{:<%d}{:<10}{:}' % (max_test_len + 5) + print(fmt.format(self.get_name(), self.get_result(), + self.get_count())) class LeaksRunnerBenchmarkDriver(perf_test_driver.BenchmarkDriver): + def __init__(self, binary, xfail_list, num_samples, num_iters): perf_test_driver.BenchmarkDriver.__init__( - self, binary, xfail_list, enable_parallel=True - ) + self, binary, xfail_list, + enable_parallel=True) self.num_samples = num_samples self.num_iters = num_iters def print_data_header(self, max_test_len): - fmt = "{:<%d}{:<10}{:}" % (max_test_len + 5) - print(fmt.format("Name", "Result", "RC Delta")) + fmt = '{:<%d}{:<10}{:}' % (max_test_len + 5) + print(fmt.format('Name', 'Result', 'RC Delta')) # Propagate any data from this class that is needed for individual # tests. The reason this is needed is to avoid issues with attempting to # access a value in a different process. def prepare_input(self, name): - return {"num_samples": self.num_samples, "num_iters": self.num_iters} + return {'num_samples': self.num_samples, 'num_iters': self.num_iters} def run_test_inner(self, data, num_iters): - p = subprocess.Popen( - [ - data["path"], - "--num-samples={}".format(data["num_samples"]), - "--num-iters={}".format(num_iters), - data["test_name"], - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) + p = subprocess.Popen([ + data['path'], + "--num-samples={}".format(data['num_samples']), + "--num-iters={}".format(num_iters), data['test_name']], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) error_out = p.communicate()[1].split("\n") result = p.returncode if result is None: @@ -101,12 +98,11 @@ class LeaksRunnerBenchmarkDriver(perf_test_driver.BenchmarkDriver): def run_test(self, data, num_iters): try: args = [data, num_iters] - result = perf_test_driver.run_with_timeout(self.run_test_inner, args) + result = perf_test_driver.run_with_timeout(self.run_test_inner, + args) except Exception, e: - sys.stderr.write( - "Child Process Failed! (%s,%s). Error: %s\n" - % (data["path"], data["test_name"], e) - ) + sys.stderr.write("Child Process Failed! (%s,%s). Error: %s\n" % ( + data['path'], data['test_name'], e)) sys.stderr.flush() return None @@ -114,27 +110,26 @@ class LeaksRunnerBenchmarkDriver(perf_test_driver.BenchmarkDriver): # We grab the second line since swift globals get lazily created in # the first iteration. 
d = json.loads(result[1]) - d["objc_objects"] = [ - x for x in d["objc_objects"] if x not in IGNORABLE_GLOBAL_OBJC_CLASSES - ] - d["objc_count"] = len(d["objc_objects"]) + d['objc_objects'] = [x for x in d['objc_objects'] + if x not in IGNORABLE_GLOBAL_OBJC_CLASSES] + d['objc_count'] = len(d['objc_objects']) - total_count = d["objc_count"] + d["swift_count"] + total_count = d['objc_count'] + d['swift_count'] return total_count except Exception: - tmp = (data["path"], data["test_name"]) + tmp = (data['path'], data['test_name']) sys.stderr.write("Failed parse output! (%s,%s)\n" % tmp) sys.stderr.flush() return None def process_input(self, data): - test_name = "({},{})".format(data["opt"], data["test_name"]) + test_name = '({},{})'.format(data['opt'], data['test_name']) print("Running {}...".format(test_name)) sys.stdout.flush() - total_count1 = self.run_test(data, data["num_iters"]) + total_count1 = self.run_test(data, data['num_iters']) if total_count1 is None: return LeaksRunnerResult(test_name) - total_count2 = self.run_test(data, data["num_iters"] + 1) + total_count2 = self.run_test(data, data['num_iters'] + 1) if total_count2 is None: return LeaksRunnerResult(test_name) return LeaksRunnerResult(test_name, total_count2 - total_count1) @@ -145,24 +140,19 @@ SWIFT_BIN_DIR = os.path.dirname(os.path.abspath(__file__)) def parse_args(): import argparse - parser = argparse.ArgumentParser() parser.add_argument( - "-filter", - type=str, - default=None, - help="Filter out any test that does not match the given regex", - ) - parser.add_argument("-num-samples", type=int, default=2) - parser.add_argument("-num-iters", type=int, default=2) + '-filter', type=str, default=None, + help='Filter out any test that does not match the given regex') + parser.add_argument('-num-samples', type=int, default=2) + parser.add_argument('-num-iters', type=int, default=2) return parser.parse_args() if __name__ == "__main__": args = parse_args() driver = LeaksRunnerBenchmarkDriver( - SWIFT_BIN_DIR, XFAIL_LIST, args.num_samples, args.num_iters - ) + SWIFT_BIN_DIR, XFAIL_LIST, args.num_samples, args.num_iters) if driver.run(args.filter): sys.exit(0) else: diff --git a/benchmark/scripts/build_linux.py b/benchmark/scripts/build_linux.py index 4404815931182..64bee4692bbb1 100755 --- a/benchmark/scripts/build_linux.py +++ b/benchmark/scripts/build_linux.py @@ -7,45 +7,39 @@ def main(): p = argparse.ArgumentParser() - p.add_argument("cmake_path", help="The cmake binary to use") - p.add_argument("swift_src_dir", help="The swift source directory") - p.add_argument("clang", help="The path to the clang binary to use") - p.add_argument( - "swift_root_dir", - help="A path to a swift root produced by installing " - "Swift and Foundation together. We infer swiftc " - "from here", - ) - p.add_argument("destdir", help="The directory to perform the actual " "build in") - p.add_argument( - "--clean", action="store_true", help="Delete destdir before performing a build." - ) + p.add_argument('cmake_path', help='The cmake binary to use') + p.add_argument('swift_src_dir', help='The swift source directory') + p.add_argument('clang', help='The path to the clang binary to use') + p.add_argument('swift_root_dir', + help='A path to a swift root produced by installing ' + 'Swift and Foundation together. 
We infer swiftc ' + 'from here') + p.add_argument('destdir', help='The directory to perform the actual ' + 'build in') + p.add_argument('--clean', action='store_true', + help='Delete destdir before performing a build.') args = p.parse_args() if args.clean: print("Asked to clean... Cleaning!") - subprocess.check_output(["/bin/rm", "-rfv", args.destdir]) - subprocess.check_call(["/bin/mkdir", "-p", args.destdir]) + subprocess.check_output(['/bin/rm', '-rfv', args.destdir]) + subprocess.check_call(['/bin/mkdir', '-p', args.destdir]) os.chdir(args.destdir) configureInvocation = [ - args.cmake_path, - "-GNinja", - "-DSWIFT_EXEC={}/bin/swiftc".format(args.swift_root_dir), - "-DCLANG_EXEC={}".format(args.clang), - "-DSWIFT_LIBRARY_PATH={}/lib/swift".format(args.swift_root_dir), - "{}/benchmark".format(args.swift_src_dir), + args.cmake_path, '-GNinja', + '-DSWIFT_EXEC={}/bin/swiftc'.format(args.swift_root_dir), + '-DCLANG_EXEC={}'.format(args.clang), + '-DSWIFT_LIBRARY_PATH={}/lib/swift'.format(args.swift_root_dir), + '{}/benchmark'.format(args.swift_src_dir) ] - print("COMMAND: {}".format(" ".join(configureInvocation))) + print('COMMAND: {}'.format(' '.join(configureInvocation))) subprocess.check_call(configureInvocation) buildInvocation = [ - args.cmake_path, - "--build", - args.destdir, - "--", - "swift-benchmark-linux-x86_64", + args.cmake_path, '--build', args.destdir, '--', + 'swift-benchmark-linux-x86_64' ] - print("COMMAND: {}".format(" ".join(buildInvocation))) + print('COMMAND: {}'.format(' '.join(buildInvocation))) subprocess.check_call(buildInvocation) diff --git a/benchmark/scripts/build_script_helper.py b/benchmark/scripts/build_script_helper.py index 53bf7b19f6862..a3f999042289f 100755 --- a/benchmark/scripts/build_script_helper.py +++ b/benchmark/scripts/build_script_helper.py @@ -9,54 +9,50 @@ def perform_build(args, swiftbuild_path, config, binary_name, opt_flag): - assert config in ["debug", "release"] - assert binary_name in ["Benchmark_O", "Benchmark_Osize", "Benchmark_Onone"] - assert opt_flag in ["-O", "-Osize", "-Onone"] + assert(config in ['debug', 'release']) + assert(binary_name in ['Benchmark_O', 'Benchmark_Osize', + 'Benchmark_Onone']) + assert(opt_flag in ['-O', '-Osize', '-Onone']) inner_build_dir = os.path.join(args.build_path, binary_name) swiftbuild_args = [ swiftbuild_path, - "--package-path", - args.package_path, - "--build-path", - inner_build_dir, - "--configuration", - config, - "-Xswiftc", - "-Xllvm", - "-Xswiftc", - "-align-module-to-page-size", - "-Xswiftc", - opt_flag, + '--package-path', args.package_path, + '--build-path', inner_build_dir, + '--configuration', config, + '-Xswiftc', '-Xllvm', + '-Xswiftc', '-align-module-to-page-size', + '-Xswiftc', opt_flag, ] if args.verbose: - swiftbuild_args.append("--verbose") + swiftbuild_args.append('--verbose') subprocess.call(swiftbuild_args) # Copy the benchmark file into the final ./bin directory. 
- binpath = os.path.join(inner_build_dir, config, "SwiftBench") - finalpath = os.path.join(args.build_path, "bin", binary_name) + binpath = os.path.join(inner_build_dir, config, 'SwiftBench') + finalpath = os.path.join(args.build_path, 'bin', binary_name) shutil.copy(binpath, finalpath) def main(): parser = argparse.ArgumentParser() - parser.add_argument("--verbose", "-v", action="store_true") - parser.add_argument("--package-path", type=str, required=True) - parser.add_argument("--build-path", type=str, required=True) - parser.add_argument("--toolchain", type=str, required=True) + parser.add_argument('--verbose', '-v', action='store_true') + parser.add_argument('--package-path', type=str, required=True) + parser.add_argument('--build-path', type=str, required=True) + parser.add_argument('--toolchain', type=str, required=True) args = parser.parse_args() # Create our bin directory so we can copy in the binaries. - bin_dir = os.path.join(args.build_path, "bin") + bin_dir = os.path.join(args.build_path, 'bin') if not os.path.isdir(bin_dir): os.makedirs(bin_dir) - swiftbuild_path = os.path.join(args.toolchain, "usr", "bin", "swift-build") - perform_build(args, swiftbuild_path, "debug", "Benchmark_Onone", "-Onone") - perform_build(args, swiftbuild_path, "release", "Benchmark_Osize", "-Osize") - perform_build(args, swiftbuild_path, "release", "Benchmark_O", "-O") + swiftbuild_path = os.path.join(args.toolchain, 'usr', 'bin', 'swift-build') + perform_build(args, swiftbuild_path, 'debug', 'Benchmark_Onone', '-Onone') + perform_build(args, swiftbuild_path, 'release', 'Benchmark_Osize', + '-Osize') + perform_build(args, swiftbuild_path, 'release', 'Benchmark_O', '-O') if __name__ == "__main__": diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 69450cb4b97b5..017ba24c10229 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -37,7 +37,7 @@ class `ReportFormatter` creates the test comparison report in specified format. from math import ceil, sqrt -class Sample(namedtuple("Sample", "i num_iters runtime")): +class Sample(namedtuple('Sample', 'i num_iters runtime')): u"""Single benchmark measurement. Initialized with: @@ -48,10 +48,10 @@ class Sample(namedtuple("Sample", "i num_iters runtime")): def __repr__(self): """Shorter Sample formating for debugging purposes.""" - return "s({0.i!r}, {0.num_iters!r}, {0.runtime!r})".format(self) + return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self) -class Yield(namedtuple("Yield", "before_sample after")): +class Yield(namedtuple('Yield', 'before_sample after')): u"""Meta-measurement of when the Benchmark_X voluntarily yielded process. 
`before_sample`: index of measurement taken just after returning from yield @@ -79,14 +79,13 @@ def __init__(self, name, samples=None): def __str__(self): """Text summary of benchmark statistics.""" return ( - "{0.name!s} n={0.count!r} " - "Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} " - "Max={0.max!r} " - "R={0.range!r} {0.spread:.2%} IQR={0.iqr!r} " - "Mean={0.mean:.0f} SD={0.sd:.0f} CV={0.cv:.2%}".format(self) - if self.samples - else "{0.name!s} n=0".format(self) - ) + '{0.name!s} n={0.count!r} ' + 'Min={0.min!r} Q1={0.q1!r} M={0.median!r} Q3={0.q3!r} ' + 'Max={0.max!r} ' + 'R={0.range!r} {0.spread:.2%} IQR={0.iqr!r} ' + 'Mean={0.mean:.0f} SD={0.sd:.0f} CV={0.cv:.2%}' + .format(self) if self.samples else + '{0.name!s} n=0'.format(self)) def add(self, sample): """Add sample to collection and recompute statistics.""" @@ -98,9 +97,8 @@ def add(self, sample): def _update_stats(self, sample): old_stats = (self.count, self.mean, self.S_runtime) - _, self.mean, self.S_runtime = self.running_mean_variance( - old_stats, sample.runtime - ) + _, self.mean, self.S_runtime = ( + self.running_mean_variance(old_stats, sample.runtime)) def exclude_outliers(self, top_only=False): """Exclude outliers by applying Interquartile Range Rule. @@ -114,11 +112,8 @@ def exclude_outliers(self, top_only=False): benchmark runtimes in the microbenchmark range to filter out the environment noise caused by preemtive multitasking. """ - lo = ( - 0 - if top_only - else bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr)) - ) + lo = (0 if top_only else + bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr))) hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr)) outliers = self.samples[:lo] + self.samples[hi:] @@ -186,7 +181,8 @@ def iqr(self): @property def sd(self): u"""Standard Deviation (μs).""" - return 0 if self.count < 2 else sqrt(self.S_runtime / (self.count - 1)) + return (0 if self.count < 2 else + sqrt(self.S_runtime / (self.count - 1))) @staticmethod def running_mean_variance((k, M_, S_), x): @@ -233,13 +229,14 @@ class PerformanceTestResult(object): `--quantile`parameter. In both cases, the last column, MAX_RSS is optional. """ - def __init__(self, csv_row, quantiles=False, memory=False, delta=False, meta=False): + def __init__(self, csv_row, quantiles=False, memory=False, delta=False, + meta=False): """Initialize from a row of multiple columns with benchmark summary. The row is an iterable, such as a row provided by the CSV parser. 
""" - self.test_num = csv_row[0] # Ordinal number of the test - self.name = csv_row[1] # Name of the performance test + self.test_num = csv_row[0] # Ordinal number of the test + self.name = csv_row[1] # Name of the performance test self.num_samples = int(csv_row[2]) # Number of measurements taken if quantiles: # Variable number of columns representing quantiles @@ -247,63 +244,50 @@ def __init__(self, csv_row, quantiles=False, memory=False, delta=False, meta=Fal runtimes = csv_row[3:mem_index] if memory or meta else csv_row[3:] if delta: runtimes = [int(x) if x else 0 for x in runtimes] - runtimes = reduce( - lambda l, x: l.append(l[-1] + x) or l if l else [x], # runnin - runtimes, - None, - ) # total + runtimes = reduce(lambda l, x: l.append(l[-1] + x) or # runnin + l if l else [x], runtimes, None) # total num_values = len(runtimes) if self.num_samples < num_values: # remove repeated samples quantile = num_values - 1 qs = [float(i) / float(quantile) for i in range(0, num_values)] - indices = [ - max(0, int(ceil(self.num_samples * float(q))) - 1) for q in qs - ] - runtimes = [ - runtimes[indices.index(i)] for i in range(0, self.num_samples) - ] + indices = [max(0, int(ceil(self.num_samples * float(q))) - 1) + for q in qs] + runtimes = [runtimes[indices.index(i)] + for i in range(0, self.num_samples)] self.samples = PerformanceTestSamples( - self.name, [Sample(None, None, int(runtime)) for runtime in runtimes] - ) + self.name, + [Sample(None, None, int(runtime)) for runtime in runtimes]) self.samples.exclude_outliers(top_only=True) sams = self.samples - self.min, self.max, self.median, self.mean, self.sd = ( - sams.min, - sams.max, - sams.median, - sams.mean, - sams.sd, - ) - self.max_rss = ( # Maximum Resident Set Size (B) - int(csv_row[mem_index]) if memory else None - ) + self.min, self.max, self.median, self.mean, self.sd = \ + sams.min, sams.max, sams.median, sams.mean, sams.sd + self.max_rss = ( # Maximum Resident Set Size (B) + int(csv_row[mem_index]) if memory else None) else: # Legacy format with statistics for normal distribution. - self.min = int(csv_row[3]) # Minimum runtime (μs) - self.max = int(csv_row[4]) # Maximum runtime (μs) - self.mean = float(csv_row[5]) # Mean (average) runtime (μs) - self.sd = float(csv_row[6]) # Standard Deviation (μs) - self.median = int(csv_row[7]) # Median runtime (μs) - self.max_rss = ( # Maximum Resident Set Size (B) - int(csv_row[8]) if len(csv_row) > 8 else None - ) + self.min = int(csv_row[3]) # Minimum runtime (μs) + self.max = int(csv_row[4]) # Maximum runtime (μs) + self.mean = float(csv_row[5]) # Mean (average) runtime (μs) + self.sd = float(csv_row[6]) # Standard Deviation (μs) + self.median = int(csv_row[7]) # Median runtime (μs) + self.max_rss = ( # Maximum Resident Set Size (B) + int(csv_row[8]) if len(csv_row) > 8 else None) self.samples = None # Optional measurement metadata. The number of: # memory pages used, involuntary context switches and voluntary yields - self.mem_pages, self.involuntary_cs, self.yield_count = ( + self.mem_pages, self.involuntary_cs, self.yield_count = \ [int(x) for x in csv_row[-3:]] if meta else (None, None, None) - ) self.yields = None self.setup = None def __repr__(self): """Short summary for debugging purposes.""" return ( - "".format(self) - ) + '' + .format(self)) def merge(self, r): """Merge two results. 
@@ -318,13 +302,8 @@ def merge(self, r): map(self.samples.add, r.samples.samples) sams = self.samples self.num_samples = sams.num_samples - self.min, self.max, self.median, self.mean, self.sd = ( - sams.min, - sams.max, - sams.median, - sams.mean, - sams.sd, - ) + self.min, self.max, self.median, self.mean, self.sd = \ + sams.min, sams.max, sams.median, sams.mean, sams.sd else: self.min = min(self.min, r.min) self.max = max(self.max, r.max) @@ -336,8 +315,8 @@ def merge(self, r): # Metadata def minimum(a, b): # work around None being less than everything - return min(filter(lambda x: x is not None, [a, b])) if any([a, b]) else None - + return (min(filter(lambda x: x is not None, [a, b])) if any([a, b]) + else None) self.max_rss = minimum(self.max_rss, r.max_rss) self.setup = minimum(self.setup, r.setup) @@ -360,13 +339,12 @@ def __init__(self, old, new): # Test runtime improvement in % ratio = (new.min + 0.001) / (old.min + 0.001) - self.delta = (ratio - 1) * 100 + self.delta = ((ratio - 1) * 100) # Indication of dubious changes: when result's MIN falls inside the # (MIN, MAX) interval of result they are being compared with. - self.is_dubious = (old.min < new.min and new.min < old.max) or ( - new.min < old.min and old.min < new.max - ) + self.is_dubious = ((old.min < new.min and new.min < old.max) or + (new.min < old.min and old.min < new.max)) class LogParser(object): @@ -393,20 +371,15 @@ def _reset(self): # Parse lines like this # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs) results_re = re.compile( - r"( *\d+[, \t]+[\w.\-\?!]+[, \t]+" - + r"[, \t]+".join([r"\d+"] * 2) # #,TEST - + r"(?:[, \t]+\d*)*)" # at least 2... - ) # ...or more numeric columns + r'( *\d+[, \t]+[\w.\-\?!]+[, \t]+' + # #,TEST + r'[, \t]+'.join([r'\d+'] * 2) + # at least 2... 
+ r'(?:[, \t]+\d*)*)') # ...or more numeric columns def _append_result(self, result): - columns = result.split(",") if "," in result else result.split() + columns = result.split(',') if ',' in result else result.split() r = PerformanceTestResult( - columns, - quantiles=self.quantiles, - memory=self.memory, - delta=self.delta, - meta=self.meta, - ) + columns, quantiles=self.quantiles, memory=self.memory, + delta=self.delta, meta=self.meta) r.setup = self.setup r.max_rss = r.max_rss or self.max_rss r.mem_pages = r.mem_pages or self.mem_pages @@ -424,43 +397,45 @@ def _store_memory_stats(self, max_rss, mem_pages): self.mem_pages = int(mem_pages) def _configure_format(self, header): - self.quantiles = "MEAN" not in header - self.memory = "MAX_RSS" in header - self.meta = "PAGES" in header - self.delta = "𝚫" in header + self.quantiles = 'MEAN' not in header + self.memory = 'MAX_RSS' in header + self.meta = 'PAGES' in header + self.delta = '𝚫' in header # Regular expression and action to take when it matches the parsed line state_actions = { results_re: _append_result, + # Verbose mode adds new productions: # Adaptively determined N; test loop multiple adjusting runtime to ~1s - re.compile(r"\s+Measuring with scale (\d+)."): ( - lambda self, num_iters: setattr(self, "num_iters", num_iters) - ), - re.compile(r"\s+Sample (\d+),(\d+)"): ( - lambda self, i, runtime: self.samples.append( - Sample(int(i), int(self.num_iters), int(runtime)) - ) - ), - re.compile(r"\s+SetUp (\d+)"): ( - lambda self, setup: setattr(self, "setup", int(setup)) - ), - re.compile(r"\s+Yielding after ~(\d+) μs"): ( - lambda self, since_last_yield: self.yields.append( - Yield(len(self.samples), int(since_last_yield)) - ) - ), - re.compile(r"( *#[, \t]+TEST[, \t]+SAMPLES[, \t]+MIN.*)"): _configure_format, + re.compile(r'\s+Measuring with scale (\d+).'): + (lambda self, num_iters: setattr(self, 'num_iters', num_iters)), + + re.compile(r'\s+Sample (\d+),(\d+)'): + (lambda self, i, runtime: + self.samples.append( + Sample(int(i), int(self.num_iters), int(runtime)))), + + re.compile(r'\s+SetUp (\d+)'): + (lambda self, setup: setattr(self, 'setup', int(setup))), + + re.compile(r'\s+Yielding after ~(\d+) μs'): + (lambda self, since_last_yield: + self.yields.append( + Yield(len(self.samples), int(since_last_yield)))), + + re.compile(r'( *#[, \t]+TEST[, \t]+SAMPLES[, \t]+MIN.*)'): + _configure_format, + # Environmental statistics: memory usage and context switches - re.compile( - r"\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)" - ): _store_memory_stats, - re.compile(r"\s+VCS \d+ - \d+ = (\d+)"): ( - lambda self, vcs: setattr(self, "voluntary_cs", int(vcs)) - ), - re.compile(r"\s+ICS \d+ - \d+ = (\d+)"): ( - lambda self, ics: setattr(self, "involuntary_cs", int(ics)) - ), + re.compile(r'\s+MAX_RSS \d+ - \d+ = (\d+) \((\d+) pages\)'): + _store_memory_stats, + + re.compile(r'\s+VCS \d+ - \d+ = (\d+)'): + (lambda self, vcs: setattr(self, 'voluntary_cs', int(vcs))), + + re.compile(r'\s+ICS \d+ - \d+ = (\d+)'): + (lambda self, ics: setattr(self, 'involuntary_cs', int(ics))), } def parse_results(self, lines): @@ -536,10 +511,10 @@ def __init__(self, old_results, new_results, delta_threshold): added_tests = new_tests.difference(old_tests) removed_tests = old_tests.difference(new_tests) - self.added = sorted([new_results[t] for t in added_tests], key=lambda r: r.name) - self.removed = sorted( - [old_results[t] for t in removed_tests], key=lambda r: r.name - ) + self.added = sorted([new_results[t] for t in added_tests], + key=lambda r: r.name) + 
self.removed = sorted([old_results[t] for t in removed_tests], + key=lambda r: r.name) def compare(name): return ResultComparison(old_results[name], new_results[name]) @@ -550,24 +525,19 @@ def partition(l, p): return reduce(lambda x, y: x[not p(y)].append(y) or x, l, ([], [])) decreased, not_decreased = partition( - comparisons, lambda c: c.ratio < (1 - delta_threshold) - ) + comparisons, lambda c: c.ratio < (1 - delta_threshold)) increased, unchanged = partition( - not_decreased, lambda c: c.ratio > (1 + delta_threshold) - ) + not_decreased, lambda c: c.ratio > (1 + delta_threshold)) # sorted partitions names = [c.name for c in comparisons] comparisons = dict(zip(names, comparisons)) - self.decreased = [ - comparisons[c.name] for c in sorted(decreased, key=lambda c: -c.delta) - ] - self.increased = [ - comparisons[c.name] for c in sorted(increased, key=lambda c: c.delta) - ] - self.unchanged = [ - comparisons[c.name] for c in sorted(unchanged, key=lambda c: c.name) - ] + self.decreased = [comparisons[c.name] + for c in sorted(decreased, key=lambda c: -c.delta)] + self.increased = [comparisons[c.name] + for c in sorted(increased, key=lambda c: c.delta)] + self.unchanged = [comparisons[c.name] + for c in sorted(unchanged, key=lambda c: c.name)] class ReportFormatter(object): @@ -579,25 +549,23 @@ class ReportFormatter(object): GitHub), `git` and `html`. """ - def __init__(self, comparator, changes_only, single_table=False): + def __init__(self, comparator, changes_only, + single_table=False): """Initialize with `TestComparator` and names of branches.""" self.comparator = comparator self.changes_only = changes_only self.single_table = single_table - PERFORMANCE_TEST_RESULT_HEADER = ("TEST", "MIN", "MAX", "MEAN", "MAX_RSS") - RESULT_COMPARISON_HEADER = ("TEST", "OLD", "NEW", "DELTA", "RATIO") + PERFORMANCE_TEST_RESULT_HEADER = ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS') + RESULT_COMPARISON_HEADER = ('TEST', 'OLD', 'NEW', 'DELTA', 'RATIO') @staticmethod def header_for(result): """Column labels for header row in results table.""" - return ( - ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER - if isinstance(result, PerformanceTestResult) - else - # isinstance(result, ResultComparison) - ReportFormatter.RESULT_COMPARISON_HEADER - ) + return (ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER + if isinstance(result, PerformanceTestResult) else + # isinstance(result, ResultComparison) + ReportFormatter.RESULT_COMPARISON_HEADER) @staticmethod def values(result): @@ -606,63 +574,53 @@ def values(result): Returns tuple of strings to display in the results table. 
""" return ( - ( - result.name, - str(result.min), - str(result.max), - str(int(result.mean)), - str(result.max_rss) if result.max_rss else "—", - ) - if isinstance(result, PerformanceTestResult) - else + (result.name, + str(result.min), str(result.max), str(int(result.mean)), + str(result.max_rss) if result.max_rss else '—') + if isinstance(result, PerformanceTestResult) else # isinstance(result, ResultComparison) - ( - result.name, - str(result.old.min), - str(result.new.min), - "{0:+.1f}%".format(result.delta), - "{0:.2f}x{1}".format(result.ratio, " (?)" if result.is_dubious else ""), - ) + (result.name, + str(result.old.min), str(result.new.min), + '{0:+.1f}%'.format(result.delta), + '{0:.2f}x{1}'.format(result.ratio, + ' (?)' if result.is_dubious else '')) ) def markdown(self): """Report results of benchmark comparisons in Markdown format.""" return self._formatted_text( - label_formatter=lambda s: ("**" + s + "**"), - COLUMN_SEPARATOR=" | ", - DELIMITER_ROW=([":---"] + ["---:"] * 4), - SEPARATOR="  | | | | \n", + label_formatter=lambda s: ('**' + s + '**'), + COLUMN_SEPARATOR=' | ', + DELIMITER_ROW=([':---'] + ['---:'] * 4), + SEPARATOR='  | | | | \n', SECTION="""
 <details {3}>
   <summary>{0} ({1})</summary>
   {2}
 </details>
-""", - ) +""") def git(self): """Report results of benchmark comparisons in 'git' format.""" return self._formatted_text( label_formatter=lambda s: s.upper(), - COLUMN_SEPARATOR=" ", + COLUMN_SEPARATOR=' ', DELIMITER_ROW=None, - SEPARATOR="\n", + SEPARATOR='\n', SECTION=""" -{0} ({1}): \n{2}""", - ) +{0} ({1}): \n{2}""") def _column_widths(self): changed = self.comparator.decreased + self.comparator.increased - results = changed if self.changes_only else changed + self.comparator.unchanged + results = (changed if self.changes_only else + changed + self.comparator.unchanged) results += self.comparator.added + self.comparator.removed widths = [ - map(len, columns) - for columns in [ - ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER, - ReportFormatter.RESULT_COMPARISON_HEADER, - ] - + [ReportFormatter.values(r) for r in results] + map(len, columns) for columns in + [ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER, + ReportFormatter.RESULT_COMPARISON_HEADER] + + [ReportFormatter.values(r) for r in results] ] def max_widths(maximum, widths): @@ -670,9 +628,8 @@ def max_widths(maximum, widths): return reduce(max_widths, widths, [0] * 5) - def _formatted_text( - self, label_formatter, COLUMN_SEPARATOR, DELIMITER_ROW, SEPARATOR, SECTION - ): + def _formatted_text(self, label_formatter, COLUMN_SEPARATOR, + DELIMITER_ROW, SEPARATOR, SECTION): widths = self._column_widths() self.header_printed = False @@ -680,62 +637,43 @@ def justify_columns(contents): return [c.ljust(w) for w, c in zip(widths, contents)] def row(contents): - return ( - "" - if not contents - else COLUMN_SEPARATOR.join(justify_columns(contents)) + "\n" - ) + return ('' if not contents else + COLUMN_SEPARATOR.join(justify_columns(contents)) + '\n') def header(title, column_labels): - labels = ( - column_labels - if not self.single_table - else map(label_formatter, (title,) + column_labels[1:]) - ) - h = ( - ("" if not self.header_printed else SEPARATOR) - + row(labels) - + (row(DELIMITER_ROW) if not self.header_printed else "") - ) + labels = (column_labels if not self.single_table else + map(label_formatter, (title, ) + column_labels[1:])) + h = (('' if not self.header_printed else SEPARATOR) + + row(labels) + + (row(DELIMITER_ROW) if not self.header_printed else '')) if self.single_table and not self.header_printed: self.header_printed = True return h def format_columns(r, is_strong): - return r if not is_strong else r[:-1] + ("**" + r[-1] + "**",) + return (r if not is_strong else + r[:-1] + ('**' + r[-1] + '**', )) def table(title, results, is_strong=False, is_open=False): if not results: - return "" - rows = [ - row(format_columns(ReportFormatter.values(r), is_strong)) - for r in results - ] - table = header( - title if self.single_table else "", - ReportFormatter.header_for(results[0]), - ) + "".join(rows) - return ( - table - if self.single_table - else SECTION.format( - title, len(results), table, "open" if is_open else "" - ) - ) - - return "\n" + "".join( - [ - table("Regression", self.comparator.decreased, True, True), - table("Improvement", self.comparator.increased, True), - ( - "" - if self.changes_only - else table("No Changes", self.comparator.unchanged) - ), - table("Added", self.comparator.added, is_open=True), - table("Removed", self.comparator.removed, is_open=True), - ] - ) + return '' + rows = [row(format_columns(ReportFormatter.values(r), is_strong)) + for r in results] + table = (header(title if self.single_table else '', + ReportFormatter.header_for(results[0])) + + ''.join(rows)) + return (table if 
self.single_table else + SECTION.format( + title, len(results), table, 'open' if is_open else '')) + + return '\n' + ''.join([ + table('Regression', self.comparator.decreased, True, True), + table('Improvement', self.comparator.increased, True), + ('' if self.changes_only else + table('No Changes', self.comparator.unchanged)), + table('Added', self.comparator.added, is_open=True), + table('Removed', self.comparator.removed, is_open=True) + ]) HTML = """ @@ -784,90 +722,68 @@ def table(title, results, is_strong=False, is_open=False): def html(self): """Report results of benchmark comparisons in HTML format.""" - def row(name, old, new, delta, speedup, speedup_color): - return self.HTML_ROW.format(name, old, new, delta, speedup_color, speedup) + return self.HTML_ROW.format( + name, old, new, delta, speedup_color, speedup) def header(contents): - return self.HTML_HEADER_ROW.format(*contents) + return self.HTML_HEADER_ROW.format(* contents) def table(title, results, speedup_color): rows = [ - row(*(ReportFormatter.values(r) + (speedup_color,))) for r in results + row(*(ReportFormatter.values(r) + (speedup_color,))) + for r in results ] - return ( - "" - if not rows - else header( - (title, len(results)) + ReportFormatter.header_for(results[0])[1:] - ) - + "".join(rows) - ) + return ('' if not rows else + header((title, len(results)) + + ReportFormatter.header_for(results[0])[1:]) + + ''.join(rows)) return self.HTML.format( - "".join( - [ - table("Regression", self.comparator.decreased, "red"), - table("Improvement", self.comparator.increased, "green"), - ( - "" - if self.changes_only - else table("No Changes", self.comparator.unchanged, "black") - ), - table("Added", self.comparator.added, ""), - table("Removed", self.comparator.removed, ""), - ] - ) - ) + ''.join([ + table('Regression', self.comparator.decreased, 'red'), + table('Improvement', self.comparator.increased, 'green'), + ('' if self.changes_only else + table('No Changes', self.comparator.unchanged, 'black')), + table('Added', self.comparator.added, ''), + table('Removed', self.comparator.removed, '') + ])) def parse_args(args): """Parse command line arguments and set default values.""" - parser = argparse.ArgumentParser(description="Compare Performance tests.") - parser.add_argument( - "--old-file", help="Baseline performance test suite (csv file)", required=True - ) - parser.add_argument( - "--new-file", help="New performance test suite (csv file)", required=True - ) - parser.add_argument( - "--format", - choices=["markdown", "git", "html"], - help="Output format. Default is markdown.", - default="markdown", - ) - parser.add_argument("--output", help="Output file name") - parser.add_argument( - "--changes-only", help="Output only affected tests", action="store_true" - ) - parser.add_argument( - "--single-table", - help="Combine data in a single table in git and markdown formats", - action="store_true", - ) + parser = argparse.ArgumentParser(description='Compare Performance tests.') + parser.add_argument('--old-file', + help='Baseline performance test suite (csv file)', + required=True) + parser.add_argument('--new-file', + help='New performance test suite (csv file)', + required=True) + parser.add_argument('--format', + choices=['markdown', 'git', 'html'], + help='Output format. 
Default is markdown.', + default="markdown") + parser.add_argument('--output', help='Output file name') + parser.add_argument('--changes-only', + help='Output only affected tests', action='store_true') parser.add_argument( - "--delta-threshold", - help="Delta threshold. Default 0.05.", - type=float, - default=0.05, - ) + '--single-table', + help='Combine data in a single table in git and markdown formats', + action='store_true') + parser.add_argument('--delta-threshold', + help='Delta threshold. Default 0.05.', + type=float, default=0.05) return parser.parse_args(args) -def create_report( - old_results, - new_results, - delta_threshold, - format, - changes_only=True, - single_table=True, -): +def create_report(old_results, new_results, delta_threshold, format, + changes_only=True, single_table=True): comparator = TestComparator(old_results, new_results, delta_threshold) formatter = ReportFormatter(comparator, changes_only, single_table) formats = { - "markdown": formatter.markdown, - "git": formatter.git, - "html": formatter.html, + 'markdown': formatter.markdown, + 'git': formatter.git, + 'html': formatter.html } report = formats[format]() @@ -877,20 +793,16 @@ def create_report( def main(): """Compare benchmarks for changes in a formatted report.""" args = parse_args(sys.argv[1:]) - report = create_report( - LogParser.results_from_file(args.old_file), - LogParser.results_from_file(args.new_file), - args.delta_threshold, - args.format, - args.changes_only, - args.single_table, - ) + report = create_report(LogParser.results_from_file(args.old_file), + LogParser.results_from_file(args.new_file), + args.delta_threshold, args.format, + args.changes_only, args.single_table) print(report) if args.output: - with open(args.output, "w") as f: + with open(args.output, 'w') as f: f.write(report) -if __name__ == "__main__": +if __name__ == '__main__': sys.exit(main()) diff --git a/benchmark/scripts/create_benchmark.py b/benchmark/scripts/create_benchmark.py index cccaae23c76bd..2e2a4786752ae 100755 --- a/benchmark/scripts/create_benchmark.py +++ b/benchmark/scripts/create_benchmark.py @@ -7,7 +7,7 @@ def main(): p = argparse.ArgumentParser() - p.add_argument("name", help="The name of the new benchmark to be created") + p.add_argument('name', help='The name of the new benchmark to be created') args = p.parse_args() # adds benchmark to `CMakeLists.txt` @@ -24,19 +24,19 @@ def update_cmakelists(name): """Adds a new entry to the `CMakeLists.txt` file with the given benchmark name. """ - relative_path = create_relative_path("../CMakeLists.txt") + relative_path = create_relative_path('../CMakeLists.txt') file_contents = [] - with open(relative_path, "r") as f: + with open(relative_path, 'r') as f: file_contents = f.readlines() file_new_contents = insert_line_alphabetically( name, - " single-source/" + name + "\n", + ' single-source/' + name + '\n', file_contents, - r" single-source\/([a-zA-Z]+)", + r" single-source\/([a-zA-Z]+)" ) - with open(relative_path, "w") as f: + with open(relative_path, 'w') as f: for line in file_new_contents: f.write(line) @@ -46,17 +46,17 @@ def create_benchmark_file(name): and places it in the `single-source` directory. 
""" - template_path = create_relative_path("Template.swift") - benchmark_template = "" - with open(template_path, "r") as f: - benchmark_template = "".join(f.readlines()) + template_path = create_relative_path('Template.swift') + benchmark_template = '' + with open(template_path, 'r') as f: + benchmark_template = ''.join(f.readlines()) # fill in template with benchmark name. formatted_template = benchmark_template.format(name=name) - relative_path = create_relative_path("../single-source/") - source_file_path = os.path.join(relative_path, name + ".swift") - with open(source_file_path, "w") as f: + relative_path = create_relative_path('../single-source/') + source_file_path = os.path.join(relative_path, name + '.swift') + with open(source_file_path, 'w') as f: f.write(formatted_template) @@ -64,14 +64,14 @@ def add_import_benchmark(name): """Adds an `import` statement to the `main.swift` file for the new benchmark. """ - relative_path = create_relative_path("../utils/main.swift") + relative_path = create_relative_path('../utils/main.swift') # read current contents into an array file_contents = [] - with open(relative_path, "r") as f: + with open(relative_path, 'r') as f: file_contents = f.readlines() - # the test dependencies are placed before all benchmarks, so we have to + # the test dependencies are placed before all benchmarks, so we have to # insert the benchmark in the right alphabetical order after we have seen # all test dependencies. read_test_dependencies = False @@ -82,27 +82,23 @@ def add_import_benchmark(name): match = re.search(r"import ([a-zA-Z]+)", line) if match and match.group(1): benchmark_name = match.group(1) - # find where to insert the new benchmark in the right alphabetical + # find where to insert the new benchmark in the right alphabetical # order. - if ( - name < benchmark_name - and previous_benchmark_name is None - or name < benchmark_name - and name > previous_benchmark_name - ): + if (name < benchmark_name and previous_benchmark_name is None or + name < benchmark_name and name > previous_benchmark_name): if read_test_dependencies: - file_new_contents.append("import " + name + "\n" + line) + file_new_contents.append('import ' + name + '\n' + line) else: - # all test dependencies are first specified, so from now + # all test dependencies are first specified, so from now # on we can look where to insert the new benchmark. read_test_dependencies = True file_new_contents.append(line) else: - file_new_contents.append(line) + file_new_contents.append(line) previous_benchmark_name = benchmark_name else: file_new_contents.append(line) - with open(relative_path, "w") as f: + with open(relative_path, 'w') as f: for line in file_new_contents: f.write(line) @@ -111,19 +107,19 @@ def add_register_benchmark(name): """Adds an `import` statement to the `main.swift` file for the new benchmark. 
""" - relative_path = create_relative_path("../utils/main.swift") + relative_path = create_relative_path('../utils/main.swift') file_contents = [] - with open(relative_path, "r") as f: + with open(relative_path, 'r') as f: file_contents = f.readlines() file_new_contents = insert_line_alphabetically( name, - "registerBenchmark(" + name + ")\n", - file_contents, - r"registerBenchmark\(([a-zA-Z]+)\)", + 'registerBenchmark(' + name + ')\n', + file_contents, + r"registerBenchmark\(([a-zA-Z]+)\)" ) - with open(relative_path, "w") as f: + with open(relative_path, 'w') as f: for line in file_new_contents: f.write(line) @@ -133,7 +129,7 @@ def insert_line_alphabetically(name, new_line, lines, regex): find where the new benchmark should be inserted with the given `new_line`. """ # the name of the previous seen benchmark in order to insert the new - # one at the correct position + # one at the correct position previous_benchmark_name = None # the new contents of the file updated_lines = [] @@ -144,15 +140,11 @@ def insert_line_alphabetically(name, new_line, lines, regex): benchmark_name = match.group(1) # check if we're at the line where we have to insert the new # benchmark in the correct alphabetical order - if ( - name < benchmark_name - and previous_benchmark_name is None - or name < benchmark_name - and name > previous_benchmark_name - ): + if (name < benchmark_name and previous_benchmark_name is None or + name < benchmark_name and name > previous_benchmark_name): updated_lines.append(new_line + line) else: - updated_lines.append(line) + updated_lines.append(line) previous_benchmark_name = benchmark_name else: updated_lines.append(line) diff --git a/benchmark/scripts/generate_harness/generate_harness.py b/benchmark/scripts/generate_harness/generate_harness.py index c5c6f87242133..6e4bc0f815c5e 100755 --- a/benchmark/scripts/generate_harness/generate_harness.py +++ b/benchmark/scripts/generate_harness/generate_harness.py @@ -21,12 +21,12 @@ import subprocess script_dir = os.path.dirname(os.path.realpath(__file__)) -perf_dir = os.path.realpath(os.path.join(script_dir, "../..")) -gyb = os.path.realpath(os.path.join(perf_dir, "../utils/gyb")) +perf_dir = os.path.realpath(os.path.join(script_dir, '../..')) +gyb = os.path.realpath(os.path.join(perf_dir, '../utils/gyb')) parser = argparse.ArgumentParser() -parser.add_argument( - "--output-dir", help="Output directory (for validation test)", default=perf_dir -) +parser.add_argument("--output-dir", + help="Output directory (for validation test)", + default=perf_dir) args = parser.parse_args() output_dir = args.output_dir @@ -35,8 +35,7 @@ def all_files(directory, extension): # matching: [directory]/**/*[extension] return [ os.path.join(root, f) for root, _, files in os.walk(directory) - for f in files - if f.endswith(extension) + for f in files if f.endswith(extension) ] @@ -47,13 +46,13 @@ def will_write(filename): # ensure path to file exists before writing os.makedirs(output_path) -if __name__ == "__main__": +if __name__ == '__main__': # Generate Your Boilerplate # Make sure longer paths are done first as CMakeLists.txt and main.swift # depend on the other gybs being generated first. 
- gyb_files = sorted(all_files(perf_dir, ".gyb"), key=len, reverse=True) + gyb_files = sorted(all_files(perf_dir, '.gyb'), key=len, reverse=True) for f in gyb_files: relative_path = os.path.relpath(f[:-4], perf_dir) out_file = os.path.join(output_dir, relative_path) will_write(out_file) - subprocess.call([gyb, "--line-directive", "", "-o", out_file, f]) + subprocess.call([gyb, '--line-directive', '', '-o', out_file, f]) diff --git a/benchmark/scripts/perf_test_driver/perf_test_driver.py b/benchmark/scripts/perf_test_driver/perf_test_driver.py index 7f8929f771764..449059b031d74 100644 --- a/benchmark/scripts/perf_test_driver/perf_test_driver.py +++ b/benchmark/scripts/perf_test_driver/perf_test_driver.py @@ -21,27 +21,29 @@ import subprocess -BENCHMARK_OUTPUT_RE = re.compile(r"\d+,([^,]+)") +BENCHMARK_OUTPUT_RE = re.compile(r'\d+,([^,]+)') class Result(object): + def __init__(self, name, status, output, xfail_list): self.name = name self.status = status self.output = output - self.is_xfailed = any((re.match(x, self.name) is not None for x in xfail_list)) + self.is_xfailed = any( + (re.match(x, self.name) is not None for x in xfail_list)) def is_failure(self): - return self.get_result() in ["FAIL", "XPASS"] + return self.get_result() in ['FAIL', 'XPASS'] def get_result(self): if self.is_xfailed: if self.status: - return "XFAIL" - return "XPASS" + return 'XFAIL' + return 'XPASS' if self.status: - return "FAIL" - return "PASS" + return 'FAIL' + return 'PASS' def get_name(self): return self.name @@ -51,7 +53,7 @@ def merge_in_extra_data(self, d): return d def print_data(self, max_test_len): - fmt = "{:<%d}{:}" % (max_test_len + 5) + fmt = '{:<%d}{:}' % (max_test_len + 5) print(fmt.format(self.get_name(), self.get_result())) @@ -63,44 +65,36 @@ def run_with_timeout(func, args): # we update to use python >= 3.3, use the timeout API on communicate # instead. import multiprocessing.dummy - fakeThreadPool = multiprocessing.dummy.Pool(1) try: result = fakeThreadPool.apply_async(func, args=args) return result.get(timeout_seconds) except multiprocessing.TimeoutError: fakeThreadPool.terminate() - raise RuntimeError( - "Child process aborted due to timeout. " - "Timeout: %s seconds" % timeout_seconds - ) + raise RuntimeError("Child process aborted due to timeout. 
" + "Timeout: %s seconds" % timeout_seconds) def _unwrap_self(args): return type(args[0]).process_input(*args) -BenchmarkDriver_OptLevels = ["Onone", "O", "Osize"] +BenchmarkDriver_OptLevels = ['Onone', 'O', 'Osize'] class BenchmarkDriver(object): - def __init__( - self, - binary_dir, - xfail_list, - enable_parallel=False, - opt_levels=BenchmarkDriver_OptLevels, - ): - self.targets = [ - (os.path.join(binary_dir, "Benchmark_%s" % o), o) for o in opt_levels - ] + + def __init__(self, binary_dir, xfail_list, enable_parallel=False, + opt_levels=BenchmarkDriver_OptLevels): + self.targets = [(os.path.join(binary_dir, 'Benchmark_%s' % o), o) + for o in opt_levels] self.xfail_list = xfail_list self.enable_parallel = enable_parallel self.data = None def print_data_header(self, max_test_len): - fmt = "{:<%d}{:}" % (max_test_len + 5) - print(fmt.format("Name", "Result")) + fmt = '{:<%d}{:}' % (max_test_len + 5) + print(fmt.format('Name', 'Result')) def prepare_input(self, name, opt_level): raise RuntimeError("Abstract method") @@ -121,7 +115,7 @@ def run_for_opt_level(self, binary, opt_level, test_filter): names = [n for n in names if regex.match(n)] def prepare_input_wrapper(name): - x = {"opt": opt_level, "path": binary, "test_name": name} + x = {'opt': opt_level, 'path': binary, 'test_name': name} x.update(self.prepare_input(name)) return x @@ -135,31 +129,33 @@ def prepare_input_wrapper(name): results = map(self.process_input, prepared_input) def reduce_results(acc, r): - acc["result"].append(r) - acc["has_failure"] = acc["has_failure"] or r.is_failure() - acc["max_test_len"] = max(acc["max_test_len"], len(r.get_name())) - acc["extra_data"] = r.merge_in_extra_data(acc["extra_data"]) + acc['result'].append(r) + acc['has_failure'] = acc['has_failure'] or r.is_failure() + acc['max_test_len'] = max(acc['max_test_len'], len(r.get_name())) + acc['extra_data'] = r.merge_in_extra_data(acc['extra_data']) return acc - return functools.reduce( - reduce_results, - results, - {"result": [], "has_failure": False, "max_test_len": 0, "extra_data": {}}, - ) + return functools.reduce(reduce_results, results, { + 'result': [], + 'has_failure': False, + 'max_test_len': 0, + 'extra_data': {} + }) def print_data(self, data, max_test_len): print("Results:") self.print_data_header(max_test_len) for d in data: - for r in d["result"]: + for r in d['result']: r.print_data(max_test_len) def run(self, test_filter=None): self.data = [ self.run_for_opt_level(binary, opt_level, test_filter) - for binary, opt_level in self.targets - ] - max_test_len = functools.reduce(max, [d["max_test_len"] for d in self.data]) - has_failure = functools.reduce(max, [d["has_failure"] for d in self.data]) + for binary, opt_level in self.targets] + max_test_len = functools.reduce(max, + [d['max_test_len'] for d in self.data]) + has_failure = functools.reduce(max, + [d['has_failure'] for d in self.data]) self.print_data(self.data, max_test_len) return not has_failure diff --git a/benchmark/scripts/run_smoke_bench b/benchmark/scripts/run_smoke_bench index 0facbe7b344f5..f478c7e95a869 100755 --- a/benchmark/scripts/run_smoke_bench +++ b/benchmark/scripts/run_smoke_bench @@ -28,15 +28,14 @@ import glob import os import subprocess import sys -from imp import load_source from compare_perf_tests import LogParser, TestComparator, create_report +from imp import load_source # import Benchmark_Driver # doesn't work because it misses '.py' extension Benchmark_Driver = load_source( - "Benchmark_Driver", - 
os.path.join(os.path.dirname(os.path.abspath(__file__)), "Benchmark_Driver"), -) + 'Benchmark_Driver', os.path.join(os.path.dirname( + os.path.abspath(__file__)), 'Benchmark_Driver')) # from Benchmark_Driver import BenchmarkDriver, BenchmarkDoctor, ... BenchmarkDriver = Benchmark_Driver.BenchmarkDriver BenchmarkDoctor = Benchmark_Driver.BenchmarkDoctor @@ -47,12 +46,11 @@ VERBOSE = False class DriverArgs(object): """Arguments for BenchmarkDriver.""" - - def __init__(self, tests, optimization="O"): + def __init__(self, tests, optimization='O'): """Initialize with path to the build-dir and optimization level.""" self.benchmarks = None self.filters = None - self.tests = os.path.join(tests, "bin") + self.tests = os.path.join(tests, 'bin') self.optimization = optimization @@ -65,76 +63,49 @@ def main(): global VERBOSE argparser = argparse.ArgumentParser() argparser.add_argument( - "-verbose", action="store_true", help="print verbose messages" - ) + '-verbose', action='store_true', + help='print verbose messages') argparser.add_argument( - "-O", - action="append_const", - const="O", - dest="opt_levels", - help="test -O benchmarks", - ) + '-O', action='append_const', const='O', dest='opt_levels', + help='test -O benchmarks') argparser.add_argument( - "-Osize", - action="append_const", - const="Osize", - dest="opt_levels", - help="test -Osize benchmarks", - ) + '-Osize', action='append_const', const='Osize', dest='opt_levels', + help='test -Osize benchmarks') argparser.add_argument( - "-Onone", - action="append_const", - const="Onone", - dest="opt_levels", - help="test -Onone benchmarks (except code size)", - ) + '-Onone', action='append_const', const='Onone', dest='opt_levels', + help='test -Onone benchmarks (except code size)') argparser.add_argument( - "-skip-code-size", - action="store_true", - help="Don't report code size differences", - ) + '-skip-code-size', action='store_true', + help="Don't report code size differences") argparser.add_argument( - "-skip-performance", - action="store_true", - help="Don't report performance differences", - ) + '-skip-performance', action='store_true', + help="Don't report performance differences") argparser.add_argument( - "-skip-check-added", - action="store_true", - help="Don't validate newly added benchmarks", - ) + '-skip-check-added', action='store_true', + help="Don't validate newly added benchmarks") argparser.add_argument( - "-o", - type=str, - help="In addition to stdout, write the results into a markdown file", - ) + '-o', type=str, + help='In addition to stdout, write the results into a markdown file') argparser.add_argument( - "-threshold", - type=float, - help="The performance threshold in %% which triggers a re-run", - default=5, - ) + '-threshold', type=float, + help='The performance threshold in %% which triggers a re-run', + default=5) argparser.add_argument( - "-num-samples", - type=int, - help="The (minimum) number of samples to run", - default=3, - ) + '-num-samples', type=int, + help='The (minimum) number of samples to run', default=3) argparser.add_argument( - "-num-reruns", - type=int, + '-num-reruns', type=int, help="The number of re-runs until it's assumed to be a real change", - default=8, - ) + default=8) argparser.add_argument( - "-platform", type=str, help="The benchmark build platform", default="macosx" - ) + '-platform', type=str, + help='The benchmark build platform', default='macosx') argparser.add_argument( - "oldbuilddir", nargs=1, type=str, help="old benchmark build directory" - ) + 'oldbuilddir', nargs=1, type=str, + 
help='old benchmark build directory') argparser.add_argument( - "newbuilddir", nargs=1, type=str, help="new benchmark build directory" - ) + 'newbuilddir', nargs=1, type=str, + help='new benchmark build directory') args = argparser.parse_args() VERBOSE = args.verbose @@ -144,42 +115,29 @@ def main(): def test_opt_levels(args): output_file = None if args.o: - output_file = open(args.o, "w") + output_file = open(args.o, 'w') changes = False - for opt_level in args.opt_levels or ["O", "Osize", "Onone"]: - log("####### Testing optimization level -" + opt_level + " #######") + for opt_level in args.opt_levels or ['O', 'Osize', 'Onone']: + log('####### Testing optimization level -' + opt_level + ' #######') if not args.skip_performance: - if test_performance( - opt_level, - args.oldbuilddir[0], - args.newbuilddir[0], - float(args.threshold) / 100, - args.num_samples, - args.num_reruns, - output_file, - ): + if test_performance(opt_level, args.oldbuilddir[0], + args.newbuilddir[0], + float(args.threshold) / 100, args.num_samples, + args.num_reruns, output_file): changes = True # There is no point in reporting code size for Onone. - if not args.skip_code_size and opt_level != "Onone": - if report_code_size( - opt_level, - args.oldbuilddir[0], - args.newbuilddir[0], - args.platform, - output_file, - ): + if not args.skip_code_size and opt_level != 'Onone': + if report_code_size(opt_level, args.oldbuilddir[0], + args.newbuilddir[0], + args.platform, output_file): changes = True if not args.skip_code_size: - if report_code_size( - "swiftlibs", - args.oldbuilddir[0], - args.newbuilddir[0], - args.platform, - output_file, - ): + if report_code_size('swiftlibs', args.oldbuilddir[0], + args.newbuilddir[0], + args.platform, output_file): changes = True if not args.skip_check_added: @@ -200,14 +158,10 @@ def measure(driver, tests, i): Collect increasing number of samples, depending on the iteration. """ num_samples = min(i + 3, 10) - msg = " Iteration {0} for {1}: num samples = {2}, ".format( - i, driver.args.tests, num_samples - ) - msg += ( - "running all tests" - if driver.all_tests == tests - else "re-testing {0} tests".format(len(tests)) - ) + msg = ' Iteration {0} for {1}: num samples = {2}, '.format( + i, driver.args.tests, num_samples) + msg += ('running all tests' if driver.all_tests == tests else + 're-testing {0} tests'.format(len(tests))) log(msg) driver.tests = tests return driver.run(num_samples=num_samples, sample_time=0.0025) @@ -220,9 +174,8 @@ def merge(results, other_results): return results -def test_performance( - opt_level, old_dir, new_dir, threshold, num_samples, num_reruns, output_file -): +def test_performance(opt_level, old_dir, new_dir, threshold, num_samples, + num_reruns, output_file): """Detect performance changes in benchmarks. 
Start fast with few samples per benchmark and gradually spend more time @@ -230,10 +183,8 @@ def test_performance( """ i, unchanged_length_count = 0, 0 - old, new = [ - BenchmarkDriver(DriverArgs(dir, optimization=opt_level)) - for dir in [old_dir, new_dir] - ] + old, new = [BenchmarkDriver(DriverArgs(dir, optimization=opt_level)) + for dir in [old_dir, new_dir]] results = [measure(driver, driver.tests, i) for driver in [old, new]] tests = TestComparator(results[0], results[1], threshold) changed = tests.decreased + tests.increased @@ -241,11 +192,10 @@ def test_performance( while len(changed) > 0 and unchanged_length_count < num_reruns: i += 1 if VERBOSE: - log(" test again: " + str([test.name for test in changed])) - results = [ - merge(the_results, measure(driver, [test.name for test in changed], i)) - for the_results, driver in zip(results, [old, new]) - ] + log(' test again: ' + str([test.name for test in changed])) + results = [merge(the_results, + measure(driver, [test.name for test in changed], i)) + for the_results, driver in zip(results, [old, new])] tests = TestComparator(results[0], results[1], threshold) changed = tests.decreased + tests.increased @@ -254,19 +204,19 @@ def test_performance( else: unchanged_length_count = 0 - log("") - return report_results( - "Performance: -" + opt_level, None, None, threshold * 1.4, output_file, *results - ) + log('') + return report_results("Performance: -" + opt_level, None, None, + threshold * 1.4, output_file, *results) def report_code_size(opt_level, old_dir, new_dir, platform, output_file): - if opt_level == "swiftlibs": - files = glob.glob(os.path.join(old_dir, "lib", "swift", platform, "*.dylib")) + if opt_level == 'swiftlibs': + files = glob.glob(os.path.join(old_dir, 'lib', 'swift', platform, + '*.dylib')) else: - files = glob.glob( - os.path.join(old_dir, opt_level + "-*" + platform + "*", "*.o") - ) + files = glob.glob(os.path.join(old_dir, + opt_level + '-*' + platform + '*', + '*.o')) idx = 1 old_lines = "" @@ -279,44 +229,37 @@ def report_code_size(opt_level, old_dir, new_dir, platform, output_file): bname = os.path.basename(oldfile) def result_line(value): - v = "," + str(value) - return str(idx) + "," + bname + ",1" + (v * 3) + ",0" + v + "\n" + v = ',' + str(value) + return (str(idx) + ',' + bname + ',1' + (v * 3) + + ',0' + v + '\n') old_lines += result_line(oldsize) new_lines += result_line(newsize) idx += 1 - return report_results( - "Code size: -" + opt_level, old_lines, new_lines, 0.01, output_file - ) + return report_results("Code size: -" + opt_level, + old_lines, new_lines, 0.01, output_file) def get_codesize(filename): - output = subprocess.check_output(["size", filename]).splitlines() + output = subprocess.check_output(['size', filename]).splitlines() header_line = output[0] data_line = output[1] - if header_line.find("__TEXT") != 0: - sys.exit("unexpected output from size command:\n" + output) - return int(data_line.split("\t")[0]) - - -def report_results( - title, - old_lines, - new_lines, - threshold, - output_file, - old_results=None, - new_results=None, -): + if header_line.find('__TEXT') != 0: + sys.exit('unexpected output from size command:\n' + output) + return int(data_line.split('\t')[0]) + + +def report_results(title, old_lines, new_lines, threshold, output_file, + old_results=None, new_results=None): old_results = old_results or LogParser.results_from_string(old_lines) new_results = new_results or LogParser.results_from_string(new_lines) print("------- " + title + " -------") - 
print(create_report(old_results, new_results, threshold, "git")) + print(create_report(old_results, new_results, threshold, 'git')) if output_file: - report = create_report(old_results, new_results, threshold, "markdown") + report = create_report(old_results, new_results, threshold, 'markdown') if report != "": output_file.write("### " + title + "\n") output_file.write(report) @@ -346,19 +289,17 @@ performance team (@eeckstein). Hardware Overview """ - po = subprocess.check_output(["system_profiler", "SPHardwareDataType"]) + po = subprocess.check_output(['system_profiler', 'SPHardwareDataType']) for line in po.splitlines(): - selection = [ - "Model Name", - "Model Identifier", - "Processor Name", - "Processor Speed", - "Number of Processors", - "Total Number of Cores", - "L2 Cache", - "L3 Cache", - "Memory:", - ] + selection = ['Model Name', + 'Model Identifier', + 'Processor Name', + 'Processor Speed', + 'Number of Processors', + 'Total Number of Cores', + 'L2 Cache', + 'L3 Cache', + 'Memory:'] if any(s in line for s in selection): text += line + "\n" @@ -379,5 +320,5 @@ def check_added(args, output_file=None): doctor.check() -if __name__ == "__main__": +if __name__ == '__main__': sys.exit(main()) diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index 570fee82f2f8b..32b1a9e527635 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -26,9 +26,8 @@ # import Benchmark_Driver # doesn't work because it misses '.py' extension Benchmark_Driver = load_source( - "Benchmark_Driver", - os.path.join(os.path.dirname(os.path.abspath(__file__)), "Benchmark_Driver"), -) + 'Benchmark_Driver', os.path.join(os.path.dirname( + os.path.abspath(__file__)), 'Benchmark_Driver')) # from Benchmark_Driver import parse_args parse_args = Benchmark_Driver.parse_args BenchmarkDriver = Benchmark_Driver.BenchmarkDriver @@ -46,122 +45,121 @@ def assert_contains(self, texts, output): def test_requires_command_argument(self): with captured_output() as (_, err): self.assertRaises(SystemExit, parse_args, []) - self.assert_contains(["usage:", "COMMAND", "too few arguments"], err.getvalue()) + self.assert_contains(['usage:', 'COMMAND', 'too few arguments'], + err.getvalue()) def test_command_help_lists_commands(self): with captured_output() as (out, _): - self.assertRaises(SystemExit, parse_args, ["-h"]) - self.assert_contains(["COMMAND", "run", "compare", "check"], out.getvalue()) + self.assertRaises(SystemExit, parse_args, ['-h']) + self.assert_contains(['COMMAND', 'run', 'compare', 'check'], + out.getvalue()) def test_run_benchmarks_by_name_or_ordinal(self): - benchmarks = ["AngryPhonebook", "42"] - self.assertEqual(parse_args(["run"] + benchmarks).benchmarks, benchmarks) + benchmarks = ['AngryPhonebook', '42'] + self.assertEqual( + parse_args(['run'] + benchmarks).benchmarks, benchmarks) def test_run_benchmarks_matching_pattern(self): - regexes = ["Prefix", ".*Suffix.*"] - filters = ["-f", regexes[0], "-f", regexes[1]] - self.assertEqual(parse_args(["run"] + filters).filters, regexes) + regexes = ['Prefix', '.*Suffix.*'] + filters = ['-f', regexes[0], '-f', regexes[1]] + self.assertEqual(parse_args(['run'] + filters).filters, regexes) def test_run_benchmarks_and_filters_are_exclusive(self): with captured_output() as (_, err): - self.assertRaises( - SystemExit, parse_args, "run -f Filter1 Benchmark1".split() - ) + self.assertRaises(SystemExit, + parse_args, 'run -f Filter1 Benchmark1'.split()) self.assert_contains( - 
["error", "argument BENCHMARK: not allowed with argument -f/--filter"], - err.getvalue(), - ) + ['error', + 'argument BENCHMARK: not allowed with argument -f/--filter'], + err.getvalue()) def test_tests_location(self): here = os.path.dirname(os.path.abspath(__file__)) - self.assertEqual(parse_args(["run"]).tests, here) - tests = "/benchmarks/are/here" - self.assertEqual(parse_args(["run", "-t", tests]).tests, tests) + self.assertEqual(parse_args(['run']).tests, here) + tests = '/benchmarks/are/here' + self.assertEqual(parse_args(['run', '-t', tests]).tests, tests) def test_optimization_argument(self): - self.assertEqual(parse_args(["run"]).optimization, "O") - self.assertEqual(parse_args(["run", "-o", "O"]).optimization, "O") - self.assertEqual(parse_args(["run", "-o", "Onone"]).optimization, "Onone") - self.assertEqual(parse_args(["run", "-o", "Osize"]).optimization, "Osize") + self.assertEqual(parse_args(['run']).optimization, 'O') + self.assertEqual( + parse_args(['run', '-o', 'O']).optimization, 'O') + self.assertEqual( + parse_args(['run', '-o', 'Onone']).optimization, 'Onone') + self.assertEqual( + parse_args(['run', '-o', 'Osize']).optimization, 'Osize') with captured_output() as (_, err): - self.assertRaises(SystemExit, parse_args, ["run", "-o", "bogus"]) + self.assertRaises(SystemExit, + parse_args, ['run', '-o', 'bogus']) self.assert_contains( - [ - "error:", - "argument -o/--optimization: invalid choice: 'bogus'", - "(choose from 'O', 'Onone', 'Osize')", - ], - err.getvalue(), - ) + ['error:', + "argument -o/--optimization: invalid choice: 'bogus'", + "(choose from 'O', 'Onone', 'Osize')"], + err.getvalue()) def test_independent_samples(self): - self.assertEqual(parse_args(["run"]).independent_samples, 1) - self.assertEqual(parse_args(["run", "-i", "3"]).independent_samples, 3) + self.assertEqual(parse_args(['run']).independent_samples, 1) + self.assertEqual(parse_args(['run', '-i', '3']).independent_samples, + 3) with captured_output() as (out, err): - self.assertRaises(SystemExit, parse_args, ["run", "-i", "-3"]) + self.assertRaises(SystemExit, + parse_args, ['run', '-i', '-3']) self.assert_contains( - [ - "error:", - "argument -i/--independent-samples: " - + "invalid positive_int value: '-3'", - ], - err.getvalue(), - ) + ['error:', "argument -i/--independent-samples: " + + "invalid positive_int value: '-3'"], + err.getvalue()) def test_output_dir(self): - self.assertIsNone(parse_args(["run"]).output_dir) - self.assertEqual(parse_args(["run", "--output-dir", "/log"]).output_dir, "/log") + self.assertIsNone(parse_args(['run']).output_dir) + self.assertEqual( + parse_args(['run', '--output-dir', '/log']).output_dir, '/log') def test_check_supports_vebose_output(self): - self.assertFalse(parse_args(["check"]).verbose) - self.assertTrue(parse_args(["check", "-v"]).verbose) - self.assertTrue(parse_args(["check", "--verbose"]).verbose) + self.assertFalse(parse_args(['check']).verbose) + self.assertTrue(parse_args(['check', '-v']).verbose) + self.assertTrue(parse_args(['check', '--verbose']).verbose) def test_check_supports_mardown_output(self): - self.assertFalse(parse_args(["check"]).markdown) - self.assertTrue(parse_args(["check", "-md"]).markdown) - self.assertTrue(parse_args(["check", "--markdown"]).markdown) + self.assertFalse(parse_args(['check']).markdown) + self.assertTrue(parse_args(['check', '-md']).markdown) + self.assertTrue(parse_args(['check', '--markdown']).markdown) def test_check_flags_are_mutually_exclusive(self): with captured_output() as (out, err): - 
self.assertRaises(SystemExit, parse_args, ["check", "-md", "-v"]) + self.assertRaises(SystemExit, + parse_args, ['check', '-md', '-v']) self.assert_contains( - [ - "error:", - "argument -v/--verbose: " + "not allowed with argument -md/--markdown", - ], - err.getvalue(), - ) + ['error:', 'argument -v/--verbose: ' + + 'not allowed with argument -md/--markdown'], + err.getvalue()) class ArgsStub(object): def __init__(self): self.benchmarks = None self.filters = None - self.tests = "/benchmarks/" - self.optimization = "O" + self.tests = '/benchmarks/' + self.optimization = 'O' class SubprocessMock(Mock): """Mock for subprocess module's `check_output` method.""" - STDOUT = object() def __init__(self, responses=None): super(SubprocessMock, self).__init__(responses) - def _check_output(args, stdin=None, stdout=None, stderr=None, shell=False): + def _check_output(args, stdin=None, stdout=None, stderr=None, + shell=False): return self.record_and_respond(args, stdin, stdout, stderr, shell) - self.check_output = _check_output def record_and_respond(self, args, stdin, stdout, stderr, shell): # _ = stdin, stdout, shell # ignored in mock - assert stderr == self.STDOUT, "Errors are NOT redirected to STDOUT" + assert stderr == self.STDOUT, 'Errors are NOT redirected to STDOUT' args = tuple(args) self.calls.append(args) - return self.respond.get(args, "") + return self.respond.get(args, '') class TestBenchmarkDriverInitialization(unittest.TestCase): @@ -171,95 +169,81 @@ def setUp(self): def test_test_harness(self): self.assertEqual( - BenchmarkDriver(self.args, tests=["ignored"]).test_harness, - "/benchmarks/Benchmark_O", - ) - self.args.tests = "/path" - self.args.optimization = "Suffix" + BenchmarkDriver(self.args, tests=['ignored']).test_harness, + '/benchmarks/Benchmark_O') + self.args.tests = '/path' + self.args.optimization = 'Suffix' self.assertEqual( - BenchmarkDriver(self.args, tests=["ignored"]).test_harness, - "/path/Benchmark_Suffix", - ) + BenchmarkDriver(self.args, tests=['ignored']).test_harness, + '/path/Benchmark_Suffix') def test_gets_list_of_precommit_benchmarks(self): self.subprocess_mock.expect( - "/benchmarks/Benchmark_O --list --delim=\t".split(" "), - "#\tTest\t[Tags]\n1\tBenchmark1\t[t1, t2]\n2\tBenchmark2\t[t3]\n", - ) - driver = BenchmarkDriver(self.args, _subprocess=self.subprocess_mock) + '/benchmarks/Benchmark_O --list --delim=\t'.split(' '), + '#\tTest\t[Tags]\n1\tBenchmark1\t[t1, t2]\n2\tBenchmark2\t[t3]\n') + driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock) self.subprocess_mock.assert_called_all_expected() - self.assertEqual(driver.tests, ["Benchmark1", "Benchmark2"]) - self.assertEqual(driver.all_tests, ["Benchmark1", "Benchmark2"]) - self.assertEquals(driver.test_number["Benchmark1"], "1") - self.assertEquals(driver.test_number["Benchmark2"], "2") + self.assertEqual(driver.tests, + ['Benchmark1', 'Benchmark2']) + self.assertEqual(driver.all_tests, + ['Benchmark1', 'Benchmark2']) + self.assertEquals(driver.test_number['Benchmark1'], "1") + self.assertEquals(driver.test_number['Benchmark2'], "2") list_all_tests = ( - "/benchmarks/Benchmark_O --list --delim=\t --skip-tags=".split(" "), + '/benchmarks/Benchmark_O --list --delim=\t --skip-tags='.split(' '), """# Test [Tags] 1 Benchmark1 [t1, t2] 2 Benchmark2 [t3] 3 Benchmark3 [t3, t4] -""", - ) +""") def test_gets_list_of_all_benchmarks_when_benchmarks_args_exist(self): """Filters tests by name or test number, ignoring unknown.""" - self.args.benchmarks = "1 Benchmark3 1 bogus".split() + 
self.args.benchmarks = '1 Benchmark3 1 bogus'.split() self.subprocess_mock.expect(*self.list_all_tests) - driver = BenchmarkDriver(self.args, _subprocess=self.subprocess_mock) + driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock) self.subprocess_mock.assert_called_all_expected() - self.assertEqual(driver.tests, ["Benchmark1", "Benchmark3"]) - self.assertEqual(driver.all_tests, ["Benchmark1", "Benchmark2", "Benchmark3"]) + self.assertEqual(driver.tests, ['Benchmark1', 'Benchmark3']) + self.assertEqual(driver.all_tests, + ['Benchmark1', 'Benchmark2', 'Benchmark3']) def test_filters_benchmarks_by_pattern(self): - self.args.filters = "-f .+3".split() + self.args.filters = '-f .+3'.split() self.subprocess_mock.expect(*self.list_all_tests) - driver = BenchmarkDriver(self.args, _subprocess=self.subprocess_mock) + driver = BenchmarkDriver( + self.args, _subprocess=self.subprocess_mock) self.subprocess_mock.assert_called_all_expected() - self.assertEqual(driver.tests, ["Benchmark3"]) - self.assertEqual(driver.all_tests, ["Benchmark1", "Benchmark2", "Benchmark3"]) + self.assertEqual(driver.tests, ['Benchmark3']) + self.assertEqual(driver.all_tests, + ['Benchmark1', 'Benchmark2', 'Benchmark3']) def test_log_file(self): """When swift-repo is set, log is tied to Git branch and revision.""" - self.assertIsNone( - BenchmarkDriver( - Stub(output_dir=None, tests="/bin/"), tests=["ignored"] - ).log_file - ) + self.assertIsNone(BenchmarkDriver( + Stub(output_dir=None, tests='/bin/'), tests=['ignored']).log_file) - now = time.strftime("%Y%m%d%H%M%S", time.localtime()) + now = time.strftime('%Y%m%d%H%M%S', time.localtime()) driver = BenchmarkDriver( - Stub( - output_dir="/path", - tests="/bin/", - optimization="Suffix", - swift_repo=None, - ), - tests=["ignored"], - ) - self.assertEqual(driver.log_file, "/path/Benchmark_Suffix-" + now + ".log") - - r = "/repo/" - subprocess_mock = SubprocessMock( - responses=[ - ( - "git -C {0} rev-parse --abbrev-ref HEAD".format(r).split(" "), - "branch\n", - ), - ( - "git -C {0} rev-parse --short HEAD".format(r).split(" "), - "short_hash\n", - ), - ] - ) + Stub(output_dir='/path', tests='/bin/', optimization='Suffix', + swift_repo=None,), tests=['ignored']) + self.assertEqual(driver.log_file, + '/path/Benchmark_Suffix-' + now + '.log') + + r = '/repo/' + subprocess_mock = SubprocessMock(responses=[ + ('git -C {0} rev-parse --abbrev-ref HEAD'.format(r).split(' '), + 'branch\n'), + ('git -C {0} rev-parse --short HEAD'.format(r).split(' '), + 'short_hash\n'), + ]) driver = BenchmarkDriver( - Stub(output_dir="/log/", tests="", optimization="S", swift_repo=r), - tests=["ignored"], - _subprocess=subprocess_mock, - ) - self.assertEqual( - driver.log_file, "/log/branch/Benchmark_S-" + now + "-short_hash.log" - ) + Stub(output_dir='/log/', tests='', optimization='S', swift_repo=r), + tests=['ignored'], _subprocess=subprocess_mock) + self.assertEqual(driver.log_file, + '/log/branch/Benchmark_S-' + now + '-short_hash.log') subprocess_mock.assert_called_all_expected() @@ -269,8 +253,8 @@ class LogParserStub(object): @staticmethod def results_from_string(log_contents): LogParserStub.results_from_string_called = True - r = PerformanceTestResult("3,b1,1,123,123,123,0,123".split(",")) - return {"b1": r} + r = PerformanceTestResult('3,b1,1,123,123,123,0,123'.split(',')) + return {'b1': r} class TestBenchmarkDriverRunningTests(unittest.TestCase): @@ -279,38 +263,34 @@ def setUp(self): self.parser_stub = LogParserStub() self.subprocess_mock = SubprocessMock() 
self.subprocess_mock.expect( - "/benchmarks/Benchmark_O --list --delim=\t".split(" "), - "#\tTest\t[Tags]\n1\tb1\t[tag]\n", - ) + '/benchmarks/Benchmark_O --list --delim=\t'.split(' '), + '#\tTest\t[Tags]\n1\tb1\t[tag]\n') self.driver = BenchmarkDriver( - self.args, _subprocess=self.subprocess_mock, parser=self.parser_stub - ) + self.args, _subprocess=self.subprocess_mock, + parser=self.parser_stub) def test_run_benchmark_with_multiple_samples(self): - self.driver.run("b1") - self.subprocess_mock.assert_called_with(("/benchmarks/Benchmark_O", "b1")) - self.driver.run("b2", num_samples=5) + self.driver.run('b1') + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b1')) + self.driver.run('b2', num_samples=5) self.subprocess_mock.assert_called_with( - ("/benchmarks/Benchmark_O", "b2", "--num-samples=5") - ) + ('/benchmarks/Benchmark_O', 'b2', '--num-samples=5')) def test_run_benchmark_with_specified_number_of_iterations(self): - self.driver.run("b", num_iters=1) + self.driver.run('b', num_iters=1) self.subprocess_mock.assert_called_with( - ("/benchmarks/Benchmark_O", "b", "--num-iters=1") - ) + ('/benchmarks/Benchmark_O', 'b', '--num-iters=1')) def test_run_benchmark_for_specified_time(self): - self.driver.run("b", sample_time=0.5) + self.driver.run('b', sample_time=0.5) self.subprocess_mock.assert_called_with( - ("/benchmarks/Benchmark_O", "b", "--sample-time=0.5") - ) + ('/benchmarks/Benchmark_O', 'b', '--sample-time=0.5')) def test_run_benchmark_in_verbose_mode(self): - self.driver.run("b", verbose=True) + self.driver.run('b', verbose=True) self.subprocess_mock.assert_called_with( - ("/benchmarks/Benchmark_O", "b", "--verbose") - ) + ('/benchmarks/Benchmark_O', 'b', '--verbose')) def test_run_batch(self): """Run all active tests in a single execution of the Benchmark_X. @@ -318,9 +298,10 @@ def test_run_batch(self): Known test names are passed to the harness in a compressed form as test numbers. """ - self.driver.tests = ["b1", "bx"] + self.driver.tests = ['b1', 'bx'] self.driver.run() - self.subprocess_mock.assert_called_with(("/benchmarks/Benchmark_O", "1", "bx")) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', '1', 'bx')) def test_parse_results_from_running_benchmarks(self): """Parse measurements results using LogParser. @@ -328,70 +309,55 @@ def test_parse_results_from_running_benchmarks(self): Individual test run returns the first PerformanceTestResult directly. Batch run returns the dictionary of PerformanceTestResults. 
""" - r = self.driver.run("b") + r = self.driver.run('b') self.assertTrue(self.parser_stub.results_from_string_called) - self.assertEquals(r.name, "b1") # non-matching name, just 1st result + self.assertEquals(r.name, 'b1') # non-matching name, just 1st result r = self.driver.run() self.assertTrue(isinstance(r, dict)) - self.assertEquals(r["b1"].name, "b1") + self.assertEquals(r['b1'].name, 'b1') def test_measure_memory(self): - self.driver.run("b", measure_memory=True) + self.driver.run('b', measure_memory=True) self.subprocess_mock.assert_called_with( - ("/benchmarks/Benchmark_O", "b", "--memory") - ) + ('/benchmarks/Benchmark_O', 'b', '--memory')) def test_report_quantiles(self): """Use delta compression for quantile reports.""" - self.driver.run("b", quantile=4) + self.driver.run('b', quantile=4) self.subprocess_mock.assert_called_with( - ("/benchmarks/Benchmark_O", "b", "--quantile=4", "--delta") - ) + ('/benchmarks/Benchmark_O', 'b', '--quantile=4', '--delta')) def test_run_benchmark_independent_samples(self): """Extract up to 20 measurements from an independent run.""" self.driver.args.independent_samples = 3 - r = self.driver.run_independent_samples("b1") - self.assertEqual( - self.subprocess_mock.calls.count( - ( - "/benchmarks/Benchmark_O", - "b1", - "--num-iters=1", - "--memory", - "--quantile=20", - "--delta", - ) - ), - 3, - ) + r = self.driver.run_independent_samples('b1') + self.assertEqual(self.subprocess_mock.calls.count( + ('/benchmarks/Benchmark_O', 'b1', '--num-iters=1', '--memory', + '--quantile=20', '--delta')), 3) self.assertEqual(r.num_samples, 3) # results are merged def test_run_and_log(self): def mock_run(test): - self.assertEqual(test, "b1") + self.assertEqual(test, 'b1') return PerformanceTestResult( - "3,b1,5,101,1,1,1,1,888".split(","), - quantiles=True, - delta=True, - memory=True, - ) - - driver = BenchmarkDriver(tests=["b1"], args=Stub(output_dir=None)) + '3,b1,5,101,1,1,1,1,888'.split(','), + quantiles=True, delta=True, memory=True) + driver = BenchmarkDriver(tests=['b1'], args=Stub(output_dir=None)) driver.run_independent_samples = mock_run # patching with captured_output() as (out, _): log = driver.run_and_log() - header = ( - "#,TEST,SAMPLES,MIN(μs),Q1(μs),MEDIAN(μs),Q3(μs),MAX(μs)," + "MAX_RSS(B)\n" - ) - csv_log = "3,b1,5,101,102,103,104,105,888\n" + header = '#,TEST,SAMPLES,MIN(μs),Q1(μs),MEDIAN(μs),Q3(μs),MAX(μs),' +\ + 'MAX_RSS(B)\n' + csv_log = '3,b1,5,101,102,103,104,105,888\n' self.assertEqual(log, None) self.assertEqual( out.getvalue(), - header + csv_log + "\n" + "Total performance tests executed: 1\n", - ) + header + + csv_log + + '\n' + + 'Total performance tests executed: 1\n') with captured_output() as (out, _): log = driver.run_and_log(csv_console=False) @@ -399,72 +365,66 @@ def mock_run(test): self.assertEqual(log, header + csv_log) self.assertEqual( out.getvalue(), - " # TEST SAMPLES MIN(μs)" - + " Q1(μs) MEDIAN(μs) Q3(μs) MAX(μs) MAX_RSS(B)\n" - + " 3 b1 5 101" - + " 102 103 104 105 888\n" - + "\n" - + "Total performance tests executed: 1\n", - ) + ' # TEST SAMPLES MIN(μs)' + + ' Q1(μs) MEDIAN(μs) Q3(μs) MAX(μs) MAX_RSS(B)\n' + + ' 3 b1 5 101' + + ' 102 103 104 105 888\n' + + '\n' + + 'Total performance tests executed: 1\n') def test_log_results(self): """Create log directory if it doesn't exist and write the log file.""" - def assert_log_written(out, log_file, content): - self.assertEqual(out.getvalue(), "Logging results to: " + log_file + "\n") - with open(log_file, "rU") as f: + self.assertEqual(out.getvalue(), + 'Logging results 
to: ' + log_file + '\n') + with open(log_file, 'rU') as f: text = f.read() self.assertEqual(text, "formatted output") try: import tempfile # setUp - temp_dir = tempfile.mkdtemp() - log_dir = os.path.join(temp_dir, "sub-dir/") - driver = BenchmarkDriver(Stub(), tests=[""]) + log_dir = os.path.join(temp_dir, 'sub-dir/') + driver = BenchmarkDriver(Stub(), tests=['']) self.assertFalse(os.path.exists(log_dir)) content = "formatted output" - log_file = os.path.join(log_dir, "1.log") + log_file = os.path.join(log_dir, '1.log') with captured_output() as (out, _): driver.log_results(content, log_file=log_file) assert_log_written(out, log_file, content) self.assertTrue(os.path.exists(log_dir)) - log_file = os.path.join(log_dir, "2.log") + log_file = os.path.join(log_dir, '2.log') with captured_output() as (out, _): driver.log_results(content, log_file=log_file) assert_log_written(out, log_file, content) finally: import shutil # tearDown - shutil.rmtree(temp_dir) def test_deterministing_hashing(self): - cmd = ["printenv", "SWIFT_DETERMINISTIC_HASHING"] - driver = BenchmarkDriver(["no args"], tests=["ignored"]) - self.assertEqual(driver._invoke(cmd).strip(), "1") + cmd = ['printenv', 'SWIFT_DETERMINISTIC_HASHING'] + driver = BenchmarkDriver(['no args'], tests=['ignored']) + self.assertEqual(driver._invoke(cmd).strip(), '1') class BenchmarkDriverMock(Mock): """Mock for BenchmarkDriver's `run` method""" - def __init__(self, tests, responses=None): super(BenchmarkDriverMock, self).__init__(responses) self.tests = tests self.args = ArgsStub() - def _run( - test, num_samples=None, num_iters=None, verbose=None, measure_memory=False - ): - return self.record_and_respond( - test, num_samples, num_iters, verbose, measure_memory - ) - + def _run(test, num_samples=None, num_iters=None, + verbose=None, measure_memory=False): + return self.record_and_respond(test, num_samples, num_iters, + verbose, measure_memory) self.run = _run - def record_and_respond(self, test, num_samples, num_iters, verbose, measure_memory): + def record_and_respond(self, test, num_samples, num_iters, + verbose, measure_memory): args = (test, num_samples, num_iters, verbose, measure_memory) self.calls.append(args) return self.respond.get(args, _PTR(min=700)) @@ -472,53 +432,35 @@ def record_and_respond(self, test, num_samples, num_iters, verbose, measure_memo class TestLoggingReportFormatter(unittest.TestCase): def test_plain_log_format(self): - lr = logging.makeLogRecord( - { - "name": "Base.category", - "level": logging.DEBUG, - "levelname": "DEBUG", - "msg": "Hi!", - } - ) + lr = logging.makeLogRecord({ + 'name': 'Base.category', 'level': logging.DEBUG, + 'levelname': 'DEBUG', 'msg': 'Hi!'}) f = LoggingReportFormatter() - self.assertEqual(f.format(lr), "DEBUG category: Hi!") + self.assertEqual(f.format(lr), 'DEBUG category: Hi!') def test_colored_log_format(self): def record(level, level_name): - return logging.makeLogRecord( - { - "name": "Base.category", - "levelno": level, - "levelname": level_name, - "msg": "Hi!", - } - ) - + return logging.makeLogRecord({ + 'name': 'Base.category', 'levelno': level, + 'levelname': level_name, 'msg': 'Hi!'}) f = LoggingReportFormatter(use_color=True) - self.assertEqual( - f.format(record(logging.DEBUG, "DEBUG")), "\x1b[1;39mcategory: Hi!\x1b[1;0m" - ) - self.assertEqual( - f.format(record(logging.INFO, "INFO")), "\x1b[1;32mcategory: Hi!\x1b[1;0m" - ) - self.assertEqual( - f.format(record(logging.WARNING, "WARNING")), - "\x1b[1;33mcategory: Hi!\x1b[1;0m", - ) - self.assertEqual( - 
f.format(record(logging.ERROR, "ERROR")), "\x1b[1;31mcategory: Hi!\x1b[1;0m" - ) - self.assertEqual( - f.format(record(logging.CRITICAL, "CRITICAL")), - "\x1b[1;35mcategory: Hi!\x1b[1;0m", - ) + self.assertEqual(f.format(record(logging.DEBUG, 'DEBUG')), + '\x1b[1;39mcategory: Hi!\x1b[1;0m') + self.assertEqual(f.format(record(logging.INFO, 'INFO')), + '\x1b[1;32mcategory: Hi!\x1b[1;0m') + self.assertEqual(f.format(record(logging.WARNING, 'WARNING')), + '\x1b[1;33mcategory: Hi!\x1b[1;0m') + self.assertEqual(f.format(record(logging.ERROR, 'ERROR')), + '\x1b[1;31mcategory: Hi!\x1b[1;0m') + self.assertEqual(f.format(record(logging.CRITICAL, 'CRITICAL')), + '\x1b[1;35mcategory: Hi!\x1b[1;0m') def test_no_prefix_for_base_logging(self): - lr = logging.makeLogRecord( - {"name": "Base", "level": logging.INFO, "levelname": "INFO", "msg": "Hi!"} - ) + lr = logging.makeLogRecord({ + 'name': 'Base', 'level': logging.INFO, + 'levelname': 'INFO', 'msg': 'Hi!'}) f = LoggingReportFormatter() - self.assertEqual(f.format(lr), "INFO Hi!") + self.assertEqual(f.format(lr), 'INFO Hi!') class TestMarkdownReportHandler(unittest.TestCase): @@ -533,41 +475,44 @@ def assert_contains(self, texts): self.assertIn(text, self.stream.getvalue()) def record(self, level, category, msg): - return logging.makeLogRecord( - {"name": "BenchmarkDoctor." + category, "levelno": level, "msg": msg} - ) + return logging.makeLogRecord({ + 'name': 'BenchmarkDoctor.' + category, + 'levelno': level, 'msg': msg}) def test_init_writes_table_header(self): self.assertEqual(self.handler.level, logging.INFO) - self.assert_contains(["Benchmark Check Report\n", "---|---"]) + self.assert_contains(['Benchmark Check Report\n', '---|---']) def test_close_writes_final_newlines(self): self.handler.close() - self.assert_contains(["---|---\n\n"]) + self.assert_contains(['---|---\n\n']) def test_errors_and_warnings_start_new_rows_with_icons(self): - self.handler.emit(self.record(logging.ERROR, "", "Blunder")) - self.handler.emit(self.record(logging.WARNING, "", "Boo-boo")) - self.assert_contains(["\n⛔️ | Blunder", "\n⚠️ | Boo-boo"]) + self.handler.emit(self.record(logging.ERROR, '', 'Blunder')) + self.handler.emit(self.record(logging.WARNING, '', 'Boo-boo')) + self.assert_contains(['\n⛔️ | Blunder', + '\n⚠️ | Boo-boo']) def test_category_icons(self): - self.handler.emit(self.record(logging.WARNING, "naming", "naming")) - self.handler.emit(self.record(logging.WARNING, "runtime", "runtime")) - self.handler.emit(self.record(logging.WARNING, "memory", "memory")) - self.assert_contains(["🔤 | naming", "⏱ | runtime", "Ⓜ️ | memory"]) + self.handler.emit(self.record(logging.WARNING, 'naming', 'naming')) + self.handler.emit(self.record(logging.WARNING, 'runtime', 'runtime')) + self.handler.emit(self.record(logging.WARNING, 'memory', 'memory')) + self.assert_contains(['🔤 | naming', + '⏱ | runtime', + 'Ⓜ️ | memory']) def test_info_stays_in_table_cell_breaking_line_row_to_subscript(self): """Assuming Infos only follow after Errors and Warnings. Infos don't emit category icons. """ - self.handler.emit(self.record(logging.ERROR, "naming", "Blunder")) - self.handler.emit(self.record(logging.INFO, "naming", "Fixit")) - self.assert_contains(["Blunder
Fixit"]) + self.handler.emit(self.record(logging.ERROR, 'naming', 'Blunder')) + self.handler.emit(self.record(logging.INFO, 'naming', 'Fixit')) + self.assert_contains(['Blunder
Fixit']) def test_names_in_code_format(self): - self.handler.emit(self.record(logging.WARNING, "", "'QuotedName'")) - self.assert_contains(["| `QuotedName`"]) + self.handler.emit(self.record(logging.WARNING, '', "'QuotedName'")) + self.assert_contains(['| `QuotedName`']) def _PTR(min=700, mem_pages=1000, setup=None): @@ -575,17 +520,19 @@ def _PTR(min=700, mem_pages=1000, setup=None): return Stub(samples=Stub(min=min), mem_pages=mem_pages, setup=setup) -def _run(test, num_samples=None, num_iters=None, verbose=None, measure_memory=False): +def _run(test, num_samples=None, num_iters=None, verbose=None, + measure_memory=False): """Helper function that constructs tuple with arguments for run method.""" - return (test, num_samples, num_iters, verbose, measure_memory) + return ( + test, num_samples, num_iters, verbose, measure_memory) class TestBenchmarkDoctor(unittest.TestCase): @classmethod def setUpClass(cls): super(TestBenchmarkDoctor, cls).setUpClass() - doctor_log = logging.getLogger("BenchmarkDoctor") - cls._doctor_log_handler = MockLoggingHandler(level="DEBUG") + doctor_log = logging.getLogger('BenchmarkDoctor') + cls._doctor_log_handler = MockLoggingHandler(level='DEBUG') doctor_log.addHandler(cls._doctor_log_handler) def setUp(self): @@ -600,33 +547,34 @@ def assert_contains(self, texts, output): self.assertIn(text, output) def test_uses_logging(self): - driver = BenchmarkDriverMock(tests=["B1", "B2"]) + driver = BenchmarkDriverMock(tests=['B1', 'B2']) with captured_output() as (out, _): BenchmarkDoctor(self.args, driver) - self.assert_contains(["Checking tests: B1, B2"], self.logs["debug"]) - self.assertEqual(out.getvalue(), "") + self.assert_contains(['Checking tests: B1, B2'], self.logs['debug']) + self.assertEqual(out.getvalue(), '') def test_supports_verbose_output(self): - driver = BenchmarkDriverMock(tests=["B1", "B2"]) + driver = BenchmarkDriverMock(tests=['B1', 'B2']) driver.verbose = True self.args.verbose = True with captured_output() as (out, _): BenchmarkDoctor(self.args, driver) - self.assert_contains(["Checking tests: B1, B2"], out.getvalue()) + self.assert_contains(['Checking tests: B1, B2'], out.getvalue()) def test_uses_report_formatter(self): - doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock(tests=["B1"])) - console_handler = logging.getLogger("BenchmarkDoctor").handlers[1] + doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock(tests=['B1'])) + console_handler = logging.getLogger('BenchmarkDoctor').handlers[1] self.assertTrue(doctor) self.assertTrue(isinstance(console_handler, logging.StreamHandler)) - self.assertTrue(isinstance(console_handler.formatter, LoggingReportFormatter)) + self.assertTrue(isinstance(console_handler.formatter, + LoggingReportFormatter)) def test_uses_optional_markdown_report_formatter(self): self.args.markdown = True with captured_output() as (_, _): - doc = BenchmarkDoctor(self.args, BenchmarkDriverMock(tests=["B1"])) + doc = BenchmarkDoctor(self.args, BenchmarkDriverMock(tests=['B1'])) self.assertTrue(doc) - console_handler = logging.getLogger("BenchmarkDoctor").handlers[1] + console_handler = logging.getLogger('BenchmarkDoctor').handlers[1] self.assertTrue(isinstance(console_handler, MarkdownReportHandler)) def test_measure_10_independent_1s_benchmark_series(self): @@ -636,155 +584,86 @@ def test_measure_10_independent_1s_benchmark_series(self): take measurements for approximately 1s based on short initial runtime sampling. Capped at 200 samples. 
""" - driver = BenchmarkDriverMock( - tests=["B1"], - responses=( - [ - # calibration run, returns a stand-in for PerformanceTestResult - ( - _run("B1", num_samples=3, num_iters=1, verbose=True), - _PTR(min=300), - ) - ] - + - # 5x i1 series, with 300 μs runtime its possible to take 4098 - # samples/s, but it should be capped at 2k - ( - [ - ( - _run( - "B1", - num_samples=200, - num_iters=1, - verbose=True, - measure_memory=True, - ), - _PTR(min=300), - ) - ] - * 5 - ) - + - # 5x i2 series - ( - [ - ( - _run( - "B1", - num_samples=200, - num_iters=2, - verbose=True, - measure_memory=True, - ), - _PTR(min=300), - ) - ] - * 5 - ) - ), - ) + driver = BenchmarkDriverMock(tests=['B1'], responses=([ + # calibration run, returns a stand-in for PerformanceTestResult + (_run('B1', num_samples=3, num_iters=1, + verbose=True), _PTR(min=300))] + + # 5x i1 series, with 300 μs runtime its possible to take 4098 + # samples/s, but it should be capped at 2k + ([(_run('B1', num_samples=200, num_iters=1, + verbose=True, measure_memory=True), _PTR(min=300))] * 5) + + # 5x i2 series + ([(_run('B1', num_samples=200, num_iters=2, + verbose=True, measure_memory=True), _PTR(min=300))] * 5) + )) doctor = BenchmarkDoctor(self.args, driver) with captured_output() as (out, _): - measurements = doctor.measure("B1") + measurements = doctor.measure('B1') driver.assert_called_all_expected() self.assert_contains( - [ - "name", - "B1 O i1a", - "B1 O i1b", - "B1 O i1c", - "B1 O i1d", - "B1 O i1e", - "B1 O i2a", - "B1 O i2b", - "B1 O i2c", - "B1 O i2d", - "B1 O i2e", - ], - measurements.keys(), - ) - self.assertEqual(measurements["name"], "B1") + ['name', + 'B1 O i1a', 'B1 O i1b', 'B1 O i1c', 'B1 O i1d', 'B1 O i1e', + 'B1 O i2a', 'B1 O i2b', 'B1 O i2c', 'B1 O i2d', 'B1 O i2e'], + measurements.keys()) + self.assertEqual(measurements['name'], 'B1') self.assert_contains( - [ - "Calibrating num-samples for B1:", - "Runtime 300 μs yields 4096 adjusted samples per second.", - "Measuring B1, 5 x i1 (200 samples), 5 x i2 (200 samples)", - ], - self.logs["debug"], - ) + ['Calibrating num-samples for B1:', + 'Runtime 300 μs yields 4096 adjusted samples per second.', + 'Measuring B1, 5 x i1 (200 samples), 5 x i2 (200 samples)'], + self.logs['debug']) def test_benchmark_name_matches_naming_conventions(self): - driver = BenchmarkDriverMock( - tests=[ - "BenchmarkName", - "CapitalWordsConvention", - "ABBRName", - "TooManyCamelCaseHumps", - "Existential.Array.method.1x.Val4", - "Flatten.Array.Array.Str.for-in.reserved", - "Flatten.Array.String?.as!.NSArray", - "wrongCase", - "Wrong_convention", - "Illegal._$%[]<>{}@^()", - ] - ) + driver = BenchmarkDriverMock(tests=[ + 'BenchmarkName', 'CapitalWordsConvention', 'ABBRName', + 'TooManyCamelCaseHumps', + 'Existential.Array.method.1x.Val4', + 'Flatten.Array.Array.Str.for-in.reserved', + 'Flatten.Array.String?.as!.NSArray', + 'wrongCase', 'Wrong_convention', 'Illegal._$%[]<>{}@^()']) with captured_output() as (out, _): doctor = BenchmarkDoctor(self.args, driver) doctor.check() output = out.getvalue() - self.assertIn("naming: ", output) - self.assertNotIn("BenchmarkName", output) - self.assertNotIn("CapitalWordsConvention", output) - self.assertNotIn("ABBRName", output) - self.assertNotIn("Existential.Array.method.1x.Val4", output) - self.assertNotIn("Flatten.Array.Array.Str.for-in.reserved", output) - self.assertNotIn("Flatten.Array.String?.as!.NSArray", output) + self.assertIn('naming: ', output) + self.assertNotIn('BenchmarkName', output) + self.assertNotIn('CapitalWordsConvention', output) + 
self.assertNotIn('ABBRName', output) + self.assertNotIn('Existential.Array.method.1x.Val4', output) + self.assertNotIn('Flatten.Array.Array.Str.for-in.reserved', output) + self.assertNotIn('Flatten.Array.String?.as!.NSArray', output) err_msg = " name doesn't conform to benchmark naming convention." self.assert_contains( - [ - "'wrongCase'" + err_msg, - "'Wrong_convention'" + err_msg, - "'Illegal._$%[]<>{}@^()'" + err_msg, - ], - self.logs["error"], - ) + ["'wrongCase'" + err_msg, "'Wrong_convention'" + err_msg, + "'Illegal._$%[]<>{}@^()'" + err_msg], self.logs['error']) self.assert_contains( ["'TooManyCamelCaseHumps' name is composed of 5 words."], - self.logs["warning"], - ) - self.assert_contains(["See http://bit.ly/BenchmarkNaming"], self.logs["info"]) + self.logs['warning']) self.assert_contains( - [ - "Split 'TooManyCamelCaseHumps' name into dot-separated groups " - "and variants. See http://bit.ly/BenchmarkNaming" - ], - self.logs["info"], - ) + ['See http://bit.ly/BenchmarkNaming'], self.logs['info']) + self.assert_contains( + ["Split 'TooManyCamelCaseHumps' name into dot-separated groups " + "and variants. See http://bit.ly/BenchmarkNaming"], + self.logs['info']) def test_benchmark_name_is_at_most_40_chars_long(self): - driver = BenchmarkDriverMock( - tests=["BenchmarkName", "ThisTestNameIsTooLongAndCausesOverflowsInReports"] - ) + driver = BenchmarkDriverMock(tests=[ + 'BenchmarkName', + 'ThisTestNameIsTooLongAndCausesOverflowsInReports']) with captured_output() as (out, _): doctor = BenchmarkDoctor(self.args, driver) doctor.check() output = out.getvalue() - self.assertIn("naming: ", output) - self.assertNotIn("BenchmarkName", output) + self.assertIn('naming: ', output) + self.assertNotIn('BenchmarkName', output) self.assert_contains( - [ - "'ThisTestNameIsTooLongAndCausesOverflowsInReports' name is " - "48 characters long." - ], - self.logs["error"], - ) + ["'ThisTestNameIsTooLongAndCausesOverflowsInReports' name is " + "48 characters long."], self.logs['error']) self.assert_contains( ["Benchmark name should not be longer than 40 characters."], - self.logs["info"], - ) + self.logs['info']) def test_benchmark_runtime_range(self): """Optimized benchmark should have runtime between 20 μs and 1000 μs. @@ -800,226 +679,159 @@ def test_benchmark_runtime_range(self): Warn about longer runtime. Runtimes over 10ms are an error. 
""" - def measurements(name, runtime): - return { - "name": name, - name + " O i1a": _PTR(min=runtime + 2), - name + " O i2a": _PTR(min=runtime), - } + return {'name': name, + name + ' O i1a': _PTR(min=runtime + 2), + name + ' O i2a': _PTR(min=runtime)} with captured_output() as (out, _): doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) - doctor.analyze(measurements("Sylph", 0)) - doctor.analyze(measurements("Unicorn", 3)) - doctor.analyze(measurements("Cheetah", 200)) - doctor.analyze(measurements("Hare", 1001)) - doctor.analyze(measurements("Tortoise", 500000)) - doctor.analyze( - { - "name": "OverheadTurtle", - "OverheadTurtle O i1a": _PTR(min=800000), - "OverheadTurtle O i2a": _PTR(min=700000), - } - ) + doctor.analyze(measurements('Sylph', 0)) + doctor.analyze(measurements('Unicorn', 3)) + doctor.analyze(measurements('Cheetah', 200)) + doctor.analyze(measurements('Hare', 1001)) + doctor.analyze(measurements('Tortoise', 500000)) + doctor.analyze({'name': 'OverheadTurtle', + 'OverheadTurtle O i1a': _PTR(min=800000), + 'OverheadTurtle O i2a': _PTR(min=700000)}) output = out.getvalue() - self.assertIn("runtime: ", output) - self.assertNotIn("Cheetah", output) - self.assert_contains(["'Sylph' execution took 0 μs."], self.logs["error"]) + self.assertIn('runtime: ', output) + self.assertNotIn('Cheetah', output) + self.assert_contains(["'Sylph' execution took 0 μs."], + self.logs['error']) self.assert_contains( - [ - "Ensure the workload of 'Sylph' has a properly measurable size" - " (runtime > 20 μs) and is not eliminated by the compiler (use " - "`blackHole` function if necessary)." - ], - self.logs["info"], - ) - self.assert_contains(["'Unicorn' execution took 3 μs."], self.logs["warning"]) + ["Ensure the workload of 'Sylph' has a properly measurable size" + " (runtime > 20 μs) and is not eliminated by the compiler (use " + "`blackHole` function if necessary)."], + self.logs['info']) + self.assert_contains(["'Unicorn' execution took 3 μs."], + self.logs['warning']) self.assert_contains( ["Increase the workload of 'Unicorn' to be more than 20 μs."], - self.logs["info"], - ) - self.assert_contains( - ["'Hare' execution took at least 1001 μs."], self.logs["warning"] - ) + self.logs['info']) + self.assert_contains(["'Hare' execution took at least 1001 μs."], + self.logs['warning']) self.assert_contains( - [ - "Decrease the workload of 'Hare' by a factor of 2 (10), " - "to be less than 1000 μs." - ], - self.logs["info"], - ) + ["Decrease the workload of 'Hare' by a factor of 2 (10), " + "to be less than 1000 μs."], self.logs['info']) self.assert_contains( - ["'Tortoise' execution took at least 500000 μs."], self.logs["error"] - ) + ["'Tortoise' execution took at least 500000 μs."], + self.logs['error']) self.assert_contains( - [ - "Decrease the workload of 'Tortoise' by a factor of 512 (1000), " - "to be less than 1000 μs." - ], - self.logs["info"], - ) + ["Decrease the workload of 'Tortoise' by a factor of 512 (1000), " + "to be less than 1000 μs."], self.logs['info']) self.assert_contains( - [ - "'OverheadTurtle' execution took at least 600000 μs" - " (excluding the setup overhead)." 
- ], - self.logs["error"], - ) + ["'OverheadTurtle' execution took at least 600000 μs" + " (excluding the setup overhead)."], + self.logs['error']) def test_benchmark_has_no_significant_setup_overhead(self): with captured_output() as (out, _): doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) - doctor.analyze( - { - "name": "NoOverhead", # not 'significant' enough - # Based on DropFirstArray a10/e10: overhead 3.7% (6 μs) - "NoOverhead O i1a": _PTR(min=162), - "NoOverhead O i2a": _PTR(min=159), - } - ) - doctor.analyze( - { - "name": "SO", # Setup Overhead - # Based on SuffixArrayLazy a10/e10: overhead 5.8% (4 μs) - "SO O i1a": _PTR(min=69), - "SO O i1b": _PTR(min=70), - "SO O i2a": _PTR(min=67), - "SO O i2b": _PTR(min=68), - } - ) - doctor.analyze( - {"name": "Zero", "Zero O i1a": _PTR(min=0), "Zero O i2a": _PTR(min=0)} - ) - doctor.analyze( - { - "name": "LOA", # Limit of Accuracy - # Impossible to detect overhead: - # Even 1μs change in 20μs runtime is 5%. - "LOA O i1a": _PTR(min=21), - "LOA O i2a": _PTR(min=20), - } - ) + doctor.analyze({ + 'name': 'NoOverhead', # not 'significant' enough + # Based on DropFirstArray a10/e10: overhead 3.7% (6 μs) + 'NoOverhead O i1a': _PTR(min=162), + 'NoOverhead O i2a': _PTR(min=159)}) + doctor.analyze({ + 'name': 'SO', # Setup Overhead + # Based on SuffixArrayLazy a10/e10: overhead 5.8% (4 μs) + 'SO O i1a': _PTR(min=69), 'SO O i1b': _PTR(min=70), + 'SO O i2a': _PTR(min=67), 'SO O i2b': _PTR(min=68)}) + doctor.analyze({'name': 'Zero', 'Zero O i1a': _PTR(min=0), + 'Zero O i2a': _PTR(min=0)}) + doctor.analyze({ + 'name': 'LOA', # Limit of Accuracy + # Impossible to detect overhead: + # Even 1μs change in 20μs runtime is 5%. + 'LOA O i1a': _PTR(min=21), + 'LOA O i2a': _PTR(min=20)}) output = out.getvalue() - self.assertIn("runtime: ", output) - self.assertNotIn("NoOverhead", output) - self.assertNotIn("ZeroRuntime", output) - self.assertNotIn("LOA", output) + self.assertIn('runtime: ', output) + self.assertNotIn('NoOverhead', output) + self.assertNotIn('ZeroRuntime', output) + self.assertNotIn('LOA', output) self.assert_contains( - ["'SO' has setup overhead of 4 μs (5.8%)."], self.logs["error"] - ) + ["'SO' has setup overhead of 4 μs (5.8%)."], + self.logs['error']) self.assert_contains( - [ - "Move initialization of benchmark data to the `setUpFunction` " - "registered in `BenchmarkInfo`." 
- ], - self.logs["info"], - ) + ["Move initialization of benchmark data to the `setUpFunction` " + "registered in `BenchmarkInfo`."], self.logs['info']) def test_benchmark_setup_takes_reasonable_time(self): """Setup < 200 ms (20% extra on top of the typical 1 s measurement)""" with captured_output() as (out, _): doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) - doctor.analyze( - { - "name": "NormalSetup", - "NormalSetup O i1a": _PTR(setup=199999), - "NormalSetup O i2a": _PTR(setup=200001), - } - ) - doctor.analyze( - { - "name": "LongSetup", - "LongSetup O i1a": _PTR(setup=200001), - "LongSetup O i2a": _PTR(setup=200002), - } - ) + doctor.analyze({ + 'name': 'NormalSetup', + 'NormalSetup O i1a': _PTR(setup=199999), + 'NormalSetup O i2a': _PTR(setup=200001)}) + doctor.analyze({ + 'name': 'LongSetup', + 'LongSetup O i1a': _PTR(setup=200001), + 'LongSetup O i2a': _PTR(setup=200002)}) output = out.getvalue() - self.assertIn("runtime: ", output) - self.assertNotIn("NormalSetup", output) + self.assertIn('runtime: ', output) + self.assertNotIn('NormalSetup', output) self.assert_contains( - ["'LongSetup' setup took at least 200001 μs."], self.logs["error"] - ) + ["'LongSetup' setup took at least 200001 μs."], + self.logs['error']) self.assert_contains( - ["The `setUpFunction` should take no more than 200 ms."], self.logs["info"] - ) + ["The `setUpFunction` should take no more than 200 ms."], + self.logs['info']) def test_benchmark_has_constant_memory_use(self): """Benchmark's memory footprint must not vary with num-iters.""" with captured_output() as (out, _): doctor = BenchmarkDoctor(self.args, BenchmarkDriverMock([])) - doctor.analyze( - { - # The threshold of 15 pages was estimated from previous - # measurements. The normal range should be probably aproximated - # by a function instead of a simple constant. - # TODO: re-evaluate normal range from whole SBS - "name": "ConstantMemory", - "ConstantMemory O i1a": _PTR(mem_pages=1460), - "ConstantMemory O i2a": _PTR(mem_pages=(1460 + 15)), - } - ) - doctor.analyze( - { - "name": "VariableMemory", # ObserverForwardStruct - "VariableMemory O i1a": _PTR(mem_pages=1460), - "VariableMemory O i1b": _PTR(mem_pages=1472), - # i2 series start at 290 pages higher - "VariableMemory O i2a": _PTR(mem_pages=1750), - "VariableMemory O i2b": _PTR(mem_pages=1752), - } - ) - measurements = dict( - [ - ( - "HighVariance O i{0}{1}".format(num_iters, suffix), - _PTR(mem_pages=num_pages), - ) - for num_iters, pages in [ - (1, [6200, 5943, 4818, 5612, 5469]), - (2, [6244, 5832, 4674, 5176, 5490]), - ] - for num_pages, suffix in zip(pages, list("abcde")) - ] - ) - measurements["name"] = "HighVariance" # Array2D + doctor.analyze({ + # The threshold of 15 pages was estimated from previous + # measurements. The normal range should be probably aproximated + # by a function instead of a simple constant. 
+ # TODO: re-evaluate normal range from whole SBS + 'name': 'ConstantMemory', + 'ConstantMemory O i1a': _PTR(mem_pages=1460), + 'ConstantMemory O i2a': _PTR(mem_pages=(1460 + 15))}) + doctor.analyze({ + 'name': 'VariableMemory', # ObserverForwardStruct + 'VariableMemory O i1a': _PTR(mem_pages=1460), + 'VariableMemory O i1b': _PTR(mem_pages=1472), + # i2 series start at 290 pages higher + 'VariableMemory O i2a': _PTR(mem_pages=1750), + 'VariableMemory O i2b': _PTR(mem_pages=1752)}) + measurements = dict([ + ('HighVariance O i{0}{1}'.format(num_iters, suffix), + _PTR(mem_pages=num_pages)) + for num_iters, pages in [ + (1, [6200, 5943, 4818, 5612, 5469]), + (2, [6244, 5832, 4674, 5176, 5490])] + for num_pages, suffix in zip(pages, list('abcde'))]) + measurements['name'] = 'HighVariance' # Array2D doctor.analyze(measurements) output = out.getvalue() - self.assertIn("memory: ", output) - self.assertNotIn("ConstantMemory", output) + self.assertIn('memory: ', output) + self.assertNotIn('ConstantMemory', output) self.assert_contains( - [ - "'VariableMemory' varies the memory footprint of the base " - "workload depending on the `num-iters`." - ], - self.logs["error"], - ) + ["'VariableMemory' varies the memory footprint of the base " + "workload depending on the `num-iters`."], + self.logs['error']) self.assert_contains( - [ - "'VariableMemory' " - "mem_pages [i1, i2]: min=[1460, 1750] 𝚫=290 R=[12, 2]" - ], - self.logs["info"], - ) + ["'VariableMemory' " + "mem_pages [i1, i2]: min=[1460, 1750] 𝚫=290 R=[12, 2]"], + self.logs['info']) self.assert_contains( - [ - "'HighVariance' has very wide range of memory used between " - "independent, repeated measurements." - ], - self.logs["warning"], - ) + ["'HighVariance' has very wide range of memory used between " + "independent, repeated measurements."], + self.logs['warning']) self.assert_contains( - [ - "'HighVariance' " - "mem_pages [i1, i2]: min=[4818, 4674] 𝚫=144 R=[1382, 1570]" - ], - self.logs["info"], - ) + ["'HighVariance' " + "mem_pages [i1, i2]: min=[4818, 4674] 𝚫=144 R=[1382, 1570]"], + self.logs['info']) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 2053e93c0b42b..4c1c6effffcd5 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -48,11 +48,11 @@ def test_is_iterable(self): class TestPerformanceTestSamples(unittest.TestCase): def setUp(self): - self.samples = PerformanceTestSamples("B1") + self.samples = PerformanceTestSamples('B1') self.samples.add(Sample(7, 42, 1000)) def test_has_name(self): - self.assertEqual(self.samples.name, "B1") + self.assertEqual(self.samples.name, 'B1') def test_stores_samples(self): self.assertEqual(self.samples.count, 1) @@ -70,7 +70,7 @@ def test_quantile(self): self.assertEqual(self.samples.quantile(1), 1100) self.samples.add(Sample(3, 1, 1050)) self.assertEqual(self.samples.quantile(0), 1000) - self.assertEqual(self.samples.quantile(0.5), 1050) + self.assertEqual(self.samples.quantile(.5), 1050) self.assertEqual(self.samples.quantile(1), 1100) def assertEqualFiveNumberSummary(self, ss, expected_fns): @@ -82,15 +82,20 @@ def assertEqualFiveNumberSummary(self, ss, expected_fns): self.assertEqual(ss.max, e_max) def test_computes_five_number_summary(self): - self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1000, 1000, 1000)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1000, 1000, 1000)) 
self.samples.add(Sample(2, 1, 1100)) - self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1000, 1100, 1100)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1000, 1100, 1100)) self.samples.add(Sample(3, 1, 1050)) - self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1050, 1100, 1100)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1050, 1100, 1100)) self.samples.add(Sample(4, 1, 1025)) - self.assertEqualFiveNumberSummary(self.samples, (1000, 1000, 1025, 1050, 1100)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1000, 1025, 1050, 1100)) self.samples.add(Sample(5, 1, 1075)) - self.assertEqualFiveNumberSummary(self.samples, (1000, 1025, 1050, 1075, 1100)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1025, 1050, 1075, 1100)) def test_computes_inter_quartile_range(self): self.assertEqual(self.samples.iqr, 0) @@ -106,66 +111,59 @@ def assertEqualtats(self, stats, expected_stats): def test_computes_mean_sd_cv(self): ss = self.samples - self.assertEqualtats((ss.mean, ss.sd, ss.cv), (1000.0, 0.0, 0.0)) + self.assertEqualtats( + (ss.mean, ss.sd, ss.cv), (1000.0, 0.0, 0.0)) self.samples.add(Sample(2, 1, 1100)) - self.assertEqualtats((ss.mean, ss.sd, ss.cv), (1050.0, 70.71, 6.7 / 100)) + self.assertEqualtats( + (ss.mean, ss.sd, ss.cv), (1050.0, 70.71, 6.7 / 100)) def test_computes_range_spread(self): ss = self.samples - self.assertEqualtats((ss.range, ss.spread), (0, 0)) + self.assertEqualtats( + (ss.range, ss.spread), (0, 0)) self.samples.add(Sample(2, 1, 1100)) - self.assertEqualtats((ss.range, ss.spread), (100, 10.0 / 100)) + self.assertEqualtats( + (ss.range, ss.spread), (100, 10.0 / 100)) def test_init_with_samples(self): self.samples = PerformanceTestSamples( - "B2", [Sample(0, 1, 1000), Sample(1, 1, 1100)] - ) + 'B2', [Sample(0, 1, 1000), Sample(1, 1, 1100)]) self.assertEqual(self.samples.count, 2) self.assertEqualtats( - ( - self.samples.mean, - self.samples.sd, - self.samples.range, - self.samples.spread, - ), - (1050.0, 70.71, 100, 9.52 / 100), - ) + (self.samples.mean, self.samples.sd, + self.samples.range, self.samples.spread), + (1050.0, 70.71, 100, 9.52 / 100)) def test_can_handle_zero_runtime(self): # guard against dividing by 0 - self.samples = PerformanceTestSamples("Zero") + self.samples = PerformanceTestSamples('Zero') self.samples.add(Sample(0, 1, 0)) self.assertEqualtats( - ( - self.samples.mean, - self.samples.sd, - self.samples.cv, - self.samples.range, - self.samples.spread, - ), - (0, 0, 0.0, 0, 0.0), - ) + (self.samples.mean, self.samples.sd, self.samples.cv, + self.samples.range, self.samples.spread), + (0, 0, 0.0, 0, 0.0)) def test_excludes_outliers(self): - ss = [ - Sample(*map(int, s.split())) - for s in "0 1 1000, 1 1 1025, 2 1 1050, 3 1 1075, 4 1 1100, " - "5 1 1000, 6 1 1025, 7 1 1050, 8 1 1075, 9 1 1100, " - "10 1 1050, 11 1 949, 12 1 1151".split(",") - ] - self.samples = PerformanceTestSamples("Outliers", ss) + ss = [Sample(*map(int, s.split())) for s in + '0 1 1000, 1 1 1025, 2 1 1050, 3 1 1075, 4 1 1100, ' + '5 1 1000, 6 1 1025, 7 1 1050, 8 1 1075, 9 1 1100, ' + '10 1 1050, 11 1 949, 12 1 1151'.split(',')] + self.samples = PerformanceTestSamples('Outliers', ss) self.assertEqual(self.samples.count, 13) - self.assertEqualtats((self.samples.mean, self.samples.sd), (1050, 52.36)) + self.assertEqualtats( + (self.samples.mean, self.samples.sd), (1050, 52.36)) self.samples.exclude_outliers() self.assertEqual(self.samples.count, 11) self.assertEqual(self.samples.outliers, ss[11:]) - 
self.assertEqualFiveNumberSummary(self.samples, (1000, 1025, 1050, 1075, 1100)) - self.assertEqualtats((self.samples.mean, self.samples.sd), (1050, 35.36)) + self.assertEqualFiveNumberSummary( + self.samples, (1000, 1025, 1050, 1075, 1100)) + self.assertEqualtats( + (self.samples.mean, self.samples.sd), (1050, 35.36)) def test_excludes_outliers_zero_IQR(self): - self.samples = PerformanceTestSamples("Tight") + self.samples = PerformanceTestSamples('Tight') self.samples.add(Sample(0, 2, 23)) self.samples.add(Sample(1, 2, 18)) self.samples.add(Sample(2, 2, 18)) @@ -175,14 +173,13 @@ def test_excludes_outliers_zero_IQR(self): self.samples.exclude_outliers() self.assertEqual(self.samples.count, 3) - self.assertEqualtats((self.samples.min, self.samples.max), (18, 18)) + self.assertEqualtats( + (self.samples.min, self.samples.max), (18, 18)) def test_excludes_outliers_top_only(self): - ss = [ - Sample(*map(int, s.split())) - for s in "0 1 1, 1 1 2, 2 1 2, 3 1 2, 4 1 3".split(",") - ] - self.samples = PerformanceTestSamples("Top", ss) + ss = [Sample(*map(int, s.split())) for s in + '0 1 1, 1 1 2, 2 1 2, 3 1 2, 4 1 3'.split(',')] + self.samples = PerformanceTestSamples('Top', ss) self.assertEqualFiveNumberSummary(self.samples, (1, 2, 2, 2, 3)) self.assertEqual(self.samples.iqr, 0) @@ -194,52 +191,48 @@ def test_excludes_outliers_top_only(self): class TestPerformanceTestResult(unittest.TestCase): def test_init(self): - log_line = "1,AngryPhonebook,20,10664,12933,11035,576,10884" - r = PerformanceTestResult(log_line.split(",")) - self.assertEqual(r.test_num, "1") - self.assertEqual(r.name, "AngryPhonebook") + log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' + r = PerformanceTestResult(log_line.split(',')) + self.assertEqual(r.test_num, '1') + self.assertEqual(r.name, 'AngryPhonebook') self.assertEqual( (r.num_samples, r.min, r.max, r.mean, r.sd, r.median), - (20, 10664, 12933, 11035, 576, 10884), - ) + (20, 10664, 12933, 11035, 576, 10884)) self.assertEqual(r.samples, None) - log_line = "1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336" - r = PerformanceTestResult(log_line.split(",")) + log_line = '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' + r = PerformanceTestResult(log_line.split(',')) self.assertEqual(r.max_rss, 10510336) def test_init_quantiles(self): # #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs) - log = "1,Ackermann,3,54383,54512,54601" - r = PerformanceTestResult(log.split(","), quantiles=True) - self.assertEqual(r.test_num, "1") - self.assertEqual(r.name, "Ackermann") - self.assertEqual( - (r.num_samples, r.min, r.median, r.max), (3, 54383, 54512, 54601) - ) + log = '1,Ackermann,3,54383,54512,54601' + r = PerformanceTestResult(log.split(','), quantiles=True) + self.assertEqual(r.test_num, '1') + self.assertEqual(r.name, 'Ackermann') + self.assertEqual((r.num_samples, r.min, r.median, r.max), + (3, 54383, 54512, 54601)) self.assertAlmostEquals(r.mean, 54498.67, places=2) self.assertAlmostEquals(r.sd, 109.61, places=2) self.assertEqual(r.samples.count, 3) self.assertEqual(r.samples.num_samples, 3) - self.assertEqual( - [s.runtime for s in r.samples.all_samples], [54383, 54512, 54601] - ) + self.assertEqual([s.runtime for s in r.samples.all_samples], + [54383, 54512, 54601]) # #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B) - log = "1,Ackermann,3,54529,54760,55807,266240" - r = PerformanceTestResult(log.split(","), quantiles=True, memory=True) + log = '1,Ackermann,3,54529,54760,55807,266240' + r = PerformanceTestResult(log.split(','), quantiles=True, memory=True) 
self.assertEqual((r.samples.count, r.max_rss), (3, 266240)) # #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs) - log = "1,Ackermann,5,54570,54593,54644,57212,58304" - r = PerformanceTestResult(log.split(","), quantiles=True, memory=False) - self.assertEqual( - (r.num_samples, r.min, r.median, r.max), (5, 54570, 54644, 58304) - ) + log = '1,Ackermann,5,54570,54593,54644,57212,58304' + r = PerformanceTestResult(log.split(','), quantiles=True, memory=False) + self.assertEqual((r.num_samples, r.min, r.median, r.max), + (5, 54570, 54644, 58304)) self.assertEqual((r.samples.q1, r.samples.q3), (54593, 57212)) self.assertEqual(r.samples.count, 5) # #,TEST,SAMPLES,MIN(μs),Q1(μs),Q2(μs),Q3(μs),MAX(μs),MAX_RSS(B) - log = "1,Ackermann,5,54686,54731,54774,55030,63466,270336" - r = PerformanceTestResult(log.split(","), quantiles=True, memory=True) + log = '1,Ackermann,5,54686,54731,54774,55030,63466,270336' + r = PerformanceTestResult(log.split(','), quantiles=True, memory=True) self.assertEqual(r.samples.num_samples, 5) self.assertEqual(r.samples.count, 4) # outlier was excluded self.assertEqual(r.max_rss, 270336) @@ -248,9 +241,10 @@ def test_init_delta_quantiles(self): # #,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX # 2-quantile from 2 samples in repeated min, when delta encoded, # the difference is 0, which is ommited -- only separator remains - log = "202,DropWhileArray,2,265,,22" - r = PerformanceTestResult(log.split(","), quantiles=True, delta=True) - self.assertEqual((r.num_samples, r.min, r.median, r.max), (2, 265, 265, 287)) + log = '202,DropWhileArray,2,265,,22' + r = PerformanceTestResult(log.split(','), quantiles=True, delta=True) + self.assertEqual((r.num_samples, r.min, r.median, r.max), + (2, 265, 265, 287)) self.assertEqual(r.samples.count, 2) self.assertEqual(r.samples.num_samples, 2) @@ -266,17 +260,14 @@ def test_init_oversampled_quantiles(self): qs <- subsample(x, s); c(qs[1], diff(qs)) })) sapply(c(3, 5, 11, 21), tbl) """ - def validatePTR(deq): # construct from delta encoded quantiles string - deq = deq.split(",") - num_samples = deq.count("1") - r = PerformanceTestResult( - ["0", "B", str(num_samples)] + deq, quantiles=True, delta=True - ) + deq = deq.split(',') + num_samples = deq.count('1') + r = PerformanceTestResult(['0', 'B', str(num_samples)] + deq, + quantiles=True, delta=True) self.assertEqual(r.samples.num_samples, num_samples) - self.assertEqual( - [s.runtime for s in r.samples.all_samples], range(1, num_samples + 1) - ) + self.assertEqual([s.runtime for s in r.samples.all_samples], + range(1, num_samples + 1)) delta_encoded_quantiles = """ 1,, @@ -315,58 +306,55 @@ def validatePTR(deq): # construct from delta encoded quantiles string 1,,1,1,1,1,1,1,1,1,,1,1,1,1,1,1,1,1,1, 1,,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1""" - map(validatePTR, delta_encoded_quantiles.split("\n")[1:]) + map(validatePTR, delta_encoded_quantiles.split('\n')[1:]) def test_init_meta(self): # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),… # …PAGES,ICS,YIELD - log = "1,Ackermann,200,715,1281,726,47,715,7,29,15" - r = PerformanceTestResult(log.split(","), meta=True) - self.assertEqual((r.test_num, r.name), ("1", "Ackermann")) + log = '1,Ackermann,200,715,1281,726,47,715,7,29,15' + r = PerformanceTestResult(log.split(','), meta=True) + self.assertEqual((r.test_num, r.name), ('1', 'Ackermann')) self.assertEqual( (r.num_samples, r.min, r.max, r.mean, r.sd, r.median), - (200, 715, 1281, 726, 47, 715), - ) - self.assertEqual((r.mem_pages, r.involuntary_cs, 
r.yield_count), (7, 29, 15)) + (200, 715, 1281, 726, 47, 715)) + self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), + (7, 29, 15)) # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),MAX_RSS(B),… # …PAGES,ICS,YIELD - log = "1,Ackermann,200,715,1951,734,97,715,36864,9,50,15" - r = PerformanceTestResult(log.split(","), memory=True, meta=True) + log = '1,Ackermann,200,715,1951,734,97,715,36864,9,50,15' + r = PerformanceTestResult(log.split(','), memory=True, meta=True) self.assertEqual( (r.num_samples, r.min, r.max, r.mean, r.sd, r.median), - (200, 715, 1951, 734, 97, 715), - ) + (200, 715, 1951, 734, 97, 715)) self.assertEqual( (r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss), - (9, 50, 15, 36864), - ) + (9, 50, 15, 36864)) # #,TEST,SAMPLES,MIN(μs),MAX(μs),PAGES,ICS,YIELD - log = "1,Ackermann,200,715,3548,8,31,15" - r = PerformanceTestResult(log.split(","), quantiles=True, meta=True) + log = '1,Ackermann,200,715,3548,8,31,15' + r = PerformanceTestResult(log.split(','), quantiles=True, meta=True) self.assertEqual((r.num_samples, r.min, r.max), (200, 715, 3548)) - self.assertEqual( - (r.samples.count, r.samples.min, r.samples.max), (2, 715, 3548) - ) - self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 31, 15)) + self.assertEqual((r.samples.count, r.samples.min, r.samples.max), + (2, 715, 3548)) + self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), + (8, 31, 15)) # #,TEST,SAMPLES,MIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD - log = "1,Ackermann,200,715,1259,32768,8,28,15" + log = '1,Ackermann,200,715,1259,32768,8,28,15' r = PerformanceTestResult( - log.split(","), quantiles=True, memory=True, meta=True - ) + log.split(','), quantiles=True, memory=True, meta=True) self.assertEqual((r.num_samples, r.min, r.max), (200, 715, 1259)) - self.assertEqual( - (r.samples.count, r.samples.min, r.samples.max), (2, 715, 1259) - ) + self.assertEqual((r.samples.count, r.samples.min, r.samples.max), + (2, 715, 1259)) self.assertEquals(r.max_rss, 32768) - self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 28, 15)) + self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), + (8, 28, 15)) def test_repr(self): - log_line = "1,AngryPhonebook,20,10664,12933,11035,576,10884" - r = PerformanceTestResult(log_line.split(",")) + log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' + r = PerformanceTestResult(log_line.split(',')) self.assertEqual( str(r), - "", + '' ) def test_merge(self): @@ -374,70 +362,51 @@ def test_merge(self): 1,AngryPhonebook,1,12045,12045,12045,0,12045 1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336 1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144 -1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split( - "\n" - )[ - 1: - ] - results = map(PerformanceTestResult, [line.split(",") for line in tests]) +1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split('\n')[1:] + results = map(PerformanceTestResult, + [line.split(',') for line in tests]) results[2].setup = 9 results[3].setup = 7 def as_tuple(r): - return ( - r.num_samples, - r.min, - r.max, - round(r.mean, 2), - r.sd, - r.median, - r.max_rss, - r.setup, - ) + return (r.num_samples, r.min, r.max, round(r.mean, 2), + r.sd, r.median, r.max_rss, r.setup) r = results[0] - self.assertEqual(as_tuple(r), (1, 12045, 12045, 12045, 0, 12045, None, None)) + self.assertEqual(as_tuple(r), + (1, 12045, 12045, 12045, 0, 12045, None, None)) r.merge(results[1]) - self.assertEqual( - as_tuple(r), # drops SD and median, +max_rss - (2, 12045, 
12325, 12185, None, None, 10510336, None), - ) + self.assertEqual(as_tuple(r), # drops SD and median, +max_rss + (2, 12045, 12325, 12185, None, None, 10510336, None)) r.merge(results[2]) - self.assertEqual( - as_tuple(r), # picks smaller of the MAX_RSS, +setup - (3, 11616, 12325, 11995.33, None, None, 10502144, 9), - ) + self.assertEqual(as_tuple(r), # picks smaller of the MAX_RSS, +setup + (3, 11616, 12325, 11995.33, None, None, 10502144, 9)) r.merge(results[3]) - self.assertEqual( - as_tuple(r), # picks smaller of the setup values - (4, 11616, 12325, 12064, None, None, 10498048, 7), - ) + self.assertEqual(as_tuple(r), # picks smaller of the setup values + (4, 11616, 12325, 12064, None, None, 10498048, 7)) class TestResultComparison(unittest.TestCase): def setUp(self): self.r0 = PerformanceTestResult( - "101,GlobalClass,20,0,0,0,0,0,10185728".split(",") - ) + '101,GlobalClass,20,0,0,0,0,0,10185728'.split(',')) self.r01 = PerformanceTestResult( - "101,GlobalClass,20,20,20,20,0,0,10185728".split(",") - ) + '101,GlobalClass,20,20,20,20,0,0,10185728'.split(',')) self.r1 = PerformanceTestResult( - "1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336".split(",") - ) + '1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336'.split(',')) self.r2 = PerformanceTestResult( - "1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144".split(",") - ) + '1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144'.split(',')) def test_init(self): rc = ResultComparison(self.r1, self.r2) - self.assertEqual(rc.name, "AngryPhonebook") + self.assertEqual(rc.name, 'AngryPhonebook') self.assertAlmostEquals(rc.ratio, 12325.0 / 11616.0) - self.assertAlmostEquals(rc.delta, (((11616.0 / 12325.0) - 1) * 100), places=3) + self.assertAlmostEquals(rc.delta, (((11616.0 / 12325.0) - 1) * 100), + places=3) # handle test results that sometimes change to zero, when compiler # optimizes out the body of the incorrectly written test rc = ResultComparison(self.r0, self.r0) - self.assertEqual(rc.name, "GlobalClass") + self.assertEqual(rc.name, 'GlobalClass') self.assertAlmostEquals(rc.ratio, 1) self.assertAlmostEquals(rc.delta, 0, places=3) rc = ResultComparison(self.r0, self.r01) @@ -447,7 +416,10 @@ def test_init(self): self.assertAlmostEquals(rc.ratio, 20001) self.assertAlmostEquals(rc.delta, -99.995, places=3) # disallow comparison of different test results - self.assertRaises(AssertionError, ResultComparison, self.r0, self.r1) + self.assertRaises( + AssertionError, + ResultComparison, self.r0, self.r1 + ) def test_values_is_dubious(self): self.assertFalse(ResultComparison(self.r1, self.r2).is_dubious) @@ -469,7 +441,7 @@ def tearDown(self): def write_temp_file(self, file_name, data): temp_file_name = os.path.join(self.test_dir, file_name) - with open(temp_file_name, "w") as f: + with open(temp_file_name, 'w') as f: f.write(data) return temp_file_name @@ -489,25 +461,19 @@ class OldAndNewLog(unittest.TestCase): 3,Array2D,20,335831,400221,346622,0,346622 1,AngryPhonebook,20,10458,12714,11000,0,11000""" - old_results = dict( - [ - (r.name, r) - for r in map( - PerformanceTestResult, - [line.split(",") for line in old_log_content.splitlines()], - ) - ] - ) - - new_results = dict( - [ - (r.name, r) - for r in map( - PerformanceTestResult, - [line.split(",") for line in new_log_content.splitlines()], - ) - ] - ) + old_results = dict([(r.name, r) + for r in + map(PerformanceTestResult, + [line.split(',') + for line in + old_log_content.splitlines()])]) + + new_results = dict([(r.name, r) + for r in + map(PerformanceTestResult, + 
[line.split(',') + for line in + new_log_content.splitlines()])]) def assert_report_contains(self, texts, report): assert not isinstance(texts, str) @@ -528,108 +494,95 @@ def test_parse_results_csv(self): parser = LogParser() results = parser.parse_results(log.splitlines()) self.assertTrue(isinstance(results[0], PerformanceTestResult)) - self.assertEquals(results[0].name, "Array.append.Array.Int?") - self.assertEquals(results[1].name, "Bridging.NSArray.as!.Array.NSString") - self.assertEquals(results[2].name, "Flatten.Array.Tuple4.lazy.for-in.Reserve") + self.assertEquals(results[0].name, 'Array.append.Array.Int?') + self.assertEquals(results[1].name, + 'Bridging.NSArray.as!.Array.NSString') + self.assertEquals(results[2].name, + 'Flatten.Array.Tuple4.lazy.for-in.Reserve') def test_parse_results_tab_delimited(self): - log = "34\tBitCount\t20\t3\t4\t4\t0\t4" + log = '34\tBitCount\t20\t3\t4\t4\t0\t4' parser = LogParser() results = parser.parse_results(log.splitlines()) self.assertTrue(isinstance(results[0], PerformanceTestResult)) - self.assertEqual(results[0].name, "BitCount") + self.assertEqual(results[0].name, 'BitCount') def test_parse_results_formatted_text(self): """Parse format that Benchmark_Driver prints to console""" - log = """ + log = (""" # TEST SAMPLES MIN(μs) MAX(μs) MEAN(μs) SD(μs) MEDIAN(μs) MAX_RSS(B) 3 Array2D 20 2060 2188 2099 0 2099 20915200 Total performance tests executed: 1 -""" +""") parser = LogParser() results = parser.parse_results(log.splitlines()[1:]) # without 1st \n self.assertTrue(isinstance(results[0], PerformanceTestResult)) r = results[0] - self.assertEqual(r.name, "Array2D") + self.assertEqual(r.name, 'Array2D') self.assertEqual(r.max_rss, 20915200) def test_parse_quantiles(self): """Gathers samples from reported quantiles. Handles optional memory.""" r = LogParser.results_from_string( """#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs) -1,Ackermann,3,54383,54512,54601""" - )["Ackermann"] - self.assertEqual( - [s.runtime for s in r.samples.all_samples], [54383, 54512, 54601] - ) +1,Ackermann,3,54383,54512,54601""")['Ackermann'] + self.assertEqual([s.runtime for s in r.samples.all_samples], + [54383, 54512, 54601]) r = LogParser.results_from_string( """#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B) -1,Ackermann,3,54529,54760,55807,266240""" - )["Ackermann"] - self.assertEqual( - [s.runtime for s in r.samples.all_samples], [54529, 54760, 55807] - ) +1,Ackermann,3,54529,54760,55807,266240""")['Ackermann'] + self.assertEqual([s.runtime for s in r.samples.all_samples], + [54529, 54760, 55807]) self.assertEqual(r.max_rss, 266240) def test_parse_delta_quantiles(self): r = LogParser.results_from_string( # 2-quantile aka. median - "#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,1,101,," - )["B"] + '#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,1,101,,')['B'] self.assertEqual( (r.num_samples, r.min, r.median, r.max, r.samples.count), - (1, 101, 101, 101, 1), - ) + (1, 101, 101, 101, 1)) r = LogParser.results_from_string( - "#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,2,101,,1" - )["B"] + '#,TEST,SAMPLES,MIN(μs),𝚫MEDIAN,𝚫MAX\n0,B,2,101,,1')['B'] self.assertEqual( (r.num_samples, r.min, r.median, r.max, r.samples.count), - (2, 101, 101, 102, 2), - ) + (2, 101, 101, 102, 2)) r = LogParser.results_from_string( # 20-quantiles aka. 
ventiles - "#,TEST,SAMPLES,MIN(μs),𝚫V1,𝚫V2,𝚫V3,𝚫V4,𝚫V5,𝚫V6,𝚫V7,𝚫V8," - + "𝚫V9,𝚫VA,𝚫VB,𝚫VC,𝚫VD,𝚫VE,𝚫VF,𝚫VG,𝚫VH,𝚫VI,𝚫VJ,𝚫MAX\n" - + "202,DropWhileArray,200,214,,,,,,,,,,,,1,,,,,,2,16,464" - )["DropWhileArray"] + '#,TEST,SAMPLES,MIN(μs),𝚫V1,𝚫V2,𝚫V3,𝚫V4,𝚫V5,𝚫V6,𝚫V7,𝚫V8,' + + '𝚫V9,𝚫VA,𝚫VB,𝚫VC,𝚫VD,𝚫VE,𝚫VF,𝚫VG,𝚫VH,𝚫VI,𝚫VJ,𝚫MAX\n' + + '202,DropWhileArray,200,214,,,,,,,,,,,,1,,,,,,2,16,464' + )['DropWhileArray'] self.assertEqual( (r.num_samples, r.min, r.max, r.samples.count), # last 3 ventiles were outliers and were excluded from the sample - (200, 214, 215, 18), - ) + (200, 214, 215, 18)) def test_parse_meta(self): r = LogParser.results_from_string( - "#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)," - + "PAGES,ICS,YIELD\n" - + "0,B,1,2,2,2,0,2,7,29,15" - )["B"] + '#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),' + + 'PAGES,ICS,YIELD\n' + + '0,B,1,2,2,2,0,2,7,29,15')['B'] self.assertEqual( - (r.min, r.mem_pages, r.involuntary_cs, r.yield_count), (2, 7, 29, 15) - ) + (r.min, r.mem_pages, r.involuntary_cs, r.yield_count), + (2, 7, 29, 15)) r = LogParser.results_from_string( - "#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs)," - + "MAX_RSS(B),PAGES,ICS,YIELD\n" - + "0,B,1,3,3,3,0,3,36864,9,50,15" - )["B"] + '#,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),' + + 'MAX_RSS(B),PAGES,ICS,YIELD\n' + + '0,B,1,3,3,3,0,3,36864,9,50,15')['B'] self.assertEqual( (r.min, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss), - (3, 9, 50, 15, 36864), - ) + (3, 9, 50, 15, 36864)) r = LogParser.results_from_string( - "#,TEST,SAMPLES,MIN(μs),MAX(μs),PAGES,ICS,YIELD\n" + "0,B,1,4,4,8,31,15" - )["B"] - self.assertEqual( - (r.min, r.mem_pages, r.involuntary_cs, r.yield_count), (4, 8, 31, 15) - ) + '#,TEST,SAMPLES,MIN(μs),MAX(μs),PAGES,ICS,YIELD\n' + + '0,B,1,4,4,8,31,15')['B'] + self.assertEqual((r.min, r.mem_pages, r.involuntary_cs, r.yield_count), + (4, 8, 31, 15)) r = LogParser.results_from_string( - "#,TEST,SAMPLES,MIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD\n" - + "0,B,1,5,5,32768,8,28,15" - )["B"] + '#,TEST,SAMPLES,MIN(μs),MAX(μs),MAX_RSS(B),PAGES,ICS,YIELD\n' + + '0,B,1,5,5,32768,8,28,15')['B'] self.assertEqual( (r.min, r.mem_pages, r.involuntary_cs, r.yield_count, r.max_rss), - (5, 8, 28, 15, 32768), - ) + (5, 8, 28, 15, 32768)) def test_parse_results_verbose(self): """Parse multiple performance test results with 2 sample formats: @@ -655,31 +608,27 @@ def test_parse_results_verbose(self): Totals,2""" parser = LogParser() - results = parser.parse_results(verbose_log.split("\n")) + results = parser.parse_results(verbose_log.split('\n')) r = results[0] self.assertEqual( (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), - ("AngryPhonebook", 11467, 13898, 12392, 1315, 11812), + ('AngryPhonebook', 11467, 13898, 12392, 1315, 11812) ) self.assertEqual(r.num_samples, r.samples.num_samples) - self.assertEqual( - results[0].samples.all_samples, - [(0, 78, 11812), (1, 90, 13898), (2, 90, 11467)], - ) + self.assertEqual(results[0].samples.all_samples, + [(0, 78, 11812), (1, 90, 13898), (2, 90, 11467)]) self.assertEqual(r.yields, None) r = results[1] self.assertEqual( (r.name, r.min, r.max, int(r.mean), int(r.sd), r.median), - ("Array2D", 369900, 381039, 373994, 6127, 371043), + ('Array2D', 369900, 381039, 373994, 6127, 371043) ) self.assertEqual(r.setup, 14444) self.assertEqual(r.num_samples, r.samples.num_samples) - self.assertEqual( - results[1].samples.all_samples, - [(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)], - ) + 
self.assertEqual(results[1].samples.all_samples, + [(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)]) yielded = r.yields[0] self.assertEqual(yielded.before_sample, 1) self.assertEqual(yielded.after, 369918) @@ -693,7 +642,7 @@ def test_parse_environment_verbose(self): 2,AngryPhonebook,3,11269,11884,11657,338,11820 """ parser = LogParser() - results = parser.parse_results(verbose_log.split("\n")) + results = parser.parse_results(verbose_log.split('\n')) r = results[0] self.assertEqual(r.max_rss, 32768) @@ -706,8 +655,8 @@ def test_results_from_merge(self): concatenated_logs = """4,ArrayAppend,20,23641,29000,24990,0,24990 4,ArrayAppend,1,20000,20000,20000,0,20000""" results = LogParser.results_from_string(concatenated_logs) - self.assertEqual(results.keys(), ["ArrayAppend"]) - result = results["ArrayAppend"] + self.assertEqual(results.keys(), ['ArrayAppend']) + result = results['ArrayAppend'] self.assertTrue(isinstance(result, PerformanceTestResult)) self.assertEqual(result.min, 20000) self.assertEqual(result.max, 29000) @@ -728,8 +677,8 @@ def test_results_from_merge_verbose(self): Sample 3,364245 3,Array2D,4,363094,376131,368159,5931,369169""" results = LogParser.results_from_string(concatenated_logs) - self.assertEqual(results.keys(), ["Array2D"]) - result = results["Array2D"] + self.assertEqual(results.keys(), ['Array2D']) + result = results['Array2D'] self.assertTrue(isinstance(result, PerformanceTestResult)) self.assertEqual(result.min, 350815) self.assertEqual(result.max, 376131) @@ -766,7 +715,7 @@ def test_excludes_outliers_from_samples(self): 65,DropFirstAnySeqCntRangeLazy,10,184,455,228,79,206 """ parser = LogParser() - result = parser.parse_results(verbose_log.split("\n"))[0] + result = parser.parse_results(verbose_log.split('\n'))[0] self.assertEqual(result.num_samples, 10) self.assertEqual(result.samples.count, 8) self.assertEqual(len(result.samples.outliers), 2) @@ -778,26 +727,26 @@ def names(tests): return [t.name for t in tests] tc = TestComparator(self.old_results, self.new_results, 0.05) - self.assertEqual(names(tc.unchanged), ["AngryPhonebook", "Array2D"]) - self.assertEqual(names(tc.increased), ["ByteSwap", "ArrayAppend"]) - self.assertEqual(names(tc.decreased), ["BitCount"]) - self.assertEqual(names(tc.added), ["TwoSum"]) - self.assertEqual(names(tc.removed), ["AnyHashableWithAClass"]) + self.assertEqual(names(tc.unchanged), ['AngryPhonebook', 'Array2D']) + self.assertEqual(names(tc.increased), ['ByteSwap', 'ArrayAppend']) + self.assertEqual(names(tc.decreased), ['BitCount']) + self.assertEqual(names(tc.added), ['TwoSum']) + self.assertEqual(names(tc.removed), ['AnyHashableWithAClass']) # other way around tc = TestComparator(self.new_results, self.old_results, 0.05) - self.assertEqual(names(tc.unchanged), ["AngryPhonebook", "Array2D"]) - self.assertEqual(names(tc.increased), ["BitCount"]) - self.assertEqual(names(tc.decreased), ["ByteSwap", "ArrayAppend"]) - self.assertEqual(names(tc.added), ["AnyHashableWithAClass"]) - self.assertEqual(names(tc.removed), ["TwoSum"]) + self.assertEqual(names(tc.unchanged), ['AngryPhonebook', 'Array2D']) + self.assertEqual(names(tc.increased), ['BitCount']) + self.assertEqual(names(tc.decreased), ['ByteSwap', 'ArrayAppend']) + self.assertEqual(names(tc.added), ['AnyHashableWithAClass']) + self.assertEqual(names(tc.removed), ['TwoSum']) # delta_threshold determines the sorting into change groups; # report only change above 100% (ByteSwap's runtime went to 0): tc = TestComparator(self.old_results, self.new_results, 1) self.assertEqual( 
names(tc.unchanged), - ["AngryPhonebook", "Array2D", "ArrayAppend", "BitCount"], + ['AngryPhonebook', 'Array2D', 'ArrayAppend', 'BitCount'] ) - self.assertEqual(names(tc.increased), ["ByteSwap"]) + self.assertEqual(names(tc.increased), ['ByteSwap']) self.assertEqual(tc.decreased, []) @@ -821,58 +770,45 @@ def assert_html_contains(self, texts): def test_values(self): self.assertEqual( - ReportFormatter.values( - PerformanceTestResult( - "1,AngryPhonebook,20,10664,12933,11035,576,10884".split(",") - ) - ), - ("AngryPhonebook", "10664", "12933", "11035", "—"), + ReportFormatter.values(PerformanceTestResult( + '1,AngryPhonebook,20,10664,12933,11035,576,10884'.split(','))), + ('AngryPhonebook', '10664', '12933', '11035', '—') ) self.assertEqual( - ReportFormatter.values( - PerformanceTestResult( - "1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336".split(",") - ) - ), - ("AngryPhonebook", "12045", "12045", "12045", "10510336"), + ReportFormatter.values(PerformanceTestResult( + '1,AngryPhonebook,1,12045,12045,12045,0,12045,10510336' + .split(','))), + ('AngryPhonebook', '12045', '12045', '12045', '10510336') ) r1 = PerformanceTestResult( - "1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336".split(",") - ) + '1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336'.split(',')) r2 = PerformanceTestResult( - "1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144".split(",") - ) + '1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144'.split(',')) self.assertEqual( ReportFormatter.values(ResultComparison(r1, r2)), - ("AngryPhonebook", "12325", "11616", "-5.8%", "1.06x"), + ('AngryPhonebook', '12325', '11616', '-5.8%', '1.06x') ) self.assertEqual( ReportFormatter.values(ResultComparison(r2, r1)), - ("AngryPhonebook", "11616", "12325", "+6.1%", "0.94x"), + ('AngryPhonebook', '11616', '12325', '+6.1%', '0.94x') ) r2.max = r1.min + 1 self.assertEqual( ReportFormatter.values(ResultComparison(r1, r2))[4], - "1.06x (?)", # is_dubious + '1.06x (?)' # is_dubious ) def test_justified_columns(self): """Table columns are all formated with same width, defined by the longest value. 
""" - self.assert_markdown_contains( - [ - "AnyHashableWithAClass | 247027 | 319065 | 259056 | 10250445", - "Array2D | 335831 | 335831 | +0.0% | 1.00x", - ] - ) - self.assert_git_contains( - [ - "AnyHashableWithAClass 247027 319065 259056 10250445", - "Array2D 335831 335831 +0.0% 1.00x", - ] - ) + self.assert_markdown_contains([ + 'AnyHashableWithAClass | 247027 | 319065 | 259056 | 10250445', + 'Array2D | 335831 | 335831 | +0.0% | 1.00x']) + self.assert_git_contains([ + 'AnyHashableWithAClass 247027 319065 259056 10250445', + 'Array2D 335831 335831 +0.0% 1.00x']) def test_column_headers(self): """Report contains table headers for ResultComparisons and changed @@ -881,63 +817,49 @@ def test_column_headers(self): performance_test_result = self.tc.added[0] self.assertEqual( ReportFormatter.header_for(performance_test_result), - ("TEST", "MIN", "MAX", "MEAN", "MAX_RSS"), + ('TEST', 'MIN', 'MAX', 'MEAN', 'MAX_RSS') ) comparison_result = self.tc.increased[0] self.assertEqual( ReportFormatter.header_for(comparison_result), - ("TEST", "OLD", "NEW", "DELTA", "RATIO"), - ) - self.assert_markdown_contains( - [ - "TEST | OLD | NEW | DELTA | RATIO", - ":--- | ---: | ---: | ---: | ---: ", - "TEST | MIN | MAX | MEAN | MAX_RSS", - ] - ) - self.assert_git_contains( - [ - "TEST OLD NEW DELTA RATIO", - "TEST MIN MAX MEAN MAX_RSS", - ] - ) - self.assert_html_contains( - [ - """ + ('TEST', 'OLD', 'NEW', 'DELTA', 'RATIO') + ) + self.assert_markdown_contains([ + 'TEST | OLD | NEW | DELTA | RATIO', + ':--- | ---: | ---: | ---: | ---: ', + 'TEST | MIN | MAX | MEAN | MAX_RSS']) + self.assert_git_contains([ + 'TEST OLD NEW DELTA RATIO', + 'TEST MIN MAX MEAN MAX_RSS']) + self.assert_html_contains([ + """ OLD NEW DELTA RATIO""", - """ + """ MIN MAX MEAN - MAX_RSS""", - ] - ) + MAX_RSS"""]) def test_emphasize_speedup(self): """Emphasize speedup values for regressions and improvements""" # tests in No Changes don't have emphasized speedup - self.assert_markdown_contains( - [ - "BitCount | 3 | 9 | +199.9% | **0.33x**", - "ByteSwap | 4 | 0 | -100.0% | **4001.00x**", - "AngryPhonebook | 10458 | 10458 | +0.0% | 1.00x ", - "ArrayAppend | 23641 | 20000 | -15.4% | **1.18x (?)**", - ] - ) - self.assert_git_contains( - [ - "BitCount 3 9 +199.9% **0.33x**", - "ByteSwap 4 0 -100.0% **4001.00x**", - "AngryPhonebook 10458 10458 +0.0% 1.00x", - "ArrayAppend 23641 20000 -15.4% **1.18x (?)**", - ] - ) - self.assert_html_contains( - [ - """ + self.assert_markdown_contains([ + 'BitCount | 3 | 9 | +199.9% | **0.33x**', + 'ByteSwap | 4 | 0 | -100.0% | **4001.00x**', + 'AngryPhonebook | 10458 | 10458 | +0.0% | 1.00x ', + 'ArrayAppend | 23641 | 20000 | -15.4% | **1.18x (?)**' + ]) + self.assert_git_contains([ + 'BitCount 3 9 +199.9% **0.33x**', + 'ByteSwap 4 0 -100.0% **4001.00x**', + 'AngryPhonebook 10458 10458 +0.0% 1.00x', + 'ArrayAppend 23641 20000 -15.4% **1.18x (?)**' + ]) + self.assert_html_contains([ + """ BitCount 3 @@ -945,7 +867,7 @@ def test_emphasize_speedup(self): +199.9% 0.33x """, - """ + """ ByteSwap 4 @@ -953,221 +875,182 @@ def test_emphasize_speedup(self): -100.0% 4001.00x """, - """ + """ AngryPhonebook 10458 10458 +0.0% 1.00x - """, - ] - ) + """ + ]) def test_sections(self): """Report is divided into sections with summaries.""" - self.assert_markdown_contains( - [ - """
+ self.assert_markdown_contains([ + """
Regression (1)""", - """
+ """
Improvement (2)""", - """
+ """
No Changes (2)""", - """
+ """
Added (1)""", - """
- Removed (1)""", - ] - ) - self.assert_git_contains( - [ - "Regression (1): \n", - "Improvement (2): \n", - "No Changes (2): \n", - "Added (1): \n", - "Removed (1): \n", - ] - ) - self.assert_html_contains( - [ - "Regression (1)", - "Improvement (2)", - "No Changes (2)", - "Added (1)", - "Removed (1)", - ] - ) + """
+ Removed (1)"""]) + self.assert_git_contains([ + 'Regression (1): \n', + 'Improvement (2): \n', + 'No Changes (2): \n', + 'Added (1): \n', + 'Removed (1): \n']) + self.assert_html_contains([ + "Regression (1)", + "Improvement (2)", + "No Changes (2)", + "Added (1)", + "Removed (1)"]) def test_report_only_changes(self): """Leave out tests without significant change.""" rf = ReportFormatter(self.tc, changes_only=True) markdown, git, html = rf.markdown(), rf.git(), rf.html() - self.assertNotIn("No Changes", markdown) - self.assertNotIn("AngryPhonebook", markdown) - self.assertNotIn("No Changes", git) - self.assertNotIn("AngryPhonebook", git) - self.assertNotIn("No Changes", html) - self.assertNotIn("AngryPhonebook", html) + self.assertNotIn('No Changes', markdown) + self.assertNotIn('AngryPhonebook', markdown) + self.assertNotIn('No Changes', git) + self.assertNotIn('AngryPhonebook', git) + self.assertNotIn('No Changes', html) + self.assertNotIn('AngryPhonebook', html) def test_single_table_report(self): """Single table report has inline headers and no elaborate sections.""" self.tc.removed = [] # test handling empty section rf = ReportFormatter(self.tc, changes_only=True, single_table=True) markdown = rf.markdown() - self.assertNotIn("Regression (1)", - "TEST | OLD | NEW | DELTA | RATIO", - "BitCount | 3 | 9 | +199.9% | **0.33x**", + 'Regression (1)', + 'TEST | OLD | NEW | DELTA | RATIO', + 'BitCount | 3 | 9 | +199.9% | **0.33x**', ] git = [ - "Regression (1):", - "TEST OLD NEW DELTA RATIO", - "BitCount 3 9 +199.9% **0.33x**", + 'Regression (1):', + 'TEST OLD NEW DELTA RATIO', + 'BitCount 3 9 +199.9% **0.33x**', ] - html = ["", "BitCount"] + html = ['', "BitCount"] def setUp(self): super(Test_compare_perf_tests_main, self).setUp() - self.old_log = self.write_temp_file("old.log", self.old_log_content) - self.new_log = self.write_temp_file("new.log", self.new_log_content) + self.old_log = self.write_temp_file('old.log', self.old_log_content) + self.new_log = self.write_temp_file('new.log', self.new_log_content) def execute_main_with_format(self, report_format, test_output=False): - report_file = self.test_dir + "report.log" - args = [ - "compare_perf_tests.py", - "--old-file", - self.old_log, - "--new-file", - self.new_log, - "--format", - report_format, - ] - - sys.argv = args if not test_output else args + ["--output", report_file] + report_file = self.test_dir + 'report.log' + args = ['compare_perf_tests.py', + '--old-file', self.old_log, + '--new-file', self.new_log, + '--format', report_format] + + sys.argv = (args if not test_output else + args + ['--output', report_file]) with captured_output() as (out, _): main() report_out = out.getvalue() if test_output: - with open(report_file, "r") as f: + with open(report_file, 'r') as f: report = f.read() # because print adds newline, add one here, too: - report_file = str(report + "\n") + report_file = str(report + '\n') else: report_file = None @@ -1175,41 +1058,40 @@ def execute_main_with_format(self, report_format, test_output=False): def test_markdown(self): """Writes Markdown formatted report to stdout""" - report_out, _ = self.execute_main_with_format("markdown") + report_out, _ = self.execute_main_with_format('markdown') self.assert_report_contains(self.markdown, report_out) def test_markdown_output(self): """Writes Markdown formatted report to stdout and `--output` file.""" - report_out, report_file = self.execute_main_with_format( - "markdown", test_output=True - ) + report_out, report_file = ( + 
self.execute_main_with_format('markdown', test_output=True)) self.assertEqual(report_out, report_file) self.assert_report_contains(self.markdown, report_file) def test_git(self): """Writes Git formatted report to stdout.""" - report_out, _ = self.execute_main_with_format("git") + report_out, _ = self.execute_main_with_format('git') self.assert_report_contains(self.git, report_out) def test_git_output(self): """Writes Git formatted report to stdout and `--output` file.""" - report_out, report_file = self.execute_main_with_format("git", test_output=True) + report_out, report_file = ( + self.execute_main_with_format('git', test_output=True)) self.assertEqual(report_out, report_file) self.assert_report_contains(self.git, report_file) def test_html(self): """Writes HTML formatted report to stdout.""" - report_out, _ = self.execute_main_with_format("html") + report_out, _ = self.execute_main_with_format('html') self.assert_report_contains(self.html, report_out) def test_html_output(self): """Writes HTML formatted report to stdout and `--output` file.""" - report_out, report_file = self.execute_main_with_format( - "html", test_output=True - ) + report_out, report_file = ( + self.execute_main_with_format('html', test_output=True)) self.assertEqual(report_out, report_file) self.assert_report_contains(self.html, report_file) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/benchmark/scripts/test_utils.py b/benchmark/scripts/test_utils.py index 4b675d9d82582..6a2bf8856a99f 100644 --- a/benchmark/scripts/test_utils.py +++ b/benchmark/scripts/test_utils.py @@ -78,15 +78,14 @@ def expect(self, call_args, response): def assert_called_with(self, expected_args): """Verify that the tested method was called with provided arguments.""" expected_args = tuple(expected_args) - assert expected_args in self.calls, "Expected: {0} in Called: {1}".format( - expected_args, self.calls - ) + assert expected_args in self.calls, ( + 'Expected: {0} in Called: {1}'.format(expected_args, self.calls)) def assert_called_all_expected(self): """Verify that all expeced invocations of tested method were called.""" - assert self.calls == self.expected, "\nExpected: {0}, \n Called: {1}".format( - self.expected, self.calls - ) + assert self.calls == self.expected, ( + '\nExpected: {0}, \n Called: {1}'.format( + self.expected, self.calls)) class MockLoggingHandler(logging.Handler): @@ -104,9 +103,5 @@ def emit(self, record): def reset(self): """Clear all log messages.""" self.messages = { - "debug": [], - "info": [], - "warning": [], - "error": [], - "critical": [], + 'debug': [], 'info': [], 'warning': [], 'error': [], 'critical': [] } diff --git a/benchmark/utils/convertToJSON.py b/benchmark/utils/convertToJSON.py index b7a547116d7ba..54aedc7270068 100644 --- a/benchmark/utils/convertToJSON.py +++ b/benchmark/utils/convertToJSON.py @@ -74,9 +74,9 @@ if __name__ == "__main__": data = {} - data["Tests"] = [] - data["Machine"] = {} - data["Run"] = {} + data['Tests'] = [] + data['Machine'] = {} + data['Run'] = {} for line in sys.stdin: m = SCORERE.match(line) if not m: @@ -84,8 +84,8 @@ if not m: continue test = {} - test["Data"] = [int(m.group(VALGROUP))] - test["Info"] = {} - test["Name"] = [m.group(KEYGROUP)] - data["Tests"].append(test) + test['Data'] = [int(m.group(VALGROUP))] + test['Info'] = {} + test['Name'] = [m.group(KEYGROUP)] + data['Tests'].append(test) print(json.dumps(data, sort_keys=True, indent=4)) From ac294f39867559ea8c1373d5451e01595d907a83 Mon Sep 17 00:00:00 2001 From: Pavol 
Vaskovic Date: Mon, 29 Jul 2019 14:04:25 +0200 Subject: [PATCH 02/26] [benchmark] Fix parsing delta zeroed metadata --- benchmark/scripts/compare_perf_tests.py | 5 +++-- benchmark/scripts/test_compare_perf_tests.py | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 017ba24c10229..4efc16804f5da 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -276,8 +276,9 @@ def __init__(self, csv_row, quantiles=False, memory=False, delta=False, # Optional measurement metadata. The number of: # memory pages used, involuntary context switches and voluntary yields - self.mem_pages, self.involuntary_cs, self.yield_count = \ - [int(x) for x in csv_row[-3:]] if meta else (None, None, None) + self.mem_pages, self.involuntary_cs, self.yield_count = ( + [int(x) if x else 0 for x in csv_row[-3:]] if meta else + (None, None, None)) self.yields = None self.setup = None diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 4c1c6effffcd5..5b66b7a762c12 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -347,6 +347,11 @@ def test_init_meta(self): self.assertEquals(r.max_rss, 32768) self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 28, 15)) + log = '1,Ackermann,2,715,,16,9,,' # --delta erased 0s + r = PerformanceTestResult( + log.split(','), quantiles=True, meta=True) + self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), + (9, 0, 0)) def test_repr(self): log_line = '1,AngryPhonebook,20,10664,12933,11035,576,10884' From 9f7e78288fed0bf9565fd86a7785b04b9759b512 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Sat, 3 Aug 2019 20:48:30 +0200 Subject: [PATCH 03/26] [benchmark] PerformanceTestResults merge samples Adjusted how merged PerformanceTestResults track the number of underlying samples when using quantile subsampling. 
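A rough sketch of the intended accounting, mirroring the new test case added
below (the two CSV rows are the 20-quantile, delta-encoded reports used there):

    from compare_perf_tests import PerformanceTestResult

    # Each row summarizes 200 samples but carries only the 21 ventile subsamples.
    rows = ['684,B,200,967,,14,5,3,3,2,1,1,,,,,1,,3,3,5,11,76,1827',
            '684,B,200,972,,,,,,,,2,2,3,1,,,3,6,21,30,146,694,4590']
    a, b = [PerformanceTestResult(r.split(','), quantiles=True, delta=True)
            for r in rows]
    a.merge(b)
    a.num_samples    # 400: sums the reported sample counts
    a.samples.count  # 35: only the retained ventiles, after excluding outliers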
--- benchmark/scripts/compare_perf_tests.py | 9 +++---- benchmark/scripts/test_compare_perf_tests.py | 27 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 4efc16804f5da..67e8e7af09016 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -302,7 +302,8 @@ def merge(self, r): if self.samples and r.samples: map(self.samples.add, r.samples.samples) sams = self.samples - self.num_samples = sams.num_samples + self.num_samples += r.num_samples + sams.outliers += r.samples.outliers self.min, self.max, self.median, self.mean, self.sd = \ sams.min, sams.max, sams.median, sams.mean, sams.sd else: @@ -517,10 +518,8 @@ def __init__(self, old_results, new_results, delta_threshold): self.removed = sorted([old_results[t] for t in removed_tests], key=lambda r: r.name) - def compare(name): - return ResultComparison(old_results[name], new_results[name]) - - comparisons = map(compare, comparable_tests) + comparisons = [ResultComparison(old_results[name], new_results[name]) + for name in comparable_tests] def partition(l, p): return reduce(lambda x, y: x[not p(y)].append(y) or x, l, ([], [])) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 5b66b7a762c12..8683340edf0b5 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -390,6 +390,33 @@ def as_tuple(r): self.assertEqual(as_tuple(r), # picks smaller of the setup values (4, 11616, 12325, 12064, None, None, 10498048, 7)) + def test_merge_with_samples(self): + # --quantile=20 --delta + log = """ +684,B,200,967,,14,5,3,3,2,1,1,,,,,1,,3,3,5,11,76,1827 +684,B,200,972,,,,,,,,2,2,3,1,,,3,6,21,30,146,694,4590 +684,B,200,986,,,1,1,,1,,,,1,,2,2,9,5,6,15,28,224,2902 +""".split('\n')[1:-1] + results = [ + PerformanceTestResult(line.split(','), quantiles=True, delta=True) + for line in log] + self.assertEqual([r.num_samples for r in results], [200, 200, 200]) + self.assertEqual( + [r.samples.num_samples for r in results], [21, 21, 21]) + # after excluding outliers, the real sample count is lower + self.assertEqual([r.samples.count for r in results], [18, 17, 18]) + + def as_tuple(r): + return (r.num_samples, r.samples.num_samples, r.samples.count, + r.min, r.samples.median, r.max) + + r = results[0] + self.assertEqual(as_tuple(r), (200, 21, 18, 967, 996, 1008)) + r.merge(results[1]) # 18 + 17 = 35, after merge using only ventiles + self.assertEqual(as_tuple(r), (400, 42, 35, 967, 983, 1010)) + r.merge(results[2]) # 35 + 18 = 53 + self.assertEqual(as_tuple(r), (600, 63, 53, 967, 989, 1029)) + class TestResultComparison(unittest.TestCase): def setUp(self): From e3a639a8ffc50598a65c5a667061c277a5fa112d Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Mon, 16 Sep 2019 08:34:01 +0200 Subject: [PATCH 04/26] =?UTF-8?q?[benchmark]=20Don=E2=80=99t=20justify=20l?= =?UTF-8?q?ast=20column=20in=20reports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In text reports, don’t justify the last columns with unnecessary spaces. 
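The gist of the formatting change, as a standalone sketch (widths are passed
explicitly here and the values are illustrative; the real formatter computes
the widths and captures them in a closure):

    def justify_columns(contents, widths):
        # Pad every column to its width except the last one, so report rows
        # no longer end in trailing spaces.
        return ([c.ljust(w) for w, c in zip(widths, contents[:-1])]
                + [contents[-1]])

    justify_columns(['BitCount', '3', '9', '+199.9%', '**0.33x**'], [14, 5, 5, 7])
    # -> ['BitCount      ', '3    ', '9    ', '+199.9%', '**0.33x**']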
--- benchmark/scripts/compare_perf_tests.py | 7 ++++--- benchmark/scripts/test_compare_perf_tests.py | 22 ++++++++++---------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 67e8e7af09016..e18ef0e39e2ba 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -617,7 +617,7 @@ def _column_widths(self): results += self.comparator.added + self.comparator.removed widths = [ - map(len, columns) for columns in + map(len, row[:-1]) for row in [ReportFormatter.PERFORMANCE_TEST_RESULT_HEADER, ReportFormatter.RESULT_COMPARISON_HEADER] + [ReportFormatter.values(r) for r in results] @@ -626,7 +626,7 @@ def _column_widths(self): def max_widths(maximum, widths): return map(max, zip(maximum, widths)) - return reduce(max_widths, widths, [0] * 5) + return reduce(max_widths, widths, [0] * 4) def _formatted_text(self, label_formatter, COLUMN_SEPARATOR, DELIMITER_ROW, SEPARATOR, SECTION): @@ -634,7 +634,8 @@ def _formatted_text(self, label_formatter, COLUMN_SEPARATOR, self.header_printed = False def justify_columns(contents): - return [c.ljust(w) for w, c in zip(widths, contents)] + return ([c.ljust(w) for w, c in zip(widths, contents[:-1])] + + [contents[-1]]) def row(contents): return ('' if not contents else diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 8683340edf0b5..5cc97bb1b4c47 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -833,14 +833,14 @@ def test_values(self): def test_justified_columns(self): """Table columns are all formated with same width, defined by the - longest value. + longest value, except the last column. 
""" self.assert_markdown_contains([ - 'AnyHashableWithAClass | 247027 | 319065 | 259056 | 10250445', - 'Array2D | 335831 | 335831 | +0.0% | 1.00x']) + 'AnyHashableWithAClass | 247027 | 319065 | 259056 | 10250445\n', + 'Array2D | 335831 | 335831 | +0.0% | 1.00x\n']) self.assert_git_contains([ - 'AnyHashableWithAClass 247027 319065 259056 10250445', - 'Array2D 335831 335831 +0.0% 1.00x']) + 'AnyHashableWithAClass 247027 319065 259056 10250445\n', + 'Array2D 335831 335831 +0.0% 1.00x\n']) def test_column_headers(self): """Report contains table headers for ResultComparisons and changed @@ -857,9 +857,9 @@ def test_column_headers(self): ('TEST', 'OLD', 'NEW', 'DELTA', 'RATIO') ) self.assert_markdown_contains([ - 'TEST | OLD | NEW | DELTA | RATIO', - ':--- | ---: | ---: | ---: | ---: ', - 'TEST | MIN | MAX | MEAN | MAX_RSS']) + 'TEST | OLD | NEW | DELTA | RATIO\n' + ':--- | ---: | ---: | ---: | ---:\n', + 'TEST | MIN | MAX | MEAN | MAX_RSS\n']) self.assert_git_contains([ 'TEST OLD NEW DELTA RATIO', 'TEST MIN MAX MEAN MAX_RSS']) @@ -881,7 +881,7 @@ def test_emphasize_speedup(self): self.assert_markdown_contains([ 'BitCount | 3 | 9 | +199.9% | **0.33x**', 'ByteSwap | 4 | 0 | -100.0% | **4001.00x**', - 'AngryPhonebook | 10458 | 10458 | +0.0% | 1.00x ', + 'AngryPhonebook | 10458 | 10458 | +0.0% | 1.00x', 'ArrayAppend | 23641 | 20000 | -15.4% | **1.18x (?)**' ]) self.assert_git_contains([ @@ -978,8 +978,8 @@ def test_single_table_report(self): self.assertNotIn('): \n', git) # no sections self.assertNotIn('REMOVED', git) self.assert_report_contains([ - '\nREGRESSION ', ' OLD ', ' NEW ', ' DELTA ', ' RATIO ', - '\n\nADDED ', ' MIN ', ' MAX ', ' MEAN ', ' MAX_RSS ' + '\nREGRESSION ', ' OLD ', ' NEW ', ' DELTA ', ' RATIO', + '\n\nADDED ', ' MIN ', ' MAX ', ' MEAN ', ' MAX_RSS' ], git) # Separator before every inline header (new section): self.assertEqual(git.count('\n\n'), 2) From 979aced53e286edcbd414388ab961b27304203f2 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Tue, 30 Jul 2019 12:01:51 +0200 Subject: [PATCH 05/26] [benchmark] BenchmarkDriver min-samples & metadata Support for invoking benchmark drivers with min-samples and gathering environmental metadata. 
--- benchmark/scripts/Benchmark_Driver | 28 +++++++-------- benchmark/scripts/test_Benchmark_Driver.py | 42 +++++++++++++++------- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index 31808852bcf22..8588cc292e2cc 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -137,37 +137,37 @@ class BenchmarkDriver(object): def run(self, test=None, num_samples=None, num_iters=None, sample_time=None, verbose=None, measure_memory=False, - quantile=None): + quantile=None, min_samples=None, gather_metadata=False): """Execute benchmark and gather results.""" - num_samples = num_samples or 0 - num_iters = num_iters or 0 # automatically determine N to run for 1s - sample_time = sample_time or 0 # default is 1s - cmd = self._cmd_run( - test, num_samples, num_iters, sample_time, - verbose, measure_memory, quantile) + test, num_samples, num_iters, sample_time, min_samples, + verbose, measure_memory, quantile, gather_metadata) output = self._invoke(cmd) results = self.parser.results_from_string(output) return results.items()[0][1] if test else results - def _cmd_run(self, test, num_samples, num_iters, sample_time, - verbose, measure_memory, quantile): + def _cmd_run(self, test, num_samples, num_iters, sample_time, min_samples, + verbose, measure_memory, quantile, gather_metadata): cmd = [self.test_harness] if test: cmd.append(test) else: cmd.extend([self.test_number.get(name, name) for name in self.tests]) - if num_samples > 0: + if num_samples: cmd.append('--num-samples={0}'.format(num_samples)) - if num_iters > 0: + if min_samples: + cmd.append('--min-samples={0}'.format(min_samples)) + if num_iters: cmd.append('--num-iters={0}'.format(num_iters)) - if sample_time > 0: + if sample_time: cmd.append('--sample-time={0}'.format(sample_time)) if verbose: cmd.append('--verbose') if measure_memory: cmd.append('--memory') + if gather_metadata: + cmd.append('--meta') if quantile: cmd.append('--quantile={0}'.format(quantile)) cmd.append('--delta') @@ -183,8 +183,8 @@ class BenchmarkDriver(object): return a return reduce(merge_results, - [self.run(test, measure_memory=True, - num_iters=1, quantile=20) + [self.run(test, num_iters=1, quantile=20, + measure_memory=True, gather_metadata=True) for _ in range(self.args.independent_samples)]) def log_results(self, output, log_file=None): diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index 32b1a9e527635..af1fd61ef5018 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -277,6 +277,11 @@ def test_run_benchmark_with_multiple_samples(self): self.subprocess_mock.assert_called_with( ('/benchmarks/Benchmark_O', 'b2', '--num-samples=5')) + def test_run_benchmark_with_minimum_samples(self): + self.driver.run('b', min_samples=7) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--min-samples=7')) + def test_run_benchmark_with_specified_number_of_iterations(self): self.driver.run('b', num_iters=1) self.subprocess_mock.assert_called_with( @@ -321,6 +326,11 @@ def test_measure_memory(self): self.subprocess_mock.assert_called_with( ('/benchmarks/Benchmark_O', 'b', '--memory')) + def test_gather_metadata(self): + self.driver.run('b', gather_metadata=True) + self.subprocess_mock.assert_called_with( + ('/benchmarks/Benchmark_O', 'b', '--meta')) + def test_report_quantiles(self): """Use delta compression for quantile reports.""" 
self.driver.run('b', quantile=4) @@ -333,7 +343,7 @@ def test_run_benchmark_independent_samples(self): r = self.driver.run_independent_samples('b1') self.assertEqual(self.subprocess_mock.calls.count( ('/benchmarks/Benchmark_O', 'b1', '--num-iters=1', '--memory', - '--quantile=20', '--delta')), 3) + '--meta', '--quantile=20', '--delta')), 3) self.assertEqual(r.num_samples, 3) # results are merged def test_run_and_log(self): @@ -411,21 +421,26 @@ def test_deterministing_hashing(self): class BenchmarkDriverMock(Mock): - """Mock for BenchmarkDriver's `run` method""" + """Mock for BenchmarkDriver's `run` method.""" + def __init__(self, tests, responses=None): super(BenchmarkDriverMock, self).__init__(responses) self.tests = tests self.args = ArgsStub() - def _run(test, num_samples=None, num_iters=None, - verbose=None, measure_memory=False): - return self.record_and_respond(test, num_samples, num_iters, - verbose, measure_memory) + def _run(test=None, num_samples=None, num_iters=None, + sample_time=None, verbose=None, measure_memory=False, + quantile=None, min_samples=None, gather_metadata=False): + return self._record_and_respond( + test, num_samples, num_iters, sample_time, min_samples, + verbose, measure_memory, quantile, gather_metadata) self.run = _run - def record_and_respond(self, test, num_samples, num_iters, - verbose, measure_memory): - args = (test, num_samples, num_iters, verbose, measure_memory) + def _record_and_respond( + self, test, num_samples, num_iters, sample_time, min_samples, + verbose, measure_memory, quantile, gather_metadata): + args = (test, num_samples, num_iters, sample_time, min_samples, + verbose, measure_memory, quantile, gather_metadata) self.calls.append(args) return self.respond.get(args, _PTR(min=700)) @@ -520,11 +535,12 @@ def _PTR(min=700, mem_pages=1000, setup=None): return Stub(samples=Stub(min=min), mem_pages=mem_pages, setup=setup) -def _run(test, num_samples=None, num_iters=None, verbose=None, - measure_memory=False): +def _run(test=None, num_samples=None, num_iters=None, + sample_time=None, verbose=None, measure_memory=False, + quantile=None, min_samples=None, gather_metadata=False): """Helper function that constructs tuple with arguments for run method.""" - return ( - test, num_samples, num_iters, verbose, measure_memory) + return (test, num_samples, num_iters, sample_time, min_samples, + verbose, measure_memory, quantile, gather_metadata) class TestBenchmarkDoctor(unittest.TestCase): From 6ee22de3fab6f3ea5f8089c5d5890b60546ecba9 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Wed, 13 Nov 2019 09:23:23 +0100 Subject: [PATCH 06/26] [benchmark] [Gardening] Fix assertEqual naming --- benchmark/scripts/test_Benchmark_Driver.py | 8 +-- benchmark/scripts/test_compare_perf_tests.py | 62 ++++++++++---------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index af1fd61ef5018..552dec85481cd 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -188,8 +188,8 @@ def test_gets_list_of_precommit_benchmarks(self): ['Benchmark1', 'Benchmark2']) self.assertEqual(driver.all_tests, ['Benchmark1', 'Benchmark2']) - self.assertEquals(driver.test_number['Benchmark1'], "1") - self.assertEquals(driver.test_number['Benchmark2'], "2") + self.assertEqual(driver.test_number['Benchmark1'], "1") + self.assertEqual(driver.test_number['Benchmark2'], "2") list_all_tests = ( '/benchmarks/Benchmark_O --list --delim=\t 
--skip-tags='.split(' '), @@ -316,10 +316,10 @@ def test_parse_results_from_running_benchmarks(self): """ r = self.driver.run('b') self.assertTrue(self.parser_stub.results_from_string_called) - self.assertEquals(r.name, 'b1') # non-matching name, just 1st result + self.assertEqual(r.name, 'b1') # non-matching name, just 1st result r = self.driver.run() self.assertTrue(isinstance(r, dict)) - self.assertEquals(r['b1'].name, 'b1') + self.assertEqual(r['b1'].name, 'b1') def test_measure_memory(self): self.driver.run('b', measure_memory=True) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 5cc97bb1b4c47..3dc087a5944d5 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -105,31 +105,31 @@ def test_computes_inter_quartile_range(self): self.samples.add(Sample(5, 1, 1100)) self.assertEqual(self.samples.iqr, 50) - def assertEqualtats(self, stats, expected_stats): + def assertEqualStats(self, stats, expected_stats): for actual, expected in zip(stats, expected_stats): - self.assertAlmostEquals(actual, expected, places=2) + self.assertAlmostEqual(actual, expected, places=2) def test_computes_mean_sd_cv(self): ss = self.samples - self.assertEqualtats( + self.assertEqualStats( (ss.mean, ss.sd, ss.cv), (1000.0, 0.0, 0.0)) self.samples.add(Sample(2, 1, 1100)) - self.assertEqualtats( + self.assertEqualStats( (ss.mean, ss.sd, ss.cv), (1050.0, 70.71, 6.7 / 100)) def test_computes_range_spread(self): ss = self.samples - self.assertEqualtats( + self.assertEqualStats( (ss.range, ss.spread), (0, 0)) self.samples.add(Sample(2, 1, 1100)) - self.assertEqualtats( + self.assertEqualStats( (ss.range, ss.spread), (100, 10.0 / 100)) def test_init_with_samples(self): self.samples = PerformanceTestSamples( 'B2', [Sample(0, 1, 1000), Sample(1, 1, 1100)]) self.assertEqual(self.samples.count, 2) - self.assertEqualtats( + self.assertEqualStats( (self.samples.mean, self.samples.sd, self.samples.range, self.samples.spread), (1050.0, 70.71, 100, 9.52 / 100)) @@ -138,7 +138,7 @@ def test_can_handle_zero_runtime(self): # guard against dividing by 0 self.samples = PerformanceTestSamples('Zero') self.samples.add(Sample(0, 1, 0)) - self.assertEqualtats( + self.assertEqualStats( (self.samples.mean, self.samples.sd, self.samples.cv, self.samples.range, self.samples.spread), (0, 0, 0.0, 0, 0.0)) @@ -150,7 +150,7 @@ def test_excludes_outliers(self): '10 1 1050, 11 1 949, 12 1 1151'.split(',')] self.samples = PerformanceTestSamples('Outliers', ss) self.assertEqual(self.samples.count, 13) - self.assertEqualtats( + self.assertEqualStats( (self.samples.mean, self.samples.sd), (1050, 52.36)) self.samples.exclude_outliers() @@ -159,7 +159,7 @@ def test_excludes_outliers(self): self.assertEqual(self.samples.outliers, ss[11:]) self.assertEqualFiveNumberSummary( self.samples, (1000, 1025, 1050, 1075, 1100)) - self.assertEqualtats( + self.assertEqualStats( (self.samples.mean, self.samples.sd), (1050, 35.36)) def test_excludes_outliers_zero_IQR(self): @@ -173,7 +173,7 @@ def test_excludes_outliers_zero_IQR(self): self.samples.exclude_outliers() self.assertEqual(self.samples.count, 3) - self.assertEqualtats( + self.assertEqualStats( (self.samples.min, self.samples.max), (18, 18)) def test_excludes_outliers_top_only(self): @@ -186,7 +186,7 @@ def test_excludes_outliers_top_only(self): self.samples.exclude_outliers(top_only=True) self.assertEqual(self.samples.count, 4) - self.assertEqualtats((self.samples.min, self.samples.max), 
(1, 2)) + self.assertEqualStats((self.samples.min, self.samples.max), (1, 2)) class TestPerformanceTestResult(unittest.TestCase): @@ -212,8 +212,8 @@ def test_init_quantiles(self): self.assertEqual(r.name, 'Ackermann') self.assertEqual((r.num_samples, r.min, r.median, r.max), (3, 54383, 54512, 54601)) - self.assertAlmostEquals(r.mean, 54498.67, places=2) - self.assertAlmostEquals(r.sd, 109.61, places=2) + self.assertAlmostEqual(r.mean, 54498.67, places=2) + self.assertAlmostEqual(r.sd, 109.61, places=2) self.assertEqual(r.samples.count, 3) self.assertEqual(r.samples.num_samples, 3) self.assertEqual([s.runtime for s in r.samples.all_samples], @@ -344,7 +344,7 @@ def test_init_meta(self): self.assertEqual((r.num_samples, r.min, r.max), (200, 715, 1259)) self.assertEqual((r.samples.count, r.samples.min, r.samples.max), (2, 715, 1259)) - self.assertEquals(r.max_rss, 32768) + self.assertEqual(r.max_rss, 32768) self.assertEqual((r.mem_pages, r.involuntary_cs, r.yield_count), (8, 28, 15)) log = '1,Ackermann,2,715,,16,9,,' # --delta erased 0s @@ -432,21 +432,21 @@ def setUp(self): def test_init(self): rc = ResultComparison(self.r1, self.r2) self.assertEqual(rc.name, 'AngryPhonebook') - self.assertAlmostEquals(rc.ratio, 12325.0 / 11616.0) - self.assertAlmostEquals(rc.delta, (((11616.0 / 12325.0) - 1) * 100), - places=3) + self.assertAlmostEqual(rc.ratio, 12325.0 / 11616.0) + self.assertAlmostEqual(rc.delta, (((11616.0 / 12325.0) - 1) * 100), + places=3) # handle test results that sometimes change to zero, when compiler # optimizes out the body of the incorrectly written test rc = ResultComparison(self.r0, self.r0) self.assertEqual(rc.name, 'GlobalClass') - self.assertAlmostEquals(rc.ratio, 1) - self.assertAlmostEquals(rc.delta, 0, places=3) + self.assertAlmostEqual(rc.ratio, 1) + self.assertAlmostEqual(rc.delta, 0, places=3) rc = ResultComparison(self.r0, self.r01) - self.assertAlmostEquals(rc.ratio, 0, places=3) - self.assertAlmostEquals(rc.delta, 2000000, places=3) + self.assertAlmostEqual(rc.ratio, 0, places=3) + self.assertAlmostEqual(rc.delta, 2000000, places=3) rc = ResultComparison(self.r01, self.r0) - self.assertAlmostEquals(rc.ratio, 20001) - self.assertAlmostEquals(rc.delta, -99.995, places=3) + self.assertAlmostEqual(rc.ratio, 20001) + self.assertAlmostEqual(rc.delta, -99.995, places=3) # disallow comparison of different test results self.assertRaises( AssertionError, @@ -526,11 +526,11 @@ def test_parse_results_csv(self): parser = LogParser() results = parser.parse_results(log.splitlines()) self.assertTrue(isinstance(results[0], PerformanceTestResult)) - self.assertEquals(results[0].name, 'Array.append.Array.Int?') - self.assertEquals(results[1].name, - 'Bridging.NSArray.as!.Array.NSString') - self.assertEquals(results[2].name, - 'Flatten.Array.Tuple4.lazy.for-in.Reserve') + self.assertEqual(results[0].name, 'Array.append.Array.Int?') + self.assertEqual(results[1].name, + 'Bridging.NSArray.as!.Array.NSString') + self.assertEqual(results[2].name, + 'Flatten.Array.Tuple4.lazy.for-in.Reserve') def test_parse_results_tab_delimited(self): log = '34\tBitCount\t20\t3\t4\t4\t0\t4' @@ -715,8 +715,8 @@ def test_results_from_merge_verbose(self): self.assertEqual(result.min, 350815) self.assertEqual(result.max, 376131) self.assertEqual(result.median, 358817) - self.assertAlmostEquals(result.sd, 8443.37, places=2) - self.assertAlmostEquals(result.mean, 361463.25, places=2) + self.assertAlmostEqual(result.sd, 8443.37, places=2) + self.assertAlmostEqual(result.mean, 361463.25, places=2) 
self.assertEqual(result.num_samples, 8) samples = result.samples self.assertTrue(isinstance(samples, PerformanceTestSamples)) From ed5940edc4ab5054e6896aaf3bb8fbf2039dfcb8 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Wed, 19 Feb 2020 20:56:34 +0100 Subject: [PATCH 07/26] [benchmark] Report ventiles for dubious results For dubious result comparisons, print out empirical sample distribution (ventiles) to enable humans to reach informed decisions about these performance changes. --- benchmark/scripts/compare_perf_tests.py | 60 +++++++++++++++++--- benchmark/scripts/test_compare_perf_tests.py | 46 ++++++++++++--- 2 files changed, 88 insertions(+), 18 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index e18ef0e39e2ba..d4c21682724cd 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -568,10 +568,12 @@ def header_for(result): ReportFormatter.RESULT_COMPARISON_HEADER) @staticmethod - def values(result): + def values(result, dubious_formatter=lambda r: ' (?)'): """Format values from PerformanceTestResult or ResultComparison. - Returns tuple of strings to display in the results table. + Returns tuple of strings to display in the results table. Uses the + supplied `dubious_formatter` to display the empirical sample + distribution of the dubious result comparison. """ return ( (result.name, @@ -582,14 +584,22 @@ def values(result): (result.name, str(result.old.min), str(result.new.min), '{0:+.1f}%'.format(result.delta), - '{0:.2f}x{1}'.format(result.ratio, - ' (?)' if result.is_dubious else '')) + '{0:.2f}x{1}'.format( + result.ratio, + dubious_formatter(result) if result.is_dubious else '')) ) def markdown(self): """Report results of benchmark comparisons in Markdown format.""" return self._formatted_text( label_formatter=lambda s: ('**' + s + '**'), + ventile_formatter=lambda r: ReportFormatter.ventiles( + r, + START='O: ', + MIDDLE='
N: ', + END='', + OLD_QUARTILE='{0}', + NEW_QUARTILE='{0}'), COLUMN_SEPARATOR=' | ', DELIMITER_ROW=([':---'] + ['---:'] * 4), SEPARATOR='  | | | | \n', @@ -604,12 +614,39 @@ def git(self): """Report results of benchmark comparisons in 'git' format.""" return self._formatted_text( label_formatter=lambda s: s.upper(), + ventile_formatter=lambda r: ReportFormatter.ventiles( + r, + START='\n O: ', + MIDDLE='\n N: ', + END='', + OLD_QUARTILE=' {0} ', + NEW_QUARTILE=' {0} '), COLUMN_SEPARATOR=' ', DELIMITER_ROW=None, SEPARATOR='\n', SECTION=""" {0} ({1}): \n{2}""") + @staticmethod + def ventiles(result, START, MIDDLE, END, OLD_QUARTILE, NEW_QUARTILE): + v = ' (?)' + if not (result.old.samples and result.new.samples): + return v + + def ventiles(samples, QUARTILE): + vs = [str(samples.quantile(ventile)) for ventile in + [v / 100.0 for v in range(5, 100, 5)]] + for i in [4, 9, 14]: + vs[i] = QUARTILE.format(vs[i]) + return ' '.join(vs) + + v += START + v += ventiles(result.old.samples, OLD_QUARTILE) + v += MIDDLE + v += ventiles(result.new.samples, NEW_QUARTILE) + v += END + return v + def _column_widths(self): changed = self.comparator.decreased + self.comparator.increased results = (changed if self.changes_only else @@ -628,8 +665,8 @@ def max_widths(maximum, widths): return reduce(max_widths, widths, [0] * 4) - def _formatted_text(self, label_formatter, COLUMN_SEPARATOR, - DELIMITER_ROW, SEPARATOR, SECTION): + def _formatted_text(self, label_formatter, ventile_formatter, + COLUMN_SEPARATOR, DELIMITER_ROW, SEPARATOR, SECTION): widths = self._column_widths() self.header_printed = False @@ -651,15 +688,20 @@ def header(title, column_labels): self.header_printed = True return h + def bold_first(value): + first, sep, rest = value.partition(' ') + return '**' + first + '**' + sep + rest + def format_columns(r, is_strong): return (r if not is_strong else - r[:-1] + ('**' + r[-1] + '**', )) + r[:-1] + (bold_first(r[-1]), )) def table(title, results, is_strong=False, is_open=False): if not results: return '' - rows = [row(format_columns(ReportFormatter.values(r), is_strong)) - for r in results] + rows = [row(format_columns( + ReportFormatter.values(r, ventile_formatter), is_strong)) + for r in results] table = (header(title if self.single_table else '', ReportFormatter.header_for(results[0])) + ''.join(rows)) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 3dc087a5944d5..f0ae78b4c1416 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -507,6 +507,13 @@ class OldAndNewLog(unittest.TestCase): for line in new_log_content.splitlines()])]) + old_results['D'] = PerformanceTestResult( + '184,D,200,648,4,1,5,9,5,3,45,40,3,1,,,,1,1,,4,4,4,268'.split(','), + quantiles=True, delta=True) + new_results['D'] = PerformanceTestResult( + '184,D,200,746,1,3,1,3,2,1,,2,3,1,,1,1,3,1,3,7,9,61,1792'.split(','), + quantiles=True, delta=True) + def assert_report_contains(self, texts, report): assert not isinstance(texts, str) for text in texts: @@ -761,13 +768,13 @@ def names(tests): tc = TestComparator(self.old_results, self.new_results, 0.05) self.assertEqual(names(tc.unchanged), ['AngryPhonebook', 'Array2D']) self.assertEqual(names(tc.increased), ['ByteSwap', 'ArrayAppend']) - self.assertEqual(names(tc.decreased), ['BitCount']) + self.assertEqual(names(tc.decreased), ['BitCount', 'D']) self.assertEqual(names(tc.added), ['TwoSum']) self.assertEqual(names(tc.removed), ['AnyHashableWithAClass']) # other 
way around tc = TestComparator(self.new_results, self.old_results, 0.05) self.assertEqual(names(tc.unchanged), ['AngryPhonebook', 'Array2D']) - self.assertEqual(names(tc.increased), ['BitCount']) + self.assertEqual(names(tc.increased), ['BitCount', 'D']) self.assertEqual(names(tc.decreased), ['ByteSwap', 'ArrayAppend']) self.assertEqual(names(tc.added), ['AnyHashableWithAClass']) self.assertEqual(names(tc.removed), ['TwoSum']) @@ -776,7 +783,7 @@ def names(tests): tc = TestComparator(self.old_results, self.new_results, 1) self.assertEqual( names(tc.unchanged), - ['AngryPhonebook', 'Array2D', 'ArrayAppend', 'BitCount'] + ['AngryPhonebook', 'Array2D', 'ArrayAppend', 'BitCount', 'D'] ) self.assertEqual(names(tc.increased), ['ByteSwap']) self.assertEqual(tc.decreased, []) @@ -876,19 +883,19 @@ def test_column_headers(self): MAX_RSS"""]) def test_emphasize_speedup(self): - """Emphasize speedup values for regressions and improvements""" + """Emphasize speedup values for regressions and improvements.""" # tests in No Changes don't have emphasized speedup self.assert_markdown_contains([ 'BitCount | 3 | 9 | +199.9% | **0.33x**', 'ByteSwap | 4 | 0 | -100.0% | **4001.00x**', 'AngryPhonebook | 10458 | 10458 | +0.0% | 1.00x', - 'ArrayAppend | 23641 | 20000 | -15.4% | **1.18x (?)**' + 'ArrayAppend | 23641 | 20000 | -15.4% | **1.18x** (?)' ]) self.assert_git_contains([ 'BitCount 3 9 +199.9% **0.33x**', 'ByteSwap 4 0 -100.0% **4001.00x**', 'AngryPhonebook 10458 10458 +0.0% 1.00x', - 'ArrayAppend 23641 20000 -15.4% **1.18x (?)**' + 'ArrayAppend 23641 20000 -15.4% **1.18x** (?)' ]) self.assert_html_contains([ """ @@ -917,11 +924,32 @@ def test_emphasize_speedup(self): """ ]) + def test_print_quantiles_for_dubious_changes_with_samples(self): + self.assert_markdown_contains([ + 'D | 648 | 746 | +15.1% | **0.87x** (?)' + '' + 'O: 648 652 653 658 667 672 675 720 760 ' + '763 764 764 764 764 765 766 766 770 774' + '
' + 'N: 746 747 750 751 754 756 757 757 759 ' + '762 763 763 764 765 768 769 772 779 788' + '' + '\n' + ]) + self.assert_git_contains([ + 'D 648 746 +15.1% **0.87x** (?)' + '\n O: 648 652 653 658 667 672 675 720 760 ' # ventiles, old + ' 763 764 764 764 764 765 766 766 770 774' + '\n N: 746 747 750 751 754 756 757 757 759 ' # ventiles, new + ' 762 763 763 764 765 768 769 772 779 788' + '\n' + ]) + def test_sections(self): """Report is divided into sections with summaries.""" self.assert_markdown_contains([ """
- Regression (1)""", + Regression (2)""", """
Improvement (2)""", """
@@ -931,13 +959,13 @@ def test_sections(self): """
Removed (1)"""]) self.assert_git_contains([ - 'Regression (1): \n', + 'Regression (2): \n', 'Improvement (2): \n', 'No Changes (2): \n', 'Added (1): \n', 'Removed (1): \n']) self.assert_html_contains([ - "Regression (1)", + "Regression (2)", "Improvement (2)", "No Changes (2)", "Added (1)", From ef2993aca5738381f3b77af6fa0a888059fc6265 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Wed, 13 Nov 2019 09:06:35 +0100 Subject: [PATCH 08/26] [benchmark] Retire old num-iters in verbose format After commit 331c0bf772a4626aa1f8e1f8aae7629e10d350db from a year ago, all samples from the same run have the same num-iters. --- benchmark/scripts/test_compare_perf_tests.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index f0ae78b4c1416..460e392ad9eea 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -632,7 +632,6 @@ def test_parse_results_verbose(self): Running AngryPhonebook for 3 samples. Measuring with scale 78. Sample 0,11812 - Measuring with scale 90. Sample 1,13898 Sample 2,11467 1,AngryPhonebook,3,11467,13898,12392,1315,11812 @@ -656,7 +655,7 @@ def test_parse_results_verbose(self): ) self.assertEqual(r.num_samples, r.samples.num_samples) self.assertEqual(results[0].samples.all_samples, - [(0, 78, 11812), (1, 90, 13898), (2, 90, 11467)]) + [(0, 78, 11812), (1, 78, 13898), (2, 78, 11467)]) self.assertEqual(r.yields, None) r = results[1] @@ -731,25 +730,15 @@ def test_results_from_merge_verbose(self): def test_excludes_outliers_from_samples(self): verbose_log = """Running DropFirstAnySeqCntRangeLazy for 10 samples. - Measuring with scale 2. Sample 0,455 - Measuring with scale 2. Sample 1,203 - Measuring with scale 2. Sample 2,205 - Measuring with scale 2. Sample 3,207 - Measuring with scale 2. Sample 4,208 - Measuring with scale 2. Sample 5,206 - Measuring with scale 2. Sample 6,205 - Measuring with scale 2. Sample 7,206 - Measuring with scale 2. Sample 8,208 - Measuring with scale 2. 
Sample 9,184 65,DropFirstAnySeqCntRangeLazy,10,184,455,228,79,206 """ From 05b20098d55f6522ed95ac2e0c91a59923fea70a Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Wed, 13 Nov 2019 12:59:42 +0100 Subject: [PATCH 09/26] [benchmark] Add tests for all_samples --- benchmark/scripts/test_compare_perf_tests.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 460e392ad9eea..ab9c490a79e9b 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -157,6 +157,7 @@ def test_excludes_outliers(self): self.assertEqual(self.samples.count, 11) self.assertEqual(self.samples.outliers, ss[11:]) + self.assertEqual(self.samples.all_samples, ss) self.assertEqualFiveNumberSummary( self.samples, (1000, 1025, 1050, 1075, 1100)) self.assertEqualStats( @@ -175,6 +176,9 @@ def test_excludes_outliers_zero_IQR(self): self.assertEqual(self.samples.count, 3) self.assertEqualStats( (self.samples.min, self.samples.max), (18, 18)) + self.assertEqual(self.samples.all_samples, + [Sample(0, 2, 23), Sample(1, 2, 18), + Sample(2, 2, 18), Sample(3, 2, 18)]) def test_excludes_outliers_top_only(self): ss = [Sample(*map(int, s.split())) for s in From 5342ab3b95622263701c30aa79a1c31f90678e82 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Wed, 13 Nov 2019 13:24:10 +0100 Subject: [PATCH 10/26] [benchmark] Refactor: store num_iters on PTS Store the number of iterations averaged in each sample on the PerformanceTestSamples. --- benchmark/scripts/compare_perf_tests.py | 10 ++++++---- benchmark/scripts/test_compare_perf_tests.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index d4c21682724cd..a5a4a2b93c679 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -65,9 +65,10 @@ class PerformanceTestSamples(object): Computes the sample population statistics. 
""" - def __init__(self, name, samples=None): + def __init__(self, name, samples=None, num_iters=None): """Initialize with benchmark name and optional list of Samples.""" self.name = name # Name of the performance test + self.num_iters = num_iters # Number of iterations averaged in sample self.samples = [] self.outliers = [] self._runtimes = [] @@ -119,7 +120,7 @@ def exclude_outliers(self, top_only=False): outliers = self.samples[:lo] + self.samples[hi:] samples = self.samples[lo:hi] - self.__init__(self.name) # re-initialize + self.__init__(self.name, num_iters=self.num_iters) # re-initialize for sample in samples: # and self.add(sample) # re-compute stats self.outliers = outliers @@ -388,7 +389,8 @@ def _append_result(self, result): r.voluntary_cs = self.voluntary_cs r.involuntary_cs = r.involuntary_cs or self.involuntary_cs if self.samples: - r.samples = PerformanceTestSamples(r.name, self.samples) + r.samples = PerformanceTestSamples( + r.name, self.samples, self.num_iters) r.samples.exclude_outliers() self.results.append(r) r.yields = self.yields or None @@ -411,7 +413,7 @@ def _configure_format(self, header): # Verbose mode adds new productions: # Adaptively determined N; test loop multiple adjusting runtime to ~1s re.compile(r'\s+Measuring with scale (\d+).'): - (lambda self, num_iters: setattr(self, 'num_iters', num_iters)), + (lambda self, num_iters: setattr(self, 'num_iters', int(num_iters))), re.compile(r'\s+Sample (\d+),(\d+)'): (lambda self, i, runtime: diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index ab9c490a79e9b..9d1307ffdd922 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -62,6 +62,11 @@ def test_stores_samples(self): self.assertEqual(s.num_iters, 42) self.assertEqual(s.runtime, 1000) + def test_num_iters(self): + self.assertIsNone(self.samples.num_iters) + averaged = PerformanceTestSamples('B1', num_iters=42) + self.assertEqual(averaged.num_iters, 42) + def test_quantile(self): self.assertEqual(self.samples.quantile(1), 1000) self.assertEqual(self.samples.quantile(0), 1000) @@ -658,7 +663,8 @@ def test_parse_results_verbose(self): ('AngryPhonebook', 11467, 13898, 12392, 1315, 11812) ) self.assertEqual(r.num_samples, r.samples.num_samples) - self.assertEqual(results[0].samples.all_samples, + self.assertEqual(r.samples.num_iters, 78) + self.assertEqual(r.samples.all_samples, [(0, 78, 11812), (1, 78, 13898), (2, 78, 11467)]) self.assertEqual(r.yields, None) @@ -669,7 +675,7 @@ def test_parse_results_verbose(self): ) self.assertEqual(r.setup, 14444) self.assertEqual(r.num_samples, r.samples.num_samples) - self.assertEqual(results[1].samples.all_samples, + self.assertEqual(r.samples.all_samples, [(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)]) yielded = r.yields[0] self.assertEqual(yielded.before_sample, 1) From 15dcdaf831c06157f6eefbe99b2af7f31786788c Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Wed, 13 Nov 2019 14:55:14 +0100 Subject: [PATCH 11/26] [benchmark] Refactor: remove class Sample Removed Sample class, that was previously holding num_iters and the ordinal number of the sample. 
--- benchmark/scripts/compare_perf_tests.py | 42 ++++------ benchmark/scripts/test_compare_perf_tests.py | 88 +++++++------------- 2 files changed, 45 insertions(+), 85 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index a5a4a2b93c679..8e6ff7af3131e 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -17,7 +17,6 @@ Invoke `$ compare_perf_tests.py -h ` for complete list of options. -class `Sample` is single benchmark measurement. class `PerformanceTestSamples` is collection of `Sample`s and their statistics. class `PerformanceTestResult` is a summary of performance test execution. class `LogParser` converts log files into `PerformanceTestResult`s. @@ -37,20 +36,6 @@ class `ReportFormatter` creates the test comparison report in specified format. from math import ceil, sqrt -class Sample(namedtuple('Sample', 'i num_iters runtime')): - u"""Single benchmark measurement. - - Initialized with: - `i`: ordinal number of the sample taken, - `num-num_iters`: number or iterations used to compute it, - `runtime`: in microseconds (μs). - """ - - def __repr__(self): - """Shorter Sample formating for debugging purposes.""" - return 's({0.i!r}, {0.num_iters!r}, {0.runtime!r})'.format(self) - - class Yield(namedtuple('Yield', 'before_sample after')): u"""Meta-measurement of when the Benchmark_X voluntarily yielded process. @@ -70,6 +55,7 @@ def __init__(self, name, samples=None, num_iters=None): self.name = name # Name of the performance test self.num_iters = num_iters # Number of iterations averaged in sample self.samples = [] + self._all_samples = [] self.outliers = [] self._runtimes = [] self.mean = 0.0 @@ -90,16 +76,17 @@ def __str__(self): def add(self, sample): """Add sample to collection and recompute statistics.""" - assert isinstance(sample, Sample) + assert isinstance(sample, int) self._update_stats(sample) - i = bisect(self._runtimes, sample.runtime) - self._runtimes.insert(i, sample.runtime) + i = bisect(self._runtimes, sample) + self._runtimes.insert(i, sample) self.samples.insert(i, sample) + self._all_samples.append(sample) def _update_stats(self, sample): old_stats = (self.count, self.mean, self.S_runtime) _, self.mean, self.S_runtime = ( - self.running_mean_variance(old_stats, sample.runtime)) + self.running_mean_variance(old_stats, sample)) def exclude_outliers(self, top_only=False): """Exclude outliers by applying Interquartile Range Rule. @@ -119,11 +106,13 @@ def exclude_outliers(self, top_only=False): outliers = self.samples[:lo] + self.samples[hi:] samples = self.samples[lo:hi] + all = self._all_samples self.__init__(self.name, num_iters=self.num_iters) # re-initialize for sample in samples: # and self.add(sample) # re-compute stats self.outliers = outliers + self._all_samples = all @property def count(self): @@ -138,17 +127,17 @@ def num_samples(self): @property def all_samples(self): """List of all samples in ascending order.""" - return sorted(self.samples + self.outliers, key=lambda s: s.i) + return self._all_samples @property def min(self): """Minimum sampled value.""" - return self.samples[0].runtime + return self.samples[0] @property def max(self): """Maximum sampled value.""" - return self.samples[-1].runtime + return self.samples[-1] def quantile(self, q): """Return runtime for given quantile. 
@@ -157,7 +146,7 @@ def quantile(self, q): https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample """ index = max(0, int(ceil(self.count * float(q))) - 1) - return self.samples[index].runtime + return self.samples[index] @property def median(self): @@ -257,8 +246,7 @@ def __init__(self, csv_row, quantiles=False, memory=False, delta=False, for i in range(0, self.num_samples)] self.samples = PerformanceTestSamples( - self.name, - [Sample(None, None, int(runtime)) for runtime in runtimes]) + self.name, [int(runtime) for runtime in runtimes]) self.samples.exclude_outliers(top_only=True) sams = self.samples self.min, self.max, self.median, self.mean, self.sd = \ @@ -416,9 +404,7 @@ def _configure_format(self, header): (lambda self, num_iters: setattr(self, 'num_iters', int(num_iters))), re.compile(r'\s+Sample (\d+),(\d+)'): - (lambda self, i, runtime: - self.samples.append( - Sample(int(i), int(self.num_iters), int(runtime)))), + (lambda self, i, runtime: self.samples.append(int(runtime))), re.compile(r'\s+SetUp (\d+)'): (lambda self, setup: setattr(self, 'setup', int(setup))), diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 9d1307ffdd922..fa4e7c054afd5 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -24,7 +24,6 @@ from compare_perf_tests import PerformanceTestSamples from compare_perf_tests import ReportFormatter from compare_perf_tests import ResultComparison -from compare_perf_tests import Sample from compare_perf_tests import TestComparator from compare_perf_tests import main from compare_perf_tests import parse_args @@ -32,35 +31,17 @@ from test_utils import captured_output -class TestSample(unittest.TestCase): - def test_has_named_fields(self): - s = Sample(1, 2, 3) - self.assertEqual(s.i, 1) - self.assertEqual(s.num_iters, 2) - self.assertEqual(s.runtime, 3) - - def test_is_iterable(self): - s = Sample(1, 2, 3) - self.assertEqual(s[0], 1) - self.assertEqual(s[1], 2) - self.assertEqual(s[2], 3) - - class TestPerformanceTestSamples(unittest.TestCase): def setUp(self): self.samples = PerformanceTestSamples('B1') - self.samples.add(Sample(7, 42, 1000)) + self.samples.add(1000) def test_has_name(self): self.assertEqual(self.samples.name, 'B1') def test_stores_samples(self): self.assertEqual(self.samples.count, 1) - s = self.samples.samples[0] - self.assertTrue(isinstance(s, Sample)) - self.assertEqual(s.i, 7) - self.assertEqual(s.num_iters, 42) - self.assertEqual(s.runtime, 1000) + self.assertEqual(self.samples.samples[0], 1000) def test_num_iters(self): self.assertIsNone(self.samples.num_iters) @@ -70,10 +51,10 @@ def test_num_iters(self): def test_quantile(self): self.assertEqual(self.samples.quantile(1), 1000) self.assertEqual(self.samples.quantile(0), 1000) - self.samples.add(Sample(2, 1, 1100)) + self.samples.add(1100) self.assertEqual(self.samples.quantile(0), 1000) self.assertEqual(self.samples.quantile(1), 1100) - self.samples.add(Sample(3, 1, 1050)) + self.samples.add(1050) self.assertEqual(self.samples.quantile(0), 1000) self.assertEqual(self.samples.quantile(.5), 1050) self.assertEqual(self.samples.quantile(1), 1100) @@ -89,25 +70,25 @@ def assertEqualFiveNumberSummary(self, ss, expected_fns): def test_computes_five_number_summary(self): self.assertEqualFiveNumberSummary( self.samples, (1000, 1000, 1000, 1000, 1000)) - self.samples.add(Sample(2, 1, 1100)) + self.samples.add(1100) self.assertEqualFiveNumberSummary( self.samples, (1000, 
1000, 1000, 1100, 1100)) - self.samples.add(Sample(3, 1, 1050)) + self.samples.add(1050) self.assertEqualFiveNumberSummary( self.samples, (1000, 1000, 1050, 1100, 1100)) - self.samples.add(Sample(4, 1, 1025)) + self.samples.add(1025) self.assertEqualFiveNumberSummary( self.samples, (1000, 1000, 1025, 1050, 1100)) - self.samples.add(Sample(5, 1, 1075)) + self.samples.add(1075) self.assertEqualFiveNumberSummary( self.samples, (1000, 1025, 1050, 1075, 1100)) def test_computes_inter_quartile_range(self): self.assertEqual(self.samples.iqr, 0) - self.samples.add(Sample(2, 1, 1025)) - self.samples.add(Sample(3, 1, 1050)) - self.samples.add(Sample(4, 1, 1075)) - self.samples.add(Sample(5, 1, 1100)) + self.samples.add(1025) + self.samples.add(1050) + self.samples.add(1075) + self.samples.add(1100) self.assertEqual(self.samples.iqr, 50) def assertEqualStats(self, stats, expected_stats): @@ -118,7 +99,7 @@ def test_computes_mean_sd_cv(self): ss = self.samples self.assertEqualStats( (ss.mean, ss.sd, ss.cv), (1000.0, 0.0, 0.0)) - self.samples.add(Sample(2, 1, 1100)) + self.samples.add(1100) self.assertEqualStats( (ss.mean, ss.sd, ss.cv), (1050.0, 70.71, 6.7 / 100)) @@ -126,13 +107,13 @@ def test_computes_range_spread(self): ss = self.samples self.assertEqualStats( (ss.range, ss.spread), (0, 0)) - self.samples.add(Sample(2, 1, 1100)) + self.samples.add(1100) self.assertEqualStats( (ss.range, ss.spread), (100, 10.0 / 100)) def test_init_with_samples(self): self.samples = PerformanceTestSamples( - 'B2', [Sample(0, 1, 1000), Sample(1, 1, 1100)]) + 'B2', [1000, 1100]) self.assertEqual(self.samples.count, 2) self.assertEqualStats( (self.samples.mean, self.samples.sd, @@ -142,17 +123,16 @@ def test_init_with_samples(self): def test_can_handle_zero_runtime(self): # guard against dividing by 0 self.samples = PerformanceTestSamples('Zero') - self.samples.add(Sample(0, 1, 0)) + self.samples.add(0) self.assertEqualStats( (self.samples.mean, self.samples.sd, self.samples.cv, self.samples.range, self.samples.spread), (0, 0, 0.0, 0, 0.0)) def test_excludes_outliers(self): - ss = [Sample(*map(int, s.split())) for s in - '0 1 1000, 1 1 1025, 2 1 1050, 3 1 1075, 4 1 1100, ' - '5 1 1000, 6 1 1025, 7 1 1050, 8 1 1075, 9 1 1100, ' - '10 1 1050, 11 1 949, 12 1 1151'.split(',')] + ss = [1000, 1025, 1050, 1075, 1100, + 1000, 1025, 1050, 1075, 1100, + 1050, 949, 1151] self.samples = PerformanceTestSamples('Outliers', ss) self.assertEqual(self.samples.count, 13) self.assertEqualStats( @@ -170,10 +150,10 @@ def test_excludes_outliers(self): def test_excludes_outliers_zero_IQR(self): self.samples = PerformanceTestSamples('Tight') - self.samples.add(Sample(0, 2, 23)) - self.samples.add(Sample(1, 2, 18)) - self.samples.add(Sample(2, 2, 18)) - self.samples.add(Sample(3, 2, 18)) + self.samples.add(23) + self.samples.add(18) + self.samples.add(18) + self.samples.add(18) self.assertEqual(self.samples.iqr, 0) self.samples.exclude_outliers() @@ -181,14 +161,10 @@ def test_excludes_outliers_zero_IQR(self): self.assertEqual(self.samples.count, 3) self.assertEqualStats( (self.samples.min, self.samples.max), (18, 18)) - self.assertEqual(self.samples.all_samples, - [Sample(0, 2, 23), Sample(1, 2, 18), - Sample(2, 2, 18), Sample(3, 2, 18)]) + self.assertEqual(self.samples.all_samples, [23, 18, 18, 18]) def test_excludes_outliers_top_only(self): - ss = [Sample(*map(int, s.split())) for s in - '0 1 1, 1 1 2, 2 1 2, 3 1 2, 4 1 3'.split(',')] - self.samples = PerformanceTestSamples('Top', ss) + self.samples = PerformanceTestSamples('Top', [1, 2, 
2, 2, 3]) self.assertEqualFiveNumberSummary(self.samples, (1, 2, 2, 2, 3)) self.assertEqual(self.samples.iqr, 0) @@ -225,7 +201,7 @@ def test_init_quantiles(self): self.assertAlmostEqual(r.sd, 109.61, places=2) self.assertEqual(r.samples.count, 3) self.assertEqual(r.samples.num_samples, 3) - self.assertEqual([s.runtime for s in r.samples.all_samples], + self.assertEqual(r.samples.all_samples, [54383, 54512, 54601]) # #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B) @@ -275,7 +251,7 @@ def validatePTR(deq): # construct from delta encoded quantiles string r = PerformanceTestResult(['0', 'B', str(num_samples)] + deq, quantiles=True, delta=True) self.assertEqual(r.samples.num_samples, num_samples) - self.assertEqual([s.runtime for s in r.samples.all_samples], + self.assertEqual(r.samples.all_samples, range(1, num_samples + 1)) delta_encoded_quantiles = """ @@ -575,12 +551,12 @@ def test_parse_quantiles(self): r = LogParser.results_from_string( """#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs) 1,Ackermann,3,54383,54512,54601""")['Ackermann'] - self.assertEqual([s.runtime for s in r.samples.all_samples], + self.assertEqual(r.samples.all_samples, [54383, 54512, 54601]) r = LogParser.results_from_string( """#,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs),MAX_RSS(B) 1,Ackermann,3,54529,54760,55807,266240""")['Ackermann'] - self.assertEqual([s.runtime for s in r.samples.all_samples], + self.assertEqual(r.samples.all_samples, [54529, 54760, 55807]) self.assertEqual(r.max_rss, 266240) @@ -664,8 +640,7 @@ def test_parse_results_verbose(self): ) self.assertEqual(r.num_samples, r.samples.num_samples) self.assertEqual(r.samples.num_iters, 78) - self.assertEqual(r.samples.all_samples, - [(0, 78, 11812), (1, 78, 13898), (2, 78, 11467)]) + self.assertEqual(r.samples.all_samples, [11812, 13898, 11467]) self.assertEqual(r.yields, None) r = results[1] @@ -675,8 +650,7 @@ def test_parse_results_verbose(self): ) self.assertEqual(r.setup, 14444) self.assertEqual(r.num_samples, r.samples.num_samples) - self.assertEqual(r.samples.all_samples, - [(0, 1, 369900), (1, 1, 381039), (2, 1, 371043)]) + self.assertEqual(r.samples.all_samples, [369900, 381039, 371043]) yielded = r.yields[0] self.assertEqual(yielded.before_sample, 1) self.assertEqual(yielded.after, 369918) From 6f0eb7b15ca9e25075ef0b332419cf08432f019c Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 14 Nov 2019 13:21:12 +0100 Subject: [PATCH 12/26] [benchmark] Refactor: simpler exclude_outliers --- benchmark/scripts/compare_perf_tests.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 8e6ff7af3131e..b19bcdb274f2e 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -104,15 +104,11 @@ def exclude_outliers(self, top_only=False): bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr))) hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr)) - outliers = self.samples[:lo] + self.samples[hi:] - samples = self.samples[lo:hi] - all = self._all_samples - - self.__init__(self.name, num_iters=self.num_iters) # re-initialize - for sample in samples: # and - self.add(sample) # re-compute stats - self.outliers = outliers - self._all_samples = all + self.outliers = self.samples[:lo] + self.samples[hi:] + self.samples = self.samples[lo:hi] + # re-compute stats + _, self.mean, self.S_runtime = reduce( + self.running_mean_variance, self.samples, (0, 0.0, 0.0)) @property def count(self): From 
abacc9f4edf63688a662ece37c3658fed21170cc Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Wed, 19 Feb 2020 21:17:30 +0100 Subject: [PATCH 13/26] [benchmark] Refactor: remove redundant _runtimes --- benchmark/scripts/compare_perf_tests.py | 28 ++++++++++++++----------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index b19bcdb274f2e..3a518a9d2d87e 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -54,14 +54,17 @@ def __init__(self, name, samples=None, num_iters=None): """Initialize with benchmark name and optional list of Samples.""" self.name = name # Name of the performance test self.num_iters = num_iters # Number of iterations averaged in sample - self.samples = [] - self._all_samples = [] self.outliers = [] - self._runtimes = [] self.mean = 0.0 self.S_runtime = 0.0 # For computing running variance - for sample in samples or []: - self.add(sample) + if samples: + self._all_samples = samples + ascending = sorted(samples) + self.samples = samples if samples == ascending else ascending + self._recompute_stats() + else: + self.samples = [] + self._all_samples = [] def __str__(self): """Text summary of benchmark statistics.""" @@ -78,8 +81,7 @@ def add(self, sample): """Add sample to collection and recompute statistics.""" assert isinstance(sample, int) self._update_stats(sample) - i = bisect(self._runtimes, sample) - self._runtimes.insert(i, sample) + i = bisect(self.samples, sample) self.samples.insert(i, sample) self._all_samples.append(sample) @@ -88,6 +90,10 @@ def _update_stats(self, sample): _, self.mean, self.S_runtime = ( self.running_mean_variance(old_stats, sample)) + def _recompute_stats(self): + _, self.mean, self.S_runtime = reduce( + self.running_mean_variance, self.samples, (0, 0.0, 0.0)) + def exclude_outliers(self, top_only=False): """Exclude outliers by applying Interquartile Range Rule. @@ -101,14 +107,12 @@ def exclude_outliers(self, top_only=False): the environment noise caused by preemtive multitasking. """ lo = (0 if top_only else - bisect_left(self._runtimes, int(self.q1 - 1.5 * self.iqr))) - hi = bisect_right(self._runtimes, int(self.q3 + 1.5 * self.iqr)) + bisect_left(self.samples, int(self.q1 - 1.5 * self.iqr))) + hi = bisect_right(self.samples, int(self.q3 + 1.5 * self.iqr)) self.outliers = self.samples[:lo] + self.samples[hi:] self.samples = self.samples[lo:hi] - # re-compute stats - _, self.mean, self.S_runtime = reduce( - self.running_mean_variance, self.samples, (0, 0.0, 0.0)) + self._recompute_stats() @property def count(self): From cafb644388ea829d2d4103f4fe4ae586e8d5f07c Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 14 Nov 2019 18:35:17 +0100 Subject: [PATCH 14/26] [benchmark] More immutable PerformanceTestSamples Removed the ability to add individual samples to PTS. PerformanceTestSamples are technically not really immutable, because of exclude_outliers. --- benchmark/scripts/compare_perf_tests.py | 19 ++--- benchmark/scripts/test_compare_perf_tests.py | 74 +++++++++----------- 2 files changed, 38 insertions(+), 55 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 3a518a9d2d87e..92d092da0408f 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -31,7 +31,7 @@ class `ReportFormatter` creates the test comparison report in specified format. 
import argparse import re import sys -from bisect import bisect, bisect_left, bisect_right +from bisect import bisect_left, bisect_right from collections import namedtuple from math import ceil, sqrt @@ -77,19 +77,6 @@ def __str__(self): .format(self) if self.samples else '{0.name!s} n=0'.format(self)) - def add(self, sample): - """Add sample to collection and recompute statistics.""" - assert isinstance(sample, int) - self._update_stats(sample) - i = bisect(self.samples, sample) - self.samples.insert(i, sample) - self._all_samples.append(sample) - - def _update_stats(self, sample): - old_stats = (self.count, self.mean, self.S_runtime) - _, self.mean, self.S_runtime = ( - self.running_mean_variance(old_stats, sample)) - def _recompute_stats(self): _, self.mean, self.S_runtime = reduce( self.running_mean_variance, self.samples, (0, 0.0, 0.0)) @@ -289,7 +276,9 @@ def merge(self, r): """ # Statistics if self.samples and r.samples: - map(self.samples.add, r.samples.samples) + self.samples.samples = sorted( + self.samples.samples + r.samples.samples) + self.samples._recompute_stats() sams = self.samples self.num_samples += r.num_samples sams.outliers += r.samples.outliers diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index fa4e7c054afd5..63eed97561d81 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -33,8 +33,7 @@ class TestPerformanceTestSamples(unittest.TestCase): def setUp(self): - self.samples = PerformanceTestSamples('B1') - self.samples.add(1000) + self.samples = PerformanceTestSamples('B1', [1000]) def test_has_name(self): self.assertEqual(self.samples.name, 'B1') @@ -42,22 +41,29 @@ def test_has_name(self): def test_stores_samples(self): self.assertEqual(self.samples.count, 1) self.assertEqual(self.samples.samples[0], 1000) + self.assertEqual(self.samples.samples, [1000]) + self.assertEqual(self.samples.all_samples, [1000]) + + s = PerformanceTestSamples('B1', [1000, 1100, 1050]) + self.assertEqual(s.samples, [1000, 1050, 1100]) # sorted + self.assertEqual(s.all_samples, [1000, 1100, 1050]) # maintains order def test_num_iters(self): self.assertIsNone(self.samples.num_iters) - averaged = PerformanceTestSamples('B1', num_iters=42) + averaged = PerformanceTestSamples('B', num_iters=42) self.assertEqual(averaged.num_iters, 42) def test_quantile(self): - self.assertEqual(self.samples.quantile(1), 1000) - self.assertEqual(self.samples.quantile(0), 1000) - self.samples.add(1100) - self.assertEqual(self.samples.quantile(0), 1000) - self.assertEqual(self.samples.quantile(1), 1100) - self.samples.add(1050) - self.assertEqual(self.samples.quantile(0), 1000) - self.assertEqual(self.samples.quantile(.5), 1050) - self.assertEqual(self.samples.quantile(1), 1100) + s = PerformanceTestSamples('B1', [1000]) + self.assertEqual(s.quantile(1), 1000) + self.assertEqual(s.quantile(0), 1000) + s = PerformanceTestSamples('B1', [1000, 1100]) + self.assertEqual(s.quantile(0), 1000) + self.assertEqual(s.quantile(1), 1100) + s = PerformanceTestSamples('B1', [1000, 1100, 1050]) + self.assertEqual(s.quantile(0), 1000) + self.assertEqual(s.quantile(.5), 1050) + self.assertEqual(s.quantile(1), 1100) def assertEqualFiveNumberSummary(self, ss, expected_fns): e_min, e_q1, e_median, e_q3, e_max = expected_fns @@ -68,28 +74,21 @@ def assertEqualFiveNumberSummary(self, ss, expected_fns): self.assertEqual(ss.max, e_max) def test_computes_five_number_summary(self): - self.assertEqualFiveNumberSummary( - 
self.samples, (1000, 1000, 1000, 1000, 1000)) - self.samples.add(1100) - self.assertEqualFiveNumberSummary( - self.samples, (1000, 1000, 1000, 1100, 1100)) - self.samples.add(1050) - self.assertEqualFiveNumberSummary( - self.samples, (1000, 1000, 1050, 1100, 1100)) - self.samples.add(1025) - self.assertEqualFiveNumberSummary( - self.samples, (1000, 1000, 1025, 1050, 1100)) - self.samples.add(1075) - self.assertEqualFiveNumberSummary( - self.samples, (1000, 1025, 1050, 1075, 1100)) + s = PerformanceTestSamples('B', [1000]) + self.assertEqualFiveNumberSummary(s, (1000, 1000, 1000, 1000, 1000)) + s = PerformanceTestSamples('B', [1000, 1100]) + self.assertEqualFiveNumberSummary(s, (1000, 1000, 1000, 1100, 1100)) + s = PerformanceTestSamples('B', [1000, 1100, 1050]) + self.assertEqualFiveNumberSummary(s, (1000, 1000, 1050, 1100, 1100)) + s = PerformanceTestSamples('B', [1000, 1100, 1050, 1025]) + self.assertEqualFiveNumberSummary(s, (1000, 1000, 1025, 1050, 1100)) + s = PerformanceTestSamples('B', [1000, 1100, 1050, 1025, 1075]) + self.assertEqualFiveNumberSummary(s, (1000, 1025, 1050, 1075, 1100)) def test_computes_inter_quartile_range(self): self.assertEqual(self.samples.iqr, 0) - self.samples.add(1025) - self.samples.add(1050) - self.samples.add(1075) - self.samples.add(1100) - self.assertEqual(self.samples.iqr, 50) + s = PerformanceTestSamples('B', [1000, 1025, 1050, 1075, 1100]) + self.assertEqual(s.iqr, 50) def assertEqualStats(self, stats, expected_stats): for actual, expected in zip(stats, expected_stats): @@ -99,7 +98,7 @@ def test_computes_mean_sd_cv(self): ss = self.samples self.assertEqualStats( (ss.mean, ss.sd, ss.cv), (1000.0, 0.0, 0.0)) - self.samples.add(1100) + ss = PerformanceTestSamples('B', [1000, 1100]) self.assertEqualStats( (ss.mean, ss.sd, ss.cv), (1050.0, 70.71, 6.7 / 100)) @@ -107,7 +106,7 @@ def test_computes_range_spread(self): ss = self.samples self.assertEqualStats( (ss.range, ss.spread), (0, 0)) - self.samples.add(1100) + ss = PerformanceTestSamples('B', [1000, 1100]) self.assertEqualStats( (ss.range, ss.spread), (100, 10.0 / 100)) @@ -122,8 +121,7 @@ def test_init_with_samples(self): def test_can_handle_zero_runtime(self): # guard against dividing by 0 - self.samples = PerformanceTestSamples('Zero') - self.samples.add(0) + self.samples = PerformanceTestSamples('Zero', [0]) self.assertEqualStats( (self.samples.mean, self.samples.sd, self.samples.cv, self.samples.range, self.samples.spread), @@ -149,11 +147,7 @@ def test_excludes_outliers(self): (self.samples.mean, self.samples.sd), (1050, 35.36)) def test_excludes_outliers_zero_IQR(self): - self.samples = PerformanceTestSamples('Tight') - self.samples.add(23) - self.samples.add(18) - self.samples.add(18) - self.samples.add(18) + self.samples = PerformanceTestSamples('Tight', [23, 18, 18, 18]) self.assertEqual(self.samples.iqr, 0) self.samples.exclude_outliers() From bbdcaf818b9f9d95807adf6f23a65b67c46f2ef4 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 21 Nov 2019 09:35:41 +0100 Subject: [PATCH 15/26] [benchmark] Fix crash in oversampled results Gracefully handle the parsing of oversampled values in critical configuration, when the sampling error causes an ommision of certain quantiles from the report. 
--- benchmark/scripts/compare_perf_tests.py | 3 +- benchmark/scripts/test_compare_perf_tests.py | 42 +++++++++++++++++--- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 92d092da0408f..99baa359f6c19 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -230,7 +230,8 @@ def __init__(self, csv_row, quantiles=False, memory=False, delta=False, indices = [max(0, int(ceil(self.num_samples * float(q))) - 1) for q in qs] runtimes = [runtimes[indices.index(i)] - for i in range(0, self.num_samples)] + for i in range(0, self.num_samples) + if i in indices] self.samples = PerformanceTestSamples( self.name, [int(runtime) for runtime in runtimes]) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 63eed97561d81..4e953fa7ffc37 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -232,12 +232,14 @@ def test_init_oversampled_quantiles(self): repeated in the report summary. Samples should contain only true values, discarding the repetated artifacts from quantile estimation. - The test string is slightly massaged output of the following R script: - subsample <- function(x, q) { - quantile(1:x, probs=((0:(q-1))/(q-1)), type=1)} - tbl <- function(s) t(sapply(1:s, function(x) { - qs <- subsample(x, s); c(qs[1], diff(qs)) })) - sapply(c(3, 5, 11, 21), tbl) + The test string is generated by the following R script: + subsample <- function(x, q) { quantile(1:x, probs=((0:q)/q), type=1)} + drop0s <- function(x) switch(x+1, '', '1') + tbl <- function(s) paste0(sapply(1:s, function(x) { + qs <- subsample(x, s); + paste0(sapply(c(qs[1], diff(qs)), drop0s), collapse=',') + }), collapse='\n') + cat(paste0(sapply(c(2, 4, 10, 20), tbl), collapse='\n')) """ def validatePTR(deq): # construct from delta encoded quantiles string deq = deq.split(',') @@ -287,6 +289,34 @@ def validatePTR(deq): # construct from delta encoded quantiles string 1,,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1""" map(validatePTR, delta_encoded_quantiles.split('\n')[1:]) + def test_init_oversampled_percentiles_100_samples(self): + """Test that we don't crash parsing result with sampling error. + + Our chosen discontinuous sample quantile type (R's type 1) combined + with the imprecise nature of the floating point arithmetic is, in + certain cases, causing sampling errors. + + For example when working with percentiles and there are precisely 100 + samples: the 7th, 14th and 28th samples are skipped and the following + samples are reported twice; the 55th sample is skipped (reporting 56th + instead) and 57th is reported twice. + + Since the issue is even present in R's quantile function (type=1) as + well as our Swift and Python implementations, we will ignore these + sampling errors. 
+ """ + dep = ('1,,1,1,1,1,1,2,,1,1,1,1,1,2,,1,1,1,1,1,1,1,1,1,1,1,1,2,,1,1,' + + '1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,,1,1,1,1,1' + + ',1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1' + + ',1,1,1,1,1,1,1,1') + dep = dep.split(',') # delta encoded percentiles + self.assertEqual(len(dep), 101) # min + 99 percentiles + max + num_samples = '100' + r = PerformanceTestResult(['0', 'B', num_samples] + dep, + quantiles=True, delta=True) + self.assertEqual(r.num_samples, 100) + self.assertEqual(r.samples.count, 96) # missing indexes 7, 14, 28, 55 + def test_init_meta(self): # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),… # …PAGES,ICS,YIELD From dd2d83d90ec96e7a021b5d6941ebd1f5b1d71b8f Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Thu, 27 Feb 2020 23:09:42 +0100 Subject: [PATCH 16/26] [benchmark] Collate metadata when merging PTRs Handle optional `--meta` data in `merge` of `PerformanceTestResults`. Pick minimum from memory pages, sum the number of involuntrary contex switches and yield counts. --- benchmark/scripts/compare_perf_tests.py | 9 +++++++- benchmark/scripts/test_compare_perf_tests.py | 23 ++++++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 99baa359f6c19..dd028d0383bbe 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -294,12 +294,19 @@ def merge(self, r): self.num_samples += r.num_samples self.median, self.sd = None, None - # Metadata + # Collate the Metadata def minimum(a, b): # work around None being less than everything return (min(filter(lambda x: x is not None, [a, b])) if any([a, b]) else None) + + def sum_none(a, b): # work around None being less than everything + return (sum(filter(lambda x: x is not None, [a, b])) if any([a, b]) + else None) self.max_rss = minimum(self.max_rss, r.max_rss) + self.mem_pages = minimum(self.mem_pages, r.mem_pages) self.setup = minimum(self.setup, r.setup) + self.involuntary_cs = sum_none(self.involuntary_cs, r.involuntary_cs) + self.yield_count = sum_none(self.yield_count, r.yield_count) class ResultComparison(object): diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 4e953fa7ffc37..a8e4e0200fa24 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -379,8 +379,6 @@ def test_merge(self): 1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split('\n')[1:] results = map(PerformanceTestResult, [line.split(',') for line in tests]) - results[2].setup = 9 - results[3].setup = 7 def as_tuple(r): return (r.num_samples, r.min, r.max, round(r.mean, 2), @@ -392,9 +390,11 @@ def as_tuple(r): r.merge(results[1]) self.assertEqual(as_tuple(r), # drops SD and median, +max_rss (2, 12045, 12325, 12185, None, None, 10510336, None)) + results[2].setup = 9 r.merge(results[2]) self.assertEqual(as_tuple(r), # picks smaller of the MAX_RSS, +setup (3, 11616, 12325, 11995.33, None, None, 10502144, 9)) + results[3].setup = 7 r.merge(results[3]) self.assertEqual(as_tuple(r), # picks smaller of the setup values (4, 11616, 12325, 12064, None, None, 10498048, 7)) @@ -417,14 +417,23 @@ def test_merge_with_samples(self): def as_tuple(r): return (r.num_samples, r.samples.num_samples, r.samples.count, - r.min, r.samples.median, r.max) + r.min, r.samples.median, r.max, + r.mem_pages, r.involuntary_cs, r.yield_count) r = results[0] - 
self.assertEqual(as_tuple(r), (200, 21, 18, 967, 996, 1008)) + self.assertEqual(as_tuple(r), + (200, 21, 18, 967, 996, 1008, None, None, None)) + # merging optional --meta data + results[1].mem_pages = 9 + results[1].involuntary_cs = 1 + results[1].yield_count = 4 r.merge(results[1]) # 18 + 17 = 35, after merge using only ventiles - self.assertEqual(as_tuple(r), (400, 42, 35, 967, 983, 1010)) - r.merge(results[2]) # 35 + 18 = 53 - self.assertEqual(as_tuple(r), (600, 63, 53, 967, 989, 1029)) + self.assertEqual(as_tuple(r), (400, 42, 35, 967, 983, 1010, 9, 1, 4)) + results[2].mem_pages = 7 + results[2].involuntary_cs = 2 + results[2].yield_count = 6 + r.merge(results[2]) # 35 + 18 = 53; sum yields and context switches + self.assertEqual(as_tuple(r), (600, 63, 53, 967, 989, 1029, 7, 3, 10)) class TestResultComparison(unittest.TestCase): From 7d25484f79edcf0574f8c96a5f7d5f0f318ab1e4 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Fri, 28 Feb 2020 00:46:10 +0100 Subject: [PATCH 17/26] [benchmark] Keep merged independent run samples When merging `PerformanceTestResults`s keep the original `PerformanceTestSample`s from all independent runs. These will be used to choose the most stable (least variable) location estimate for the `ResultComparison` down the road. --- benchmark/scripts/compare_perf_tests.py | 17 ++++++++++++----- benchmark/scripts/test_compare_perf_tests.py | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index dd028d0383bbe..434ea34b07726 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -113,7 +113,7 @@ def num_samples(self): @property def all_samples(self): - """List of all samples in ascending order.""" + """List of all samples in original order.""" return self._all_samples @property @@ -236,6 +236,7 @@ def __init__(self, csv_row, quantiles=False, memory=False, delta=False, self.samples = PerformanceTestSamples( self.name, [int(runtime) for runtime in runtimes]) self.samples.exclude_outliers(top_only=True) + self.independent_runs = [self.samples] sams = self.samples self.min, self.max, self.median, self.mean, self.sd = \ sams.min, sams.max, sams.median, sams.mean, sams.sd @@ -277,12 +278,18 @@ def merge(self, r): """ # Statistics if self.samples and r.samples: - self.samples.samples = sorted( - self.samples.samples + r.samples.samples) - self.samples._recompute_stats() + if hasattr(self, 'independent_runs'): + self.independent_runs.append(r.samples) + else: + self.independent_runs = [self.samples, r.samples] + outliers = self.samples.outliers + r.samples.outliers + all_samples = self.samples.all_samples + r.samples.all_samples + self.samples = PerformanceTestSamples( + self.name, sorted(self.samples.samples + r.samples.samples)) sams = self.samples self.num_samples += r.num_samples - sams.outliers += r.samples.outliers + sams.outliers = outliers + sams._all_samples = all_samples self.min, self.max, self.median, self.mean, self.sd = \ sams.min, sams.max, sams.median, sams.mean, sams.sd else: diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index a8e4e0200fa24..843ad76ea9f5d 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -423,18 +423,29 @@ def as_tuple(r): r = results[0] self.assertEqual(as_tuple(r), (200, 21, 18, 967, 996, 1008, None, None, None)) + self.assertEqual(len(r.independent_runs), 1) 
+ self.assertEqual(r.independent_runs[0], r.samples) + # merging optional --meta data results[1].mem_pages = 9 results[1].involuntary_cs = 1 results[1].yield_count = 4 r.merge(results[1]) # 18 + 17 = 35, after merge using only ventiles self.assertEqual(as_tuple(r), (400, 42, 35, 967, 983, 1010, 9, 1, 4)) + results[2].mem_pages = 7 results[2].involuntary_cs = 2 results[2].yield_count = 6 r.merge(results[2]) # 35 + 18 = 53; sum yields and context switches self.assertEqual(as_tuple(r), (600, 63, 53, 967, 989, 1029, 7, 3, 10)) + self.assertEqual(len(r.samples.all_samples), 63) + self.assertEqual(r.samples.outliers, [1019, 1095, 2922, 1040, 1186, + 1880, 6470, 1057, 1281, 4183]) + self.assertEqual(len(r.independent_runs), 3) + self.assertEqual([i.count for i in r.independent_runs], [18, 17, 18]) + self.assertEqual([i.min for i in r.independent_runs], [967, 972, 986]) + class TestResultComparison(unittest.TestCase): def setUp(self): @@ -744,6 +755,9 @@ def test_results_from_merge_verbose(self): samples = result.samples self.assertTrue(isinstance(samples, PerformanceTestSamples)) self.assertEqual(samples.count, 8) + self.assertEqual( + samples.all_samples, + [355883, 358817, 353552, 350815, 363094, 369169, 376131, 364245]) def test_excludes_outliers_from_samples(self): verbose_log = """Running DropFirstAnySeqCntRangeLazy for 10 samples. From 3f93e3145aed2432e44080823e43300464ca51e5 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Sat, 29 Feb 2020 18:46:52 +0100 Subject: [PATCH 18/26] [benchmark] Override all_samples on merged results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To save on memory used by merged `PerformanceTestResult`s, the rarely used `PerformanceTestSample.all_samples` can gather the samples on demand from the result’s `independent_runs` instead of keeping another copy. 
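For context, a minimal sketch of the intended behavior (made-up runtimes in
the three-column quantile format used by the tests; this is not part of the
diff and assumes `compare_perf_tests` is importable):

    from compare_perf_tests import PerformanceTestResult

    # Two single-run results: #,TEST,SAMPLES,MIN(μs),MEDIAN(μs),MAX(μs)
    a = PerformanceTestResult('1,B,3,100,110,120'.split(','), quantiles=True)
    b = PerformanceTestResult('1,B,3,101,111,121'.split(','), quantiles=True)
    a.merge(b)

    # After the merge, all_samples is assembled on demand by chaining the
    # per-run sample lists instead of storing yet another copy of them.
    assert a.samples.all_samples == (a.independent_runs[0].all_samples +
                                     a.independent_runs[1].all_samples)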
--- benchmark/scripts/compare_perf_tests.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 434ea34b07726..15dd58ba8dadd 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -58,6 +58,7 @@ def __init__(self, name, samples=None, num_iters=None): self.mean = 0.0 self.S_runtime = 0.0 # For computing running variance if samples: + self._override_all_samples = None self._all_samples = samples ascending = sorted(samples) self.samples = samples if samples == ascending else ascending @@ -114,7 +115,8 @@ def num_samples(self): @property def all_samples(self): """List of all samples in original order.""" - return self._all_samples + return (self._all_samples if not self._override_all_samples else + self._override_all_samples()) @property def min(self): @@ -282,14 +284,19 @@ def merge(self, r): self.independent_runs.append(r.samples) else: self.independent_runs = [self.samples, r.samples] + outliers = self.samples.outliers + r.samples.outliers - all_samples = self.samples.all_samples + r.samples.all_samples self.samples = PerformanceTestSamples( self.name, sorted(self.samples.samples + r.samples.samples)) sams = self.samples + + def all_samples(): + return [s for samples in self.independent_runs + for s in samples.all_samples] + + sams._override_all_samples = all_samples self.num_samples += r.num_samples sams.outliers = outliers - sams._all_samples = all_samples self.min, self.max, self.median, self.mean, self.sd = \ sams.min, sams.max, sams.median, sams.mean, sams.sd else: From 85260cab7a8a3de11ac8e04bbb044f4883d9374d Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Sun, 1 Mar 2020 21:31:34 +0100 Subject: [PATCH 19/26] [benchmark] Dubious indicator for changes only Display the `(?)` indicator for dubious result only for changes, never for the unchanged results. Refactored `ResultComparison.init` with simplyfied range check. --- benchmark/scripts/compare_perf_tests.py | 50 ++++++++++++-------- benchmark/scripts/test_compare_perf_tests.py | 8 ++++ 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 15dd58ba8dadd..4939cd86e37fe 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -329,24 +329,24 @@ class ResultComparison(object): It computes speedup ratio and improvement delta (%). """ - def __init__(self, old, new): + def __init__(self, old_result, new_result): """Initialize with old and new `PerformanceTestResult`s to compare.""" - self.old = old - self.new = new - assert old.name == new.name - self.name = old.name # Test name, convenience accessor + self.old = old_result + self.new = new_result + assert old_result.name == new_result.name + self.name = old_result.name # Test name, convenience accessor - # Speedup ratio - self.ratio = (old.min + 0.001) / (new.min + 0.001) + # Location estimates + "epsilon" to prevent division by 0 + old = old_result.min + 0.001 + new = new_result.min + 0.001 - # Test runtime improvement in % - ratio = (new.min + 0.001) / (old.min + 0.001) - self.delta = ((ratio - 1) * 100) + self.ratio = old / new # Speedup ratio + self.delta = ((new / old) - 1) * 100 # Test runtime improvement in % - # Indication of dubious changes: when result's MIN falls inside the - # (MIN, MAX) interval of result they are being compared with. 
- self.is_dubious = ((old.min < new.min and new.min < old.max) or - (new.min < old.min and old.min < new.max)) + # Indication of dubious changes: when results' ranges overlap + o_min, o_max, n_min, n_max = \ + self.old.min, self.old.max, self.new.min, self.new.max + self.is_dubious = (o_min <= n_max and n_min <= o_max) class LogParser(object): @@ -695,11 +695,16 @@ def format_columns(r, is_strong): return (r if not is_strong else r[:-1] + (bold_first(r[-1]), )) - def table(title, results, is_strong=False, is_open=False): + def table(title, results, is_strong=False, is_open=False, + mark_dubious=True): if not results: return '' + + def dubious(r): + return ventile_formatter(r) if mark_dubious else '' + rows = [row(format_columns( - ReportFormatter.values(r, ventile_formatter), is_strong)) + ReportFormatter.values(r, dubious), is_strong)) for r in results] table = (header(title if self.single_table else '', ReportFormatter.header_for(results[0])) + @@ -712,7 +717,8 @@ def table(title, results, is_strong=False, is_open=False): table('Regression', self.comparator.decreased, True, True), table('Improvement', self.comparator.increased, True), ('' if self.changes_only else - table('No Changes', self.comparator.unchanged)), + table('No Changes', self.comparator.unchanged, + mark_dubious=False)), table('Added', self.comparator.added, is_open=True), table('Removed', self.comparator.removed, is_open=True) ]) @@ -771,9 +777,12 @@ def row(name, old, new, delta, speedup, speedup_color): def header(contents): return self.HTML_HEADER_ROW.format(* contents) - def table(title, results, speedup_color): + def table(title, results, speedup_color, mark_dubious=True): + def dubious(r): + return ' (?)' if mark_dubious else '' + rows = [ - row(*(ReportFormatter.values(r) + (speedup_color,))) + row(*(ReportFormatter.values(r, dubious) + (speedup_color,))) for r in results ] return ('' if not rows else @@ -786,7 +795,8 @@ def table(title, results, speedup_color): table('Regression', self.comparator.decreased, 'red'), table('Improvement', self.comparator.increased, 'green'), ('' if self.changes_only else - table('No Changes', self.comparator.unchanged, 'black')), + table('No Changes', self.comparator.unchanged, 'black', + mark_dubious=False)), table('Added', self.comparator.added, ''), table('Removed', self.comparator.removed, '') ])) diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index 843ad76ea9f5d..ce8fb6000f027 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -941,6 +941,14 @@ def test_emphasize_speedup(self): 10458 +0.0% 1.00x + """, + """ + + ArrayAppend + 23641 + 20000 + -15.4% + 1.18x (?) 
""" ]) From 08047e491162e48b125456c7132cb9eca5d9147b Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Sun, 1 Mar 2020 05:15:52 +0100 Subject: [PATCH 20/26] [benchmark] Python 3 Support --- benchmark/scripts/Benchmark_Driver | 30 ++++++++++++-------- benchmark/scripts/compare_perf_tests.py | 19 +++++++++---- benchmark/scripts/test_Benchmark_Driver.py | 11 ++++--- benchmark/scripts/test_compare_perf_tests.py | 29 +++++++++---------- benchmark/scripts/test_utils.py | 16 +++++++++-- 5 files changed, 64 insertions(+), 41 deletions(-) diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index 8588cc292e2cc..30ff98c73d6d4 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -34,6 +34,7 @@ import re import subprocess import sys import time +from functools import reduce from compare_perf_tests import LogParser @@ -65,7 +66,7 @@ class BenchmarkDriver(object): def _invoke(self, cmd): return self._subprocess.check_output( - cmd, stderr=self._subprocess.STDOUT) + cmd, stderr=self._subprocess.STDOUT, universal_newlines=True) @property def test_harness(self): @@ -144,7 +145,7 @@ class BenchmarkDriver(object): verbose, measure_memory, quantile, gather_metadata) output = self._invoke(cmd) results = self.parser.results_from_string(output) - return results.items()[0][1] if test else results + return list(results.items())[0][1] if test else results def _cmd_run(self, test, num_samples, num_iters, sample_time, min_samples, verbose, measure_memory, quantile, gather_metadata): @@ -219,9 +220,9 @@ class BenchmarkDriver(object): print(format(values)) def result_values(r): - return map(str, [r.test_num, r.name, r.num_samples, r.min, - r.samples.q1, r.median, r.samples.q3, r.max, - r.max_rss]) + return [str(value) for value in + [r.test_num, r.name, r.num_samples, r.min, + r.samples.q1, r.median, r.samples.q3, r.max, r.max_rss]] header = ['#', 'TEST', 'SAMPLES', 'MIN(μs)', 'Q1(μs)', 'MEDIAN(μs)', 'Q3(μs)', 'MAX(μs)', 'MAX_RSS(B)'] @@ -303,7 +304,11 @@ class MarkdownReportHandler(logging.StreamHandler): msg = self.format(record) stream = self.stream try: - if (isinstance(msg, unicode) and + unicode_type = unicode # Python 2 + except NameError: + unicode_type = str # Python 3 + try: + if (isinstance(msg, unicode_type) and getattr(stream, 'encoding', None)): stream.write(msg.encode(stream.encoding)) else: @@ -415,10 +420,10 @@ class BenchmarkDoctor(object): setup, ratio = BenchmarkDoctor._setup_overhead(measurements) setup = 0 if ratio < 0.05 else setup runtime = min( - [(result.samples.min - correction) for i_series in - [BenchmarkDoctor._select(measurements, num_iters=i) - for correction in [(setup / i) for i in [1, 2]] - ] for result in i_series]) + [(result.samples.min - correction) for correction, i_series in + [(correction, BenchmarkDoctor._select(measurements, num_iters=i)) + for i, correction in [(i, setup // i) for i in [1, 2]]] + for result in i_series]) threshold = 1000 if threshold < runtime: @@ -473,7 +478,7 @@ class BenchmarkDoctor(object): @staticmethod def _reasonable_setup_time(measurements): - setup = min([result.setup + setup = min([result.setup or 0 for result in BenchmarkDoctor._select(measurements)]) if 200000 < setup: # 200 ms BenchmarkDoctor.log_runtime.error( @@ -537,7 +542,7 @@ class BenchmarkDoctor(object): def capped(s): return min(s, 200) - run_args = [(capped(num_samples), 1), (capped(num_samples / 2), 2)] + run_args = [(capped(num_samples), 1), (capped(num_samples // 2), 2)] opts = self.driver.args.optimization opts 
= opts if isinstance(opts, list) else [opts] self.log.debug( @@ -691,6 +696,7 @@ def parse_args(args): subparsers = parser.add_subparsers( title='Swift benchmark driver commands', help='See COMMAND -h for additional arguments', metavar='COMMAND') + subparsers.required = True shared_benchmarks_parser = argparse.ArgumentParser(add_help=False) benchmarks_group = shared_benchmarks_parser.add_mutually_exclusive_group() diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index 4939cd86e37fe..f2a1af460cf76 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # -*- coding: utf-8 -*- # ===--- compare_perf_tests.py -------------------------------------------===// @@ -33,6 +33,7 @@ class `ReportFormatter` creates the test comparison report in specified format. import sys from bisect import bisect_left, bisect_right from collections import namedtuple +from functools import reduce from math import ceil, sqrt @@ -164,13 +165,14 @@ def sd(self): sqrt(self.S_runtime / (self.count - 1))) @staticmethod - def running_mean_variance((k, M_, S_), x): + def running_mean_variance(stats, x): """Compute running variance, B. P. Welford's method. See Knuth TAOCP vol 2, 3rd edition, page 232, or https://www.johndcook.com/blog/standard_deviation/ M is mean, Standard Deviation is defined as sqrt(S/k-1) """ + (k, M_, S_) = stats k = float(k + 1) M = M_ + (x - M_) / k S = S_ + (x - M_) * (x - M) @@ -662,7 +664,7 @@ def _column_widths(self): def max_widths(maximum, widths): return map(max, zip(maximum, widths)) - return reduce(max_widths, widths, [0] * 4) + return list(reduce(max_widths, widths, [0] * 4)) def _formatted_text(self, label_formatter, ventile_formatter, COLUMN_SEPARATOR, DELIMITER_ROW, SEPARATOR, SECTION): @@ -679,7 +681,8 @@ def row(contents): def header(title, column_labels): labels = (column_labels if not self.single_table else - map(label_formatter, (title, ) + column_labels[1:])) + [label_formatter(c) + for c in (title, ) + column_labels[1:]]) h = (('' if not self.header_printed else SEPARATOR) + row(labels) + (row(DELIMITER_ROW) if not self.header_printed else '')) @@ -852,8 +855,12 @@ def main(): print(report) if args.output: - with open(args.output, 'w') as f: - f.write(report) + if sys.version_info < (3, 0): + with open(args.output, 'w') as f: + f.write(report) + else: + with open(args.output, 'w', encoding='utf-8') as f: + f.write(report) if __name__ == '__main__': diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index 552dec85481cd..08479d353afab 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # -*- coding: utf-8 -*- # ===--- test_Benchmark_Driver.py ----------------------------------------===// @@ -17,12 +17,11 @@ import os import time import unittest -from StringIO import StringIO -from imp import load_source from compare_perf_tests import PerformanceTestResult from test_utils import Mock, MockLoggingHandler, Stub, captured_output +from test_utils import StringIO, load_source # import Benchmark_Driver # doesn't work because it misses '.py' extension Benchmark_Driver = load_source( @@ -45,7 +44,7 @@ def assert_contains(self, texts, output): def test_requires_command_argument(self): with captured_output() as (_, err): self.assertRaises(SystemExit, parse_args, []) - 
self.assert_contains(['usage:', 'COMMAND', 'too few arguments'], + self.assert_contains(['usage:', 'COMMAND', 'error:', 'arguments'], err.getvalue()) def test_command_help_lists_commands(self): @@ -150,7 +149,7 @@ def __init__(self, responses=None): super(SubprocessMock, self).__init__(responses) def _check_output(args, stdin=None, stdout=None, stderr=None, - shell=False): + shell=False, universal_newlines=False): return self.record_and_respond(args, stdin, stdout, stderr, shell) self.check_output = _check_output @@ -387,7 +386,7 @@ def test_log_results(self): def assert_log_written(out, log_file, content): self.assertEqual(out.getvalue(), 'Logging results to: ' + log_file + '\n') - with open(log_file, 'rU') as f: + with open(log_file, 'r') as f: text = f.read() self.assertEqual(text, "formatted output") diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index ce8fb6000f027..cad8d2e932d62 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # -*- coding: utf-8 -*- # ===--- test_compare_perf_tests.py --------------------------------------===// @@ -377,8 +377,7 @@ def test_merge(self): 1,AngryPhonebook,1,12325,12325,12325,0,12325,10510336 1,AngryPhonebook,1,11616,11616,11616,0,11616,10502144 1,AngryPhonebook,1,12270,12270,12270,0,12270,10498048""".split('\n')[1:] - results = map(PerformanceTestResult, - [line.split(',') for line in tests]) + results = [PerformanceTestResult(line.split(',')) for line in tests] def as_tuple(r): return (r.num_samples, r.min, r.max, round(r.mean, 2), @@ -524,17 +523,13 @@ class OldAndNewLog(unittest.TestCase): old_results = dict([(r.name, r) for r in - map(PerformanceTestResult, - [line.split(',') - for line in - old_log_content.splitlines()])]) + [PerformanceTestResult(line.split(',')) + for line in old_log_content.splitlines()]]) new_results = dict([(r.name, r) for r in - map(PerformanceTestResult, - [line.split(',') - for line in - new_log_content.splitlines()])]) + [PerformanceTestResult(line.split(',')) + for line in new_log_content.splitlines()]]) old_results['D'] = PerformanceTestResult( '184,D,200,648,4,1,5,9,5,3,45,40,3,1,,,,1,1,,4,4,4,268'.split(','), @@ -721,7 +716,7 @@ def test_results_from_merge(self): concatenated_logs = """4,ArrayAppend,20,23641,29000,24990,0,24990 4,ArrayAppend,1,20000,20000,20000,0,20000""" results = LogParser.results_from_string(concatenated_logs) - self.assertEqual(results.keys(), ['ArrayAppend']) + self.assertEqual(list(results.keys()), ['ArrayAppend']) result = results['ArrayAppend'] self.assertTrue(isinstance(result, PerformanceTestResult)) self.assertEqual(result.min, 20000) @@ -743,7 +738,7 @@ def test_results_from_merge_verbose(self): Sample 3,364245 3,Array2D,4,363094,376131,368159,5931,369169""" results = LogParser.results_from_string(concatenated_logs) - self.assertEqual(results.keys(), ['Array2D']) + self.assertEqual(list(results.keys()), ['Array2D']) result = results['Array2D'] self.assertTrue(isinstance(result, PerformanceTestResult)) self.assertEqual(result.min, 350815) @@ -1135,8 +1130,12 @@ def execute_main_with_format(self, report_format, test_output=False): report_out = out.getvalue() if test_output: - with open(report_file, 'r') as f: - report = f.read() + if sys.version_info < (3, 0): + with open(report_file, 'r') as f: + report = f.read() + else: + with open(report_file, 'r', encoding='utf-8') as f: + report = f.read() # because print adds 
newline, add one here, too: report_file = str(report + '\n') else: diff --git a/benchmark/scripts/test_utils.py b/benchmark/scripts/test_utils.py index 6a2bf8856a99f..150928f0aa1c4 100644 --- a/benchmark/scripts/test_utils.py +++ b/benchmark/scripts/test_utils.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # -*- coding: utf-8 -*- # ===--- test_utils.py ---------------------------------------------------===// @@ -24,9 +24,21 @@ import logging import sys -from StringIO import StringIO + +# Cross-version compatibility layer +try: + from StringIO import StringIO # for Python 2 +except ImportError: + from io import StringIO # for Python 3 from contextlib import contextmanager +if sys.version_info < (3, 4): # imp.load_source is deprecated in Python 3.4 + from imp import load_source +else: + def load_source(name, path): + from importlib.machinery import SourceFileLoader + return SourceFileLoader(name, path).load_module() + @contextmanager def captured_output(): From bf06df68ec0b595e1000ae2c92990ab43772ba03 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Tue, 3 Mar 2020 18:52:29 +0100 Subject: [PATCH 21/26] [Gardening] Refactored runtimes correction for SO Replaced list comprehension that computes the minimum from runtimes corrected for setup overhead with a procedural style that is easier to understand. --- benchmark/scripts/Benchmark_Driver | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index 30ff98c73d6d4..fbb8d326a7dc7 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -419,11 +419,12 @@ class BenchmarkDoctor(object): name = measurements['name'] setup, ratio = BenchmarkDoctor._setup_overhead(measurements) setup = 0 if ratio < 0.05 else setup - runtime = min( - [(result.samples.min - correction) for correction, i_series in - [(correction, BenchmarkDoctor._select(measurements, num_iters=i)) - for i, correction in [(i, setup // i) for i in [1, 2]]] - for result in i_series]) + runtimes = [] + for i in [1, 2]: + correction = setup // i + for result in BenchmarkDoctor._select(measurements, num_iters=i): + runtimes.append(result.samples.min - correction) + runtime = min(runtimes) threshold = 1000 if threshold < runtime: From 7c5cead1004624e5b61995d90dceb8baf5e3152d Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Tue, 3 Mar 2020 22:10:50 +0100 Subject: [PATCH 22/26] [benchmark] Compare with stable location estimate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the most stable location estimate (MIN, P05, P10, Q1, MED) for `ResultComparison` based on the standard deviation for given quantiles across the independent runs and the aggregate sample (hardening against outlier runs). We multiply the standard deviations for each quantile between the OLD and NEW results to get “cross-sample” variance. We pick the quantile with lowest variance as most stable and use it for the comparisons. 
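To make the selection concrete, here is a simplified standalone sketch with
made-up per-run quantile values; the patch itself computes the standard
deviations with `PerformanceTestSamples` and also mixes the aggregated
samples in with the independent runs, which this toy version leaves out:

    from statistics import stdev

    quantiles = ['MIN', 'P05', 'P10', 'Q1', 'MED']
    old_runs = [[100, 101, 102, 103, 104],   # one row per independent run
                [94, 99, 102, 103, 105],     # of the OLD binary (μs)
                [100, 101, 102, 104, 104]]
    new_runs = [[98, 100, 101, 102, 103],    # runs of the NEW binary
                [99, 100, 101, 102, 103]]

    def sd_per_quantile(runs):
        # σ of each location estimate across runs; +1 keeps the product
        # meaningful when a factor would be 0 (multiplicative identity).
        return [stdev(column) + 1 for column in zip(*runs)]

    # "Cross-sample" variance σ₁×σ₂ per quantile; the smallest product wins.
    products = [o * n for o, n in zip(sd_per_quantile(old_runs),
                                      sd_per_quantile(new_runs))]
    location = min(zip(quantiles, products), key=lambda pair: pair[1])[0]
    print(location)  # 'P10' for these made-up numbers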
--- benchmark/scripts/compare_perf_tests.py | 50 +++++++++-- benchmark/scripts/test_compare_perf_tests.py | 91 ++++++++++++++++++++ 2 files changed, 136 insertions(+), 5 deletions(-) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index f2a1af460cf76..b14361fc11617 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -338,18 +338,58 @@ def __init__(self, old_result, new_result): assert old_result.name == new_result.name self.name = old_result.name # Test name, convenience accessor - # Location estimates + "epsilon" to prevent division by 0 - old = old_result.min + 0.001 - new = new_result.min + 0.001 + self.find_most_stable_location_estimates() + + # To avoid division by 0 in ratios, adjust the values by "epsilon" + old = self.old_location + 0.001 + new = self.new_location + 0.001 self.ratio = old / new # Speedup ratio self.delta = ((new / old) - 1) * 100 # Test runtime improvement in % # Indication of dubious changes: when results' ranges overlap o_min, o_max, n_min, n_max = \ - self.old.min, self.old.max, self.new.min, self.new.max + old_result.min, old_result.max, new_result.min, new_result.max self.is_dubious = (o_min <= n_max and n_min <= o_max) + def find_most_stable_location_estimates(self): + def independent_runs(result): + return (result.independent_runs + if (hasattr(result, 'independent_runs') and + len(result.independent_runs) > 1) else None) + + old_runs = independent_runs(self.old) + new_runs = independent_runs(self.new) + + if (old_runs is None or new_runs is None): + self.location = 'MIN' + self.old_location = self.old.min + self.new_location = self.new.min + return + + locations = ['MIN', 'P05', 'P10', 'Q1', 'MED'] + quantiles = [0.0, 0.05, 0.1, 0.25, 0.5] + + # Mix in the aggregated samples to bias selection against outlier runs + old_runs += [self.old.samples] + new_runs += [self.new.samples] + + def q_sd(runs): + """Compute standard deviation for given quantile across runs.""" + # Adjust σ by 1 to always pick smaller σ in case one of variance + # factors is less than 1 or a 0. The multiplicative identity is 1. + return [PerformanceTestSamples('', values).sd + 1 + for values in [[run.quantile(q) for run in runs] + for q in quantiles]] + + self.location, most_stable_location, _, _ = \ + min(zip(locations, quantiles, q_sd(old_runs), q_sd(new_runs)), + # Compute "cross-sample" variance: σ₁×σ₂ + key=lambda t: t[2] * t[3]) + + self.old_location = self.old.samples.quantile(most_stable_location) + self.new_location = self.new.samples.quantile(most_stable_location) + class LogParser(object): """Converts log outputs into `PerformanceTestResult`s. 
@@ -583,7 +623,7 @@ def values(result, dubious_formatter=lambda r: ' (?)'): if isinstance(result, PerformanceTestResult) else # isinstance(result, ResultComparison) (result.name, - str(result.old.min), str(result.new.min), + str(result.old_location), str(result.new_location), '{0:+.1f}%'.format(result.delta), '{0:.2f}x{1}'.format( result.ratio, diff --git a/benchmark/scripts/test_compare_perf_tests.py b/benchmark/scripts/test_compare_perf_tests.py index cad8d2e932d62..e36c1e55aa3de 100644 --- a/benchmark/scripts/test_compare_perf_tests.py +++ b/benchmark/scripts/test_compare_perf_tests.py @@ -463,6 +463,9 @@ def test_init(self): self.assertAlmostEqual(rc.ratio, 12325.0 / 11616.0) self.assertAlmostEqual(rc.delta, (((11616.0 / 12325.0) - 1) * 100), places=3) + self.assertEqual(rc.location, 'MIN') + self.assertEqual([rc.old_location, rc.new_location], + [rc.old.min, rc.new.min]) # handle test results that sometimes change to zero, when compiler # optimizes out the body of the incorrectly written test rc = ResultComparison(self.r0, self.r0) @@ -489,6 +492,94 @@ def test_values_is_dubious(self): # other way around: old.min < new.min < old.max self.assertTrue(ResultComparison(self.r2, self.r1).is_dubious) + def test_use_most_stable_location_for_comparison(self): + """ + Select the most stable location estimate (MIN, P05, P10, Q1, MED). + + When the results contain samples from multiple independent runs, use + the empirical distribution to select the location estimate with lowest + variance and use it in result comparison. + """ + def compare(log): + results = [ + PerformanceTestResult(l.split(','), quantiles=True, delta=True) + for l in log.split('\n')[1:-1]] + results[0].merge(results[1]) + results[2].merge(results[3]) + return ResultComparison(results[0], results[2]) + + # --quantile=20 --delta FlattenListFlatMap + rc = compare(""" +0,F,21,3721,147,1689,83,44,138,95,6,3,19,19,26,23,45,64,88,84,34,56,233,1 +0,F,21,5284,16,560,23,25,15,29,13,6,4,19,16,15,26,32,31,72,23,26,104,1 +0,F,21,5318,82,507,25,13,23,12,4,9,25,23,27,14,27,13,6,8,38,89,265,1 +0,F,21,4701,12,1151,28,13,32,5,11,26,9,33,16,28,14,13,46,185,126,16,64,1 +""") + self.assertTrue(rc.is_dubious) + self.assertEqual([rc.old.min, rc.new.min], [3721, 4701]) + self.assertEqual([rc.old.median, rc.new.median], [5975, 5988]) + self.assertEqual(rc.location, 'MED') + self.assertEqual([rc.old_location, rc.new_location], [5975, 5988]) + # self.assertAlmostEqual(rc.delta, 26.34, places=2) # delta from MIN + self.assertAlmostEqual(rc.delta, 0.22, places=2) # delta from MED + + # --quantile=20 --delta ObjectiveCBridgeStubToNSDateRef O + rc = compare(""" +0,O,21,128,8,1,,4,3,3,2,,,3,,1,,,,3,,1,2,1 +0,O,21,119,16,2,,3,3,1,5,,,,1,2,1,,2,1,1,,2,1 +0,O,21,125,7,5,,1,5,1,3,2,,,,2,1,1,1,2,1,1,2,1 +0,O,21,119,17,,1,,2,5,4,1,,,,1,1,,1,,,3,3,1 +""") + self.assertTrue(rc.is_dubious) + self.assertEqual([rc.old.min, rc.new.min], [119, 119]) + self.assertEqual([rc.old.samples.quantile(0.1), + rc.new.samples.quantile(0.1)], [137, 136]) + self.assertEqual(rc.location, 'P10') + self.assertEqual([rc.old_location, rc.new_location], [137, 136]) + self.assertAlmostEqual(rc.delta, -0.73, places=2) + + # --quantile=20 --delta DictionaryBridgeToObjC_Bridge -Onone + rc = compare(""" +0,D,21,15,,,,,,,,,,,,,1,,,,,,, +0,D,21,15,,,,,,,,,,,,,,,,,,,, +0,D,21,14,1,,,,,,,,,,,,,,,,,,, +0,D,21,15,,,,,,,,,,1,,,,,,,1,,, +""") + self.assertTrue(rc.is_dubious) + self.assertEqual([rc.old.min, rc.new.min], [15, 14]) + self.assertEqual([rc.old.samples.quantile(0.05), + 
rc.new.samples.quantile(0.05)], [15, 15]) + # self.assertEqual(rc.location, 'P10') + self.assertEqual(rc.location, 'P05') + self.assertEqual([rc.old_location, rc.new_location], [15, 15]) + + def test_stable_location_vs_outlier_runs(self): + "Location estimate should be robust in presence of outlier runs." + def synth(min): + r = ('0,S,21,' + str(min) + ',1,1,,,1,,,,,1,,,,,2,,,,,').split(',') + return PerformanceTestResult(r, quantiles=True, delta=True) + + s, t, u = synth(100), synth(100), synth(100) + self.assertEqual( + [s.min, s.samples.quantile(0.05), s.samples.quantile(0.1), + s.samples.q1, s.median, s.samples.q3], + [100, 101, 102, 103, 104, 106]) + + [s.merge(synth(100)) for i in range(1, 11)] + [t.merge(synth(100)) for i in range(1, 10)] + [u.merge(synth(100)) for i in range(1, 9)] + t.merge(synth(94)) # one outlier run + u.merge(synth(94)) # two outlier runs + u.merge(synth(94)) + + rst = ResultComparison(s, t) + self.assertEqual(rst.location, 'Q1') + self.assertEqual([rst.old_location, rst.new_location], [103, 102]) + + rsu = ResultComparison(s, u) + self.assertEqual(rsu.location, 'MED') + self.assertEqual([rsu.old_location, rsu.new_location], [104, 103]) + class FileSystemIntegration(unittest.TestCase): def setUp(self): From a06cb2cbdc5dec74443c3a06c7398f79d12d7099 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Wed, 4 Mar 2020 03:27:02 +0100 Subject: [PATCH 23/26] [benchmark] Benchmark_Driver: faster iterations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the faster benchmarks in the Swift Benchmark Suite, we now don’t need to spend a whole 1 second measuring each one of them. So I’m adjusting the Benchmark_Driver to sample each one for 50 ms and gather up to 102 actual runtime values (percentiles + MIN + MAX). In my tests, for most optimized benchmarks, the sample distribution was still roughly comparable with full-second measurements. It is more important to gather samples from multiple independent runs to cover the possible variations in distribution.
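For reference, one independent sample under this change boils down to an invocation like the sketch below; the binary path and test name are made-up placeholders, while the flag set mirrors the expectation encoded in test_run_benchmark_independent_samples in the diff that follows.

    import subprocess

    # Hypothetical binary path and test name; only the flags come from this patch.
    cmd = ['./Benchmark_O', 'SomeBenchmark',
           '--min-samples=10', '--num-iters=1', '--sample-time=0.05',
           '--memory', '--meta', '--quantile=100', '--delta']
    # Each run reports up to 102 runtime values (percentiles plus MIN and MAX);
    # Benchmark_Driver repeats this `independent_samples` times and merges them.
    output = subprocess.check_output(cmd, universal_newlines=True)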
--- benchmark/scripts/Benchmark_Driver | 3 ++- benchmark/scripts/test_Benchmark_Driver.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/benchmark/scripts/Benchmark_Driver b/benchmark/scripts/Benchmark_Driver index fbb8d326a7dc7..6a37b7b6146bb 100755 --- a/benchmark/scripts/Benchmark_Driver +++ b/benchmark/scripts/Benchmark_Driver @@ -184,7 +184,8 @@ class BenchmarkDriver(object): return a return reduce(merge_results, - [self.run(test, num_iters=1, quantile=20, + [self.run(test, num_iters=1, min_samples=10, + sample_time=0.05, quantile=100, measure_memory=True, gather_metadata=True) for _ in range(self.args.independent_samples)]) diff --git a/benchmark/scripts/test_Benchmark_Driver.py b/benchmark/scripts/test_Benchmark_Driver.py index 08479d353afab..c097e100f7192 100644 --- a/benchmark/scripts/test_Benchmark_Driver.py +++ b/benchmark/scripts/test_Benchmark_Driver.py @@ -337,12 +337,13 @@ def test_report_quantiles(self): ('/benchmarks/Benchmark_O', 'b', '--quantile=4', '--delta')) def test_run_benchmark_independent_samples(self): - """Extract up to 20 measurements from an independent run.""" + """Extract up to 100 measurements from an independent run.""" self.driver.args.independent_samples = 3 r = self.driver.run_independent_samples('b1') self.assertEqual(self.subprocess_mock.calls.count( - ('/benchmarks/Benchmark_O', 'b1', '--num-iters=1', '--memory', - '--meta', '--quantile=20', '--delta')), 3) + ('/benchmarks/Benchmark_O', 'b1', '--min-samples=10', + '--num-iters=1', '--sample-time=0.05', + '--memory', '--meta', '--quantile=100', '--delta')), 3) self.assertEqual(r.num_samples, 3) # results are merged def test_run_and_log(self): From a8751cc3b23157fa73fffd630e9a8d47c2156db1 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Wed, 4 Mar 2020 03:35:50 +0100 Subject: [PATCH 24/26] [benchmark] run_smoke_bench: more robust sampling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the faster benchmarks in the Swift Benchmark Suite (now that the Legacy Factor refactoring is finished), we don’t need to be so extremely frugal with the sample count. Gathering just the first 3 to 10 samples per benchmark was not very representative from the statistical point of view. I suspect it hides Type II errors — unreported changes. Adjusting the measurement method to sample each benchmark for 50 ms and gather at minimum 10 samples. For the suspected changes, gather up to 10 independent samples. Also thoroughly measure the newly added test in re-runs. --- benchmark/scripts/run_smoke_bench | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/benchmark/scripts/run_smoke_bench b/benchmark/scripts/run_smoke_bench index f478c7e95a869..a0a640890b6fe 100755 --- a/benchmark/scripts/run_smoke_bench +++ b/benchmark/scripts/run_smoke_bench @@ -96,7 +96,7 @@ def main(): argparser.add_argument( '-num-reruns', type=int, help="The number of re-runs until it's assumed to be a real change", - default=8) + default=10) argparser.add_argument( '-platform', type=str, help='The benchmark build platform', default='macosx') @@ -153,18 +153,14 @@ def test_opt_levels(args): def measure(driver, tests, i): - """Log and measure samples of the tests with the given driver. - - Collect increasing number of samples, depending on the iteration.
- """ - num_samples = min(i + 3, 10) - msg = ' Iteration {0} for {1}: num samples = {2}, '.format( - i, driver.args.tests, num_samples) + """Log and measure samples of the tests with the given driver.""" + msg = ' Iteration {0} for {1}:, '.format(i, driver.args.tests) msg += ('running all tests' if driver.all_tests == tests else 're-testing {0} tests'.format(len(tests))) log(msg) driver.tests = tests - return driver.run(num_samples=num_samples, sample_time=0.0025) + return driver.run( + num_iters=1, min_samples=10, sample_time=0.05, quantile=20) def merge(results, other_results): @@ -178,18 +174,17 @@ def test_performance(opt_level, old_dir, new_dir, threshold, num_samples, num_reruns, output_file): """Detect performance changes in benchmarks. - Start fast with few samples per benchmark and gradually spend more time - gathering more precise measurements of the change candidates. + Gather more independent measurements of the change candidates. """ - i, unchanged_length_count = 0, 0 + i, run_count = 0, 0 old, new = [BenchmarkDriver(DriverArgs(dir, optimization=opt_level)) for dir in [old_dir, new_dir]] results = [measure(driver, driver.tests, i) for driver in [old, new]] tests = TestComparator(results[0], results[1], threshold) - changed = tests.decreased + tests.increased + changed = tests.decreased + tests.increased + tests.added - while len(changed) > 0 and unchanged_length_count < num_reruns: + while len(changed) > 0 and run_count < num_reruns: i += 1 if VERBOSE: log(' test again: ' + str([test.name for test in changed])) @@ -197,12 +192,8 @@ def test_performance(opt_level, old_dir, new_dir, threshold, num_samples, measure(driver, [test.name for test in changed], i)) for the_results, driver in zip(results, [old, new])] tests = TestComparator(results[0], results[1], threshold) - changed = tests.decreased + tests.increased - - if len(old.tests) == len(changed): - unchanged_length_count += 1 - else: - unchanged_length_count = 0 + changed = tests.decreased + tests.increased + tests.added + run_count += 1 log('') return report_results("Performance: -" + opt_level, None, None, From 2a252fb5b45a3aba6039d4226ed1188bddb41500 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Wed, 4 Mar 2020 03:39:10 +0100 Subject: [PATCH 25/26] [benchmark] run_smoke_bench cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removed unused `num_samples` argument and redundant `run_count` variable. Also gather metadata (currently unused). Print the actual threshold in the “How to read the data” description.
--- benchmark/scripts/run_smoke_bench | 36 +++++++++++++++---------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/benchmark/scripts/run_smoke_bench b/benchmark/scripts/run_smoke_bench index a0a640890b6fe..dcb40cdf23e32 100755 --- a/benchmark/scripts/run_smoke_bench +++ b/benchmark/scripts/run_smoke_bench @@ -88,11 +88,9 @@ def main(): help='In addition to stdout, write the results into a markdown file') argparser.add_argument( '-threshold', type=float, - help='The performance threshold in %% which triggers a re-run', - default=5) - argparser.add_argument( - '-num-samples', type=int, - help='The (minimum) number of samples to run', default=3) + help='The performance threshold in %% which triggers a re-run' + ' (default: 5)', + default=5.0) argparser.add_argument( '-num-reruns', type=int, help="The number of re-runs until it's assumed to be a real change", @@ -123,8 +121,9 @@ def test_opt_levels(args): if not args.skip_performance: if test_performance(opt_level, args.oldbuilddir[0], args.newbuilddir[0], - float(args.threshold) / 100, args.num_samples, - args.num_reruns, output_file): + args.threshold / 100, + args.num_reruns, + output_file): changes = True # There is no point in reporting code size for Onone. @@ -145,7 +144,7 @@ def test_opt_levels(args): if output_file: if changes: - output_file.write(get_info_text()) + output_file.write(get_info_text(args.threshold)) else: output_file.write("### No performance and code size changes") output_file.close() @@ -154,13 +153,14 @@ def test_opt_levels(args): def measure(driver, tests, i): """Log and measure samples of the tests with the given driver.""" - msg = ' Iteration {0} for {1}:, '.format(i, driver.args.tests) + msg = ' Iteration {0} for {1}: '.format(i, driver.args.tests) msg += ('running all tests' if driver.all_tests == tests else 're-testing {0} tests'.format(len(tests))) log(msg) driver.tests = tests return driver.run( - num_iters=1, min_samples=10, sample_time=0.05, quantile=20) + num_iters=1, min_samples=10, sample_time=0.05, quantile=20, + gather_metadata=True) def merge(results, other_results): @@ -170,21 +170,20 @@ def merge(results, other_results): return results -def test_performance(opt_level, old_dir, new_dir, threshold, num_samples, +def test_performance(opt_level, old_dir, new_dir, threshold, num_reruns, output_file): """Detect performance changes in benchmarks. Gather more independent measurements of the change candidates. 
""" - - i, run_count = 0, 0 + i = 0 old, new = [BenchmarkDriver(DriverArgs(dir, optimization=opt_level)) for dir in [old_dir, new_dir]] results = [measure(driver, driver.tests, i) for driver in [old, new]] tests = TestComparator(results[0], results[1], threshold) changed = tests.decreased + tests.increased + tests.added - while len(changed) > 0 and run_count < num_reruns: + while len(changed) > 0 and i < num_reruns: i += 1 if VERBOSE: log(' test again: ' + str([test.name for test in changed])) @@ -193,11 +192,10 @@ def test_performance(opt_level, old_dir, new_dir, threshold, num_samples, for the_results, driver in zip(results, [old, new])] tests = TestComparator(results[0], results[1], threshold) changed = tests.decreased + tests.increased + tests.added - run_count += 1 log('') return report_results("Performance: -" + opt_level, None, None, - threshold * 1.4, output_file, *results) + threshold, output_file, *results) def report_code_size(opt_level, old_dir, new_dir, platform, output_file): @@ -259,11 +257,11 @@ def report_results(title, old_lines, new_lines, threshold, output_file, return False -def get_info_text(): +def get_info_text(threshold): text = """
How to read the data -The tables contain differences in performance which are larger than 8% and +The tables contain differences in performance which are larger than {0}% and differences in code size which are larger than 1%. If you see any unexpected regressions, you should consider fixing the @@ -279,7 +277,7 @@ performance team (@eeckstein).
Hardware Overview -""" +""".format(threshold) po = subprocess.check_output(['system_profiler', 'SPHardwareDataType']) for line in po.splitlines(): selection = ['Model Name', From 5443fbac7b64f6320630725a102b94154227ff62 Mon Sep 17 00:00:00 2001 From: Pavol Vaskovic Date: Wed, 4 Mar 2020 07:28:50 +0100 Subject: [PATCH 26/26] [DNM] Dump quartiles for all independent runs --- benchmark/scripts/compare_perf_tests.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmark/scripts/compare_perf_tests.py b/benchmark/scripts/compare_perf_tests.py index b14361fc11617..502b6d6b86df8 100755 --- a/benchmark/scripts/compare_perf_tests.py +++ b/benchmark/scripts/compare_perf_tests.py @@ -685,6 +685,12 @@ def ventiles(samples, QUARTILE): v += ventiles(result.old.samples, OLD_QUARTILE) v += MIDDLE v += ventiles(result.new.samples, NEW_QUARTILE) + v += MIDDLE + v += MIDDLE.join([ventiles(s, OLD_QUARTILE) for s in + result.old.independent_runs]) + v += MIDDLE + v += MIDDLE.join([ventiles(s, NEW_QUARTILE) for s in + result.new.independent_runs]) v += END return v
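For readers who want to reproduce this debug dump offline, a rough stand-in for the ventile computation it relies on is sketched below; the actual ventiles() helper and the MIDDLE/END separators are defined in surrounding code that this hunk does not show, so treat this only as an assumed approximation.

    def ventile_cuts(samples):
        # The 21 cut points (MIN, 19 inner ventiles, MAX) of a sample list,
        # using a simple nearest-rank rule; an assumed stand-in for the
        # ventiles() helper referenced in the hunk above.
        s = sorted(samples)
        return [s[min(int(round(i * (len(s) - 1) / 20.0)), len(s) - 1)]
                for i in range(21)]

Printing these cut points for every independent run of the OLD and NEW results, visually separated, makes it easy to see how much the sample distribution drifts between runs, which is the point of this do-not-merge diagnostic patch.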