This repository was archived by the owner on Jul 24, 2024. It is now read-only.

Commit bd18c2d

Added more measurement info like p50, p90 (#87)
1 parent 6886feb commit bd18c2d

4 files changed: 49 additions and 45 deletions

.github/workflows/execute-test-script.yml

Lines changed: 1 addition & 1 deletion

@@ -116,7 +116,7 @@ jobs:
             URL="--url ${{ secrets.DB_URL }}"
           fi

-          export DL_BENCH_ARGS="--host ${{ inputs.runner_type }} --compiler ${{ inputs.compiler }} --device ${{ inputs.device }} --tag ${{ inputs.tag }} -v ${URL}"
+          export DL_BENCH_ARGS="--host ${{ inputs.runner_type }} --compiler ${{ inputs.compiler }} --device ${{ inputs.device }} --tag ${{ inputs.tag }} ${URL}"

          # We mainly want to verify our own backend
          if [[ ${{ inputs.compiler }} != *torch_mlir* ]]; then
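The dropped "-v" pairs with the launcher.py change below, which removes the "-v"/"--verbose" CLI flag; keeping "-v" in DL_BENCH_ARGS would make the launcher's argparse reject the invocation. A minimal standalone sketch of that failure mode (not code from this repository):

import argparse

# Parser with "--verbose" already removed, mirroring the launcher.py change below.
parser = argparse.ArgumentParser()
parser.add_argument("--url", required=False)

# Passing the old "-v" now fails: argparse prints
# "error: unrecognized arguments: -v" and exits.
parser.parse_args(["-v", "--url", "http://example"])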

.github/workflows/test-single-config.yml

Lines changed: 0 additions & 1 deletion

@@ -78,7 +78,6 @@ jobs:
       torch_mlir_repo: ${{ inputs.torch_mlir_repo }}
       torch_mlir_branch: ${{ inputs.torch_mlir_branch }}
       runner_type: ${{ inputs.runner_type }}
-      shutdown_cloud_runner: ${{ inputs.shutdown_cloud_runner }}
       test_script: ${{ matrix.test_script }}
     secrets:
       DB_URL: ${{ secrets.DB_URL }}

dl_bench/cli/launcher.py

Lines changed: 10 additions & 12 deletions

@@ -110,9 +110,6 @@ def parse_args():
     parser.add_argument(
         "-o", "--output", required=False, help="Path to output report file."
     )
-    parser.add_argument(
-        "-v", "--verbose", required=False, action="store_true", help="Verbose mode."
-    )
     parser.add_argument(
         "--skip_verification",
         required=False,
@@ -185,16 +182,17 @@ def main():

     db = BenchmarkDb(args.url)

-    if args.verbose:
-        print("Report:")
-        print(
-            "TFLOPS: {:.3}".format(
-                results.get("flops_per_sample", 0)
-                * results.get("samples_per_s", 0)
-                / (10**12)
-            )
+    print("Report:")
+    print("FPS: {:.1f}".format(results.get("samples_per_s", 0)))
+    print(
+        "TFLOPS: {:.3}".format(
+            results.get("flops_per_sample", 0)
+            * results.get("samples_per_s", 0)
+            / (10**12)
         )
-        pprint.pprint(report)
+    )
+    pprint.pprint(report)
+    pprint.pprint(results)

     if args.output is not None:
         with open(args.output, "w", encoding="UTF-8") as out:
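With this change the report is printed unconditionally and gains an FPS line next to TFLOPS. A minimal sketch of the reporting math with a made-up results dict (the numbers are illustrative only, not benchmark output):

results = {"samples_per_s": 512.0, "flops_per_sample": 8.2e9}  # illustrative values

fps = results.get("samples_per_s", 0)
tflops = results.get("flops_per_sample", 0) * results.get("samples_per_s", 0) / (10**12)

print("FPS: {:.1f}".format(fps))       # FPS: 512.0
print("TFLOPS: {:.3}".format(tflops))  # TFLOPS: 4.2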

dl_bench/utils.py

Lines changed: 38 additions & 31 deletions

@@ -385,54 +385,61 @@ def inference(self, backend: Backend):
         self.compile(sample, backend)

         n_items = 0
-
-        self.net.eval()
         outputs = []
         fw_times = []
+
+        self.net.eval()
         with torch.no_grad():
             start = time.perf_counter()
-            # Duration is inconsistent now
-            with tm.timeit("duration_s"):
-                for i, x in enumerate(test_loader):
-                    backend.sync()
-                    s = get_time()
-                    x = backend.to_device(x)
-                    if backend.dtype != torch.float32:
-                        with torch.autocast(
-                            device_type=backend.device_name,
-                            dtype=backend.dtype,
-                        ):
-                            y = self.net(x)
-                    else:
+            for i, x in enumerate(test_loader):
+                backend.sync()
+                s = get_time()
+                x = backend.to_device(x)
+                if backend.dtype != torch.float32:
+                    with torch.autocast(
+                        device_type=backend.device_name,
+                        dtype=backend.dtype,
+                    ):
                         y = self.net(x)
+                else:
+                    y = self.net(x)

-                    if i < self.warmup_batches:
-                        start = time.perf_counter()
-                        continue
+                backend.sync()

-                    backend.sync()
-                    fw_times.append(get_time() - s)
-                    n_items += len(x)
-                    outputs.append(y)
+                if i < self.warmup_batches:
+                    # We restart timer because that was just a warmup
+                    start = time.perf_counter()
+                    continue

-                    # early stopping if we have 10+ batches and were running for 10+ seconds
-                    if (
-                        (time.perf_counter() - start) > self.min_seconds
-                        and n_items >= self.batch_size * self.min_batches
-                    ):
-                        break
+                fw_times.append(get_time() - s)
+                n_items += len(x)
+                outputs.append(y)
+
+                # early stopping if we have 10+ batches and were running for 10+ seconds
+                if (
+                    (time.perf_counter() - start) > self.min_seconds
+                    and n_items >= self.batch_size * self.min_batches
+                ):
+                    break
+
+                if (get_time() - start) > max_time:
+                    break

-                    if (get_time() - start) > max_time:
-                        break
+        stop = get_time()

         print(
             f"Latency 0%-5%-50%-95%-100% are: {np.percentile(fw_times, [0, 5, 50, 95, 100])}"
         )

         results = tm.get_results()
-        results["duration_s"] = get_time() - start
+        results["duration_s"] = stop - start
         results["samples_per_s"] = n_items / sum(fw_times)
+        results["dirty_items_per_s"] = n_items / results["duration_s"]
         results["flops_per_sample"] = self.flops_per_sample
+        results["n_items"] = n_items
+        results["p50"] = np.percentile(fw_times, 50)
+        results["p90"] = np.percentile(fw_times, 90)
+        results["p100"] = max(fw_times)

         return results, outputs
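The new result fields separate pure forward-pass throughput (samples_per_s) from end-to-end throughput (dirty_items_per_s, which includes data loading and sync overhead) and add per-batch latency percentiles. A minimal sketch of the same calculations on synthetic timings (names and numbers are illustrative, not taken from the benchmark):

import numpy as np

fw_times = [0.051, 0.048, 0.050, 0.047, 0.095]  # seconds per measured batch (synthetic)
n_items = 5 * 32                                # 5 batches of 32 samples (synthetic)
duration_s = 0.30                               # wall-clock time incl. data loading (synthetic)

results = {
    "duration_s": duration_s,
    "samples_per_s": n_items / sum(fw_times),    # forward passes only
    "dirty_items_per_s": n_items / duration_s,   # includes everything outside the forward pass
    "n_items": n_items,
    "p50": np.percentile(fw_times, 50),          # median batch latency
    "p90": np.percentile(fw_times, 90),
    "p100": max(fw_times),                       # worst observed batch latency
}
print(results)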
