Merge branch 'master' into i6662-public-record-filter

DynamoRIO · May 6, 2024 · 82a113b · 82a113b
2 parents bc7e4a0 + 3e1ec2f
commit 82a113b
Show file tree

Hide file tree

Showing 9 changed files with 752 additions and 81 deletions.
diff --git a/api/docs/scatter_gather_emulation.dox b/api/docs/scatter_gather_emulation.dox
diff --git a/clients/drcachesim/analyzer.cpp b/clients/drcachesim/analyzer.cpp
@@ -571,26 +571,34 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_serial(analyzer_worker_data_t &
 template <typename RecordType, typename ReaderType>
 bool
 analyzer_tmpl_t<RecordType, ReaderType>::process_shard_exit(
-    analyzer_worker_data_t *worker, int shard_index)
+    analyzer_worker_data_t *worker, int shard_index, bool do_process_final_interval)
 {
     VPRINT(this, 1, "Worker %d finished trace shard %s\n", worker->index,
            worker->stream->get_stream_name().c_str());
     worker->shard_data[shard_index].exited = true;
-    if ((interval_microseconds_ != 0 || interval_instr_count_ != 0) &&
-        (!process_interval(worker->shard_data[shard_index].cur_interval_index,
-                           worker->shard_data[shard_index].cur_interval_init_instr_count,
-                           worker,
-                           /*parallel=*/true, /*at_instr_record=*/false, shard_index) ||
-         !finalize_interval_snapshots(worker, /*parallel=*/true, shard_index)))
-        return false;
+    if (interval_microseconds_ != 0 || interval_instr_count_ != 0) {
+        if (!do_process_final_interval) {
+            ERRMSG("i#6793: Skipping process_interval for final interval of shard index "
+                   "%d\n",
+                   shard_index);
+        } else if (!process_interval(
+                       worker->shard_data[shard_index].cur_interval_index,
+                       worker->shard_data[shard_index].cur_interval_init_instr_count,
+                       worker,
+                       /*parallel=*/true, /*at_instr_record=*/false, shard_index)) {
+            return false;
+        }
+        if (!finalize_interval_snapshots(worker, /*parallel=*/true, shard_index)) {
+            return false;
+        }
+    }
     for (int i = 0; i < num_tools_; ++i) {
         if (!tools_[i]->parallel_shard_exit(
                 worker->shard_data[shard_index].tool_data[i].shard_data)) {
             worker->error = tools_[i]->parallel_shard_error(
                 worker->shard_data[shard_index].tool_data[i].shard_data);
-            VPRINT(this, 1, "Worker %d hit shard exit error %s on trace shard %s\n",
-                   worker->index, worker->error.c_str(),
-                   worker->stream->get_stream_name().c_str());
+            VPRINT(this, 1, "Worker %d hit shard exit error %s on trace shard index %d\n",
+                   worker->index, worker->error.c_str(), shard_index);
             return false;
         }
     }
@@ -693,9 +701,18 @@ analyzer_tmpl_t<RecordType, ReaderType>::process_tasks_internal(
             }
         }
     }
+    // i#6444: Fallback for cases where there is a missing thread final record in
+    // non-core-sharded traces, in which case we have not yet invoked
+    // process_shard_exit.
     for (const auto &keyval : worker->shard_data) {
         if (!keyval.second.exited) {
-            if (!process_shard_exit(worker, keyval.second.shard_index)) {
+            // i#6793: We skip processing the final interval for shards exited here
+            // if the stream has already moved on and cannot provide the state for
+            // the shard anymore.
+            bool do_process_final_interval =
+                keyval.second.shard_index == worker->stream->get_shard_index();
+            if (!process_shard_exit(worker, keyval.second.shard_index,
+                                    do_process_final_interval)) {
                 return false;
             }
         }

diff --git a/clients/drcachesim/analyzer.h b/clients/drcachesim/analyzer.h
@@ -248,7 +248,8 @@ template <typename RecordType, typename ReaderType> class analyzer_tmpl_t {
     // Helper for process_tasks() which calls parallel_shard_exit() in each tool.
     // Returns false if there was an error and the caller should return early.
     bool
-    process_shard_exit(analyzer_worker_data_t *worker, int shard_index);
+    process_shard_exit(analyzer_worker_data_t *worker, int shard_index,
+                       bool do_process_final_interval = true);
 
     bool
     record_has_tid(RecordType record, memref_tid_t &tid);

diff --git a/clients/drcachesim/tests/core_serial.templatex b/clients/drcachesim/tests/core_serial.templatex
@@ -1,7 +1,7 @@
 Schedule stats tool results:
 Total counts:
            4 cores
-           8 threads
+           8 threads: 1257.*
       638938 instructions
            6 total context switches
    0.0093906 CSPKI \(context switches per 1000 instructions\)
@@ -17,6 +17,7 @@ Total counts:
       345686 idles
        64.89% cpu busy by record count
     [0-9 ]* cpu microseconds
+    [0-9 ]* wait microseconds
     [0-9 ]* idle microseconds
     [0-9 ]* idle microseconds at last instr
     [0-9\. ]*% cpu busy by time

diff --git a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
@@ -1,7 +1,7 @@
 Schedule stats tool results:
 Total counts:
            4 cores
-           8 threads
+           8 threads: 1257.*
       638938 instructions
            6 total context switches
    0\.0093906 CSPKI \(context switches per 1000 instructions\)
@@ -17,12 +17,13 @@ Total counts:
     *[0-9]* idles
       *[0-9\.]*% cpu busy by record count
    *[0-9]* cpu microseconds
+   *[0-9]* wait microseconds
    *[0-9]* idle microseconds
    *[0-9]* idle microseconds at last instr
       *[0-9\.]*% cpu busy by time
       *[0-9\.]*% cpu busy by time, ignoring idle past last instr
 Core #0 counts:
-           . threads
+           . threads: 1257.*
       *[0-9]* instructions
            . total context switches
    0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
@@ -38,12 +39,13 @@ Core #0 counts:
     *[0-9]* idles
       *[0-9\.]*% cpu busy by record count
    *[0-9]* cpu microseconds
+   *[0-9]* wait microseconds
    *[0-9]* idle microseconds
    *[0-9]* idle microseconds at last instr
       *[0-9\.]*% cpu busy by time
       *[0-9\.]*% cpu busy by time, ignoring idle past last instr
 Core #1 counts:
-           . threads
+           . threads: 1257.*
       *[0-9]* instructions
            . total context switches
    0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
@@ -59,12 +61,13 @@ Core #1 counts:
     *[0-9]* idles
       *[0-9\.]*% cpu busy by record count
    *[0-9]* cpu microseconds
+   *[0-9]* wait microseconds
    *[0-9]* idle microseconds
    *[0-9]* idle microseconds at last instr
       *[0-9\.]*% cpu busy by time
       *[0-9\.]*% cpu busy by time, ignoring idle past last instr
 Core #2 counts:
-           . threads
+           . threads: 1257.*
       *[0-9]* instructions
            . total context switches
    0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
@@ -80,12 +83,13 @@ Core #2 counts:
     *[0-9]* idles
       *[0-9\.]*% cpu busy by record count
    *[0-9]* cpu microseconds
+   *[0-9]* wait microseconds
    *[0-9]* idle microseconds
    *[0-9]* idle microseconds at last instr
       *[0-9\.]*% cpu busy by time
       *[0-9\.]*% cpu busy by time, ignoring idle past last instr
 Core #3 counts:
-           . threads
+           . threads: 1257.*
       *[0-9]* instructions
            . total context switches
    0\.0[0-9\.]* CSPKI \(context switches per 1000 instructions\)
@@ -101,6 +105,7 @@ Core #3 counts:
     *[0-9]* idles
       *[0-9\.]*% cpu busy by record count
    *[0-9]* cpu microseconds
+   *[0-9]* wait microseconds
    *[0-9]* idle microseconds
    *[0-9]* idle microseconds at last instr
       *[0-9\.]*% cpu busy by time

diff --git a/clients/drcachesim/tests/schedule_stats_test.cpp b/clients/drcachesim/tests/schedule_stats_test.cpp
@@ -50,6 +50,7 @@
 
 namespace dynamorio {
 namespace drmemtrace {
+namespace {
 
 using ::dynamorio::drmemtrace::default_memtrace_stream_t;
 using ::dynamorio::drmemtrace::memref_t;
@@ -60,14 +61,45 @@ using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH;
 using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL;
 using ::dynamorio::drmemtrace::TRACE_MARKER_TYPE_SYSCALL;
 
+// Create a class for testing that has reliable timing.
+// It assumes it is only used with one thread and parallel operation
+// is emulated via lockstep serial walking, so there is no need for locks.
+class mock_schedule_stats_t : public schedule_stats_t {
+public:
+    mock_schedule_stats_t(uint64_t print_every, unsigned int verbose = 0)
+        : schedule_stats_t(print_every, verbose)
+    {
+    }
+
+    uint64_t
+    get_current_microseconds() override
+    {
+        return global_time_;
+    }
+
+    bool
+    parallel_shard_memref(void *shard_data, const memref_t &memref) override
+    {
+        // This global time with our lockstep iteration in run_schedule_stats()
+        // over-counts as it advances while threads are waiting their
+        // serial turn, but that's fine: so long as it's deterministic.
+        ++global_time_;
+        return schedule_stats_t::parallel_shard_memref(shard_data, memref);
+    }
+
+private:
+    // Start at 1 to avoid asserts about a time of 0.
+    uint64_t global_time_ = 1;
+};
+
 // Bypasses the analyzer and scheduler for a controlled test sequence.
 // Alternates the per-core memref vectors in lockstep.
 static schedule_stats_t::counters_t
 run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs)
 {
     // At verbosity 2+ we'd need to subclass default_memtrace_stream_t
     // and provide a non-null get_input_interface() (point at "this").
-    schedule_stats_t tool(/*print_every=*/1, /*verbosity=*/1);
+    mock_schedule_stats_t tool(/*print_every=*/1, /*verbosity=*/1);
     struct per_core_t {
         void *worker_data;
         void *shard_data;
@@ -143,6 +175,9 @@ test_basic_stats()
             gen_marker(TID_C, TRACE_MARKER_TYPE_TIMESTAMP, 3300),
             // Direct switch requested but failed.
             gen_instr(TID_C),
+            gen_exit(TID_C),
+            // An exit is a voluntary switch.
+            gen_exit(TID_A),
         },
         {
             gen_instr(TID_B),
@@ -160,22 +195,21 @@ test_basic_stats()
             gen_instr(TID_B),
             gen_instr(TID_B),
             gen_instr(TID_B),
+            gen_exit(TID_B),
         },
     };
     auto result = run_schedule_stats(memrefs);
     assert(result.instrs == 16);
-    assert(result.total_switches == 6);
-    assert(result.voluntary_switches == 2);
+    assert(result.total_switches == 7);
+    assert(result.voluntary_switches == 3);
     assert(result.direct_switches == 1);
     assert(result.syscalls == 4);
     assert(result.maybe_blocking_syscalls == 3);
     assert(result.direct_switch_requests == 2);
     assert(result.waits == 3);
     assert(result.idle_microseconds == 0);
-    // XXX: For Windows test VMs we see coarse time updates resulting in 0's.
-#ifndef WIN32
-    assert(result.cpu_microseconds > 0);
-#endif
+    assert(result.cpu_microseconds > 20);
+    assert(result.wait_microseconds >= 3);
     return true;
 }
 
@@ -196,6 +230,7 @@ test_idle()
             gen_instr(TID_B),
             gen_instr(TID_B),
             gen_instr(TID_B),
+            gen_exit(TID_B),
         },
         {
             gen_instr(TID_C),
@@ -217,31 +252,31 @@ test_idle()
             gen_instr(TID_A),
             gen_instr(TID_A),
             gen_instr(TID_A),
+            gen_exit(TID_A),
+            // An exit is a voluntary switch.
+            gen_exit(TID_C),
         },
     };
     auto result = run_schedule_stats(memrefs);
     assert(result.instrs == 13);
-    assert(result.total_switches == 5);
-    assert(result.voluntary_switches == 0);
+    assert(result.total_switches == 6);
+    assert(result.voluntary_switches == 1);
     assert(result.direct_switches == 0);
     assert(result.syscalls == 0);
     assert(result.maybe_blocking_syscalls == 0);
     assert(result.direct_switch_requests == 0);
     assert(result.waits == 3);
     assert(result.idles == 6);
-    // It is hard to test wall-clock time precise values so we have sanity checks.
-    std::cerr << "got idle " << result.idle_microseconds << "us, cpu "
-              << result.cpu_microseconds << "us\n"; // NOCHECK
-    // XXX: For Windows test VMs we see coarse time updates resulting in 0's.
-#ifndef WIN32
-    assert(result.idle_microseconds > 0);
+    assert(result.idle_microseconds >= 6);
     assert(result.idle_micros_at_last_instr > 0 &&
            result.idle_micros_at_last_instr <= result.idle_microseconds);
-    assert(result.cpu_microseconds > 0);
-#endif
+    assert(result.cpu_microseconds > 10);
+    assert(result.wait_microseconds >= 3);
     return true;
 }
 
+} // namespace
+
 int
 test_main(int argc, const char *argv[])
 {