Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

i#5694 core-sharded: Add get_shard_index() + get_tid() #6568

Merged
merged 11 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions clients/drcachesim/analysis_tool.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* **********************************************************
* Copyright (c) 2016-2023 Google, Inc. All rights reserved.
* Copyright (c) 2016-2024 Google, Inc. All rights reserved.
* **********************************************************/

/*
Expand Down Expand Up @@ -387,7 +387,8 @@ template <typename RecordType> class analysis_tool_tmpl_t {
/**
* Invoked once for each trace shard prior to calling parallel_shard_memref() for
* that shard, this allows a tool to create data local to a shard. The \p
* shard_index is a unique identifier allowing shard data to be stored into a global
* shard_index is the 0-based ordinal of the shard, serving as a unique identifier
* allowing shard data to be stored into a global
* table if desired (typically for aggregation use in print_results()). The \p
* worker_data is the return value of parallel_worker_init() for the worker thread
* who will exclusively operate on this shard. The \p shard_stream allows tools to
Expand Down
7 changes: 4 additions & 3 deletions clients/drcachesim/analyzer.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* **********************************************************
* Copyright (c) 2016-2023 Google, Inc. All rights reserved.
* Copyright (c) 2016-2024 Google, Inc. All rights reserved.
* **********************************************************/

/*
Expand Down Expand Up @@ -257,8 +257,9 @@ analyzer_tmpl_t<RecordType, ReaderType>::init_scheduler(
return false;
}
std::vector<typename sched_type_t::input_reader_t> readers;
// With no modifiers or only_threads the tid doesn't matter.
readers.emplace_back(std::move(reader), std::move(reader_end), /*tid=*/1);
// Use a sentinel for the tid so the scheduler will use the memref record tid.
readers.emplace_back(std::move(reader), std::move(reader_end),
/*tid=*/INVALID_THREAD_ID);
std::vector<typename sched_type_t::range_t> regions;
if (skip_instrs_ > 0)
regions.emplace_back(skip_instrs_ + 1, 0);
Expand Down
66 changes: 63 additions & 3 deletions clients/drcachesim/common/memtrace_stream.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* **********************************************************
* Copyright (c) 2022-2023 Google, Inc. All rights reserved.
* Copyright (c) 2022-2024 Google, Inc. All rights reserved.
* **********************************************************/

/*
Expand Down Expand Up @@ -155,10 +155,25 @@ class memtrace_stream_t {
return false;
}

/**
* Returns the 0-based ordinal for the current shard. For parallel analysis,
* this equals the \p shard_index passed to parallel_shard_init_stream().
* This is more useful for serial modes where there is no other convenience mechanism
* to determine such an index; there, it equals the same index that would be used in
* parallel mode, allowing a tool to compute per-shard results even in serial mode. If
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
* not implemented, -1 is returned.
*/
// Base-class default for streams that do not provide a shard index.
virtual int
get_shard_index() const
{
// -1 is the "not implemented" value described in the documentation comment.
return -1;
}

/**
* Returns a unique identifier for the current "output cpu". Generally this only
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
* applies when using #SHARD_BY_CORE. For dynamic schedules, the identifier is
* typically an output cpu ordinal. For replaying an as-traced schedule, the
* typically an output cpu ordinal equal to get_shard_index(). For replaying an
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
* as-traced schedule, the
* identifier is typically the original input cpu which is now mapped directly
* to this output. If not implemented for the current mode, -1 is returned.
*/
Expand Down Expand Up @@ -192,6 +207,17 @@ class memtrace_stream_t {
return -1;
}

/**
* Returns the thread identifier for the current input trace.
* This is a convenience method for use in parallel_shard_init_stream()
* prior to access to any #memref_t records. If not implemented, -1 is returned.
*/
virtual int64_t
get_input_tid() const
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
{
return -1;
}

/**
* Returns the stream interface for the current input trace. This differs from
* "this" for #SHARD_BY_CORE where multiple inputs are interleaved on one
Expand Down Expand Up @@ -284,8 +310,42 @@ class default_memtrace_stream_t : public memtrace_stream_t {
return 0;
}

// Stores the value that get_output_cpuid() subsequently returns.
void
set_output_cpuid(int64_t cpuid)
{
cpuid_ = cpuid;
}
// Returns the value last stored by set_output_cpuid(), or the member's
// initial value if it was never set.
int64_t
get_output_cpuid() const override
{
return cpuid_;
}
// Stores the value that get_shard_index() subsequently returns.
void
set_shard_index(int index)
{
shard_ = index;
}
// Returns the value last stored by set_shard_index(), or the member's
// initial value if it was never set.
int
get_shard_index() const override
{
return shard_;
}
// Stores the value that get_input_tid() subsequently returns.
void
set_input_tid(int64_t tid)
{
tid_ = tid;
}
// Returns the value last stored by set_input_tid(), or the member's
// initial value if it was never set.
int64_t
get_input_tid() const override
{
return tid_;
}

private:
uint64_t *record_ordinal_;
uint64_t *record_ordinal_ = nullptr;
int64_t cpuid_ = 0;
int shard_ = 0;
int64_t tid_ = 0;
};

} // namespace drmemtrace
Expand Down
42 changes: 41 additions & 1 deletion clients/drcachesim/scheduler/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1373,6 +1373,43 @@ scheduler_tmpl_t<RecordType, ReaderType>::get_input_ordinal(output_ordinal_t out
return outputs_[output].cur_input;
}

// Returns the thread identifier of the input currently scheduled on 'output',
// or -1 if 'output' is out of range or has no current input.
template <typename RecordType, typename ReaderType>
int64_t
scheduler_tmpl_t<RecordType, ReaderType>::get_input_tid(output_ordinal_t output)
{
    // Reject ordinals outside outputs_, matching the validation performed by
    // get_shard_index(); indexing the vector out of range is undefined behavior.
    if (output < 0 || output >= static_cast<output_ordinal_t>(outputs_.size()))
        return -1;
    const int index = outputs_[output].cur_input;
    if (index < 0)
        return -1;
    // An input registered with the INVALID_THREAD_ID sentinel has no fixed tid,
    // so report the tid tracked in last_record_tid instead.
    if (inputs_[index].tid == INVALID_THREAD_ID)
        return inputs_[index].last_record_tid;
    return inputs_[index].tid;
}

// Returns the 0-based shard index for the input currently scheduled on 'output',
// or -1 if 'output' is not a valid output ordinal.
template <typename RecordType, typename ReaderType>
int
scheduler_tmpl_t<RecordType, ReaderType>::get_shard_index(output_ordinal_t output)
{
    if (output < 0 || output >= static_cast<output_ordinal_t>(outputs_.size()))
        return -1;
    if (TESTANY(sched_type_t::SCHEDULER_USE_INPUT_ORDINALS |
                    sched_type_t::SCHEDULER_USE_SINGLE_INPUT_ORDINALS,
                options_.flags)) {
        if (inputs_.size() == 1 && inputs_[0].tid == INVALID_THREAD_ID) {
            // A lone input carrying the INVALID_THREAD_ID sentinel has no fixed
            // tid, so each distinct tid reported by get_input_tid() is given the
            // next unused index the first time it is seen and keeps that index
            // afterwards.
            int index;
            memref_tid_t tid = get_input_tid(output);
            auto exists = tid2shard_.find(tid);
            if (exists == tid2shard_.end()) {
                index = static_cast<int>(tid2shard_.size());
                tid2shard_[tid] = index;
            } else
                index = exists->second;
            return index;
        }
        // With input-ordinal flags and ordinary inputs, the shard is the input.
        return get_input_ordinal(output);
    }
    // Without input-ordinal flags, the shard is the output stream itself.
    return output;
}

template <typename RecordType, typename ReaderType>
int
scheduler_tmpl_t<RecordType, ReaderType>::get_workload_ordinal(output_ordinal_t output)
Expand All @@ -1398,7 +1435,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::is_record_synthetic(output_ordinal_t o

template <typename RecordType, typename ReaderType>
int64_t
scheduler_tmpl_t<RecordType, ReaderType>::get_output_cpuid(output_ordinal_t output)
scheduler_tmpl_t<RecordType, ReaderType>::get_output_cpuid(output_ordinal_t output) const
{
if (options_.replay_as_traced_istream != nullptr)
return outputs_[output].as_traced_cpuid;
Expand Down Expand Up @@ -2575,6 +2612,9 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
VDO(this, 4, print_record(record););

outputs_[output].last_record = record;
if (!record_type_has_tid(record, input->last_record_tid)) {
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
// Leave it as the last value.
}
return sched_type_t::STATUS_OK;
}

Expand Down
69 changes: 65 additions & 4 deletions clients/drcachesim/scheduler/scheduler.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* **********************************************************
* Copyright (c) 2023 Google, Inc. All rights reserved.
* Copyright (c) 2023-2024 Google, Inc. All rights reserved.
* **********************************************************/

/*
Expand Down Expand Up @@ -342,6 +342,11 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
* must be specified.
* The original as-traced cpuid that is mapped to each output stream can be
* obtained by calling the get_output_cpuid() function on each stream.
*
* An alternative use of this mapping is with a single output to interleave
* inputs in a strict timestamp order, as with make_scheduler_serial_options(),
* without specifying a schedule file and without recreating core mappings:
* only timestamps are honored.
*/
MAP_TO_RECORDED_OUTPUT,
/**
Expand Down Expand Up @@ -919,6 +924,16 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
return static_cast<int64_t>(get_input_stream_ordinal());
}

/**
* Returns the thread identifier for the current input stream feeding this
* output stream.
*/
int64_t
get_input_tid() const override
{
// Forwards to scheduler_, identifying this stream by its ordinal_.
return scheduler_->get_input_tid(ordinal_);
}

/**
* Returns the #dynamorio::drmemtrace::memtrace_stream_t interface for the
* current input stream feeding this output stream.
Expand All @@ -929,6 +944,33 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
return scheduler_->get_input_stream_interface(get_input_stream_ordinal());
}

/**
* Returns the ordinal for the current output stream. If
* #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t::
* single_lockstep_output
* is set to true, this returns the ordinal of the currently active "inner"
* output stream. Otherwise, this returns the constant ordinal for this output
* stream as there is no concept of inner or outer streams.
*/
output_ordinal_t
get_output_stream_ordinal() const
{
// Returns the stored ordinal_ unchanged.
// NOTE(review): presumably ordinal_ is updated elsewhere when
// single_lockstep_output is set, as the documentation comment implies -- confirm.
return ordinal_;
}

/**
* For #SCHEDULER_USE_INPUT_ORDINALS or
* #SCHEDULER_USE_SINGLE_INPUT_ORDINALS, returns the input stream ordinal, except
* for the case of a single input with thread id set to INVALID_THREAD_ID in
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
* which case a 0-based index assigned to the last trace record's tid upon its
* first observation is returned; otherwise returns the output stream ordinal.
*/
int
get_shard_index() const override
{
// Forwards to scheduler_, identifying this stream by its ordinal_.
return scheduler_->get_shard_index(ordinal_);
}

/**
* Returns whether the current record is from a part of the trace corresponding
* to kernel execution.
Expand Down Expand Up @@ -1016,6 +1058,14 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
return inputs_[input].reader->get_stream_name();
}

/**
* Returns the get_output_cpuid() value for the given output.
* This interface is exported so that a user can get the cpuids statically when using
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
* single_lockstep_output where there is just one stream.
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
*/
int64_t
get_output_cpuid(output_ordinal_t output) const;

/** Returns a string further describing an error code. */
std::string
get_error_string() const
Expand Down Expand Up @@ -1054,6 +1104,7 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
// workload index + tid to identify the original input.
int workload = -1;
memref_tid_t tid = INVALID_THREAD_ID;
memref_tid_t last_record_tid = INVALID_THREAD_ID;
// If non-empty these records should be returned before incrementing the reader.
// This is used for read-ahead and inserting synthetic records.
// We use a deque so we can iterate over it.
Expand Down Expand Up @@ -1380,6 +1431,16 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
input_ordinal_t
get_input_ordinal(output_ordinal_t output);

// Returns the thread identifier for the current input stream scheduled on
// the 'output_ordinal'-th output stream.
int64_t
get_input_tid(output_ordinal_t output);

// Returns the shard index for the current input stream scheduled on
// the 'output_ordinal'-th output stream.
int
get_shard_index(output_ordinal_t output);

// Returns the workload ordinal value for the current input stream scheduled on
// the 'output_ordinal'-th output stream.
int
Expand All @@ -1390,9 +1451,6 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
bool
is_record_synthetic(output_ordinal_t output);

int64_t
get_output_cpuid(output_ordinal_t output);

// Returns the direct handle to the current input stream interface for the
// 'output_ordinal'-th output stream.
memtrace_stream_t *
Expand Down Expand Up @@ -1528,6 +1586,9 @@ template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
switch_sequence_;
// For single_lockstep_output.
std::unique_ptr<stream_t> global_stream_;
// For online where we currently have to map dynamically observed thread ids
// to the 0-based shard index.
std::unordered_map<memref_tid_t, int> tid2shard_;
};

/** See #dynamorio::drmemtrace::scheduler_tmpl_t. */
Expand Down
32 changes: 9 additions & 23 deletions clients/drcachesim/simulator/simulator.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* **********************************************************
* Copyright (c) 2015-2023 Google, Inc. All rights reserved.
* Copyright (c) 2015-2024 Google, Inc. All rights reserved.
* **********************************************************/

/*
Expand Down Expand Up @@ -75,7 +75,7 @@ simulator_t::init_knobs(unsigned int num_cores, uint64_t skip_refs, uint64_t war
knob_use_physical_ = use_physical;
knob_verbose_ = verbose;
last_thread_ = 0;
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
last_core_ = 0;
last_core_ = -1;
if (shard_type_ == SHARD_BY_THREAD) {
cpu_counts_.resize(knob_num_cores_, 0);
thread_counts_.resize(knob_num_cores_, 0);
Expand Down Expand Up @@ -240,27 +240,13 @@ int
simulator_t::core_for_thread(memref_tid_t tid)
{
if (shard_type_ == SHARD_BY_CORE) {
int64_t cpu = serial_stream_->get_output_cpuid();
// While the scheduler uses a 0-based ordinal for all but replaying as-traced,
// to handle as-traced (and because the docs for get_output_cpuid() do not
// guarantee 0-based), we map to a 0-based index just by incrementing an index
// as we discover each cpu.
// XXX: Should we add a new stream API for get_output_ordinal()? That would
// be a more faithful mapping than our dynamic discovery here -- although the
// lockstep ordering by the scheduler should have our ordinals in order.
if (cpu == last_cpu_)
return last_core_;
int core;
auto exists = cpu2core_.find(cpu);
if (exists == cpu2core_.end()) {
core = static_cast<int>(cpu2core_.size());
cpu2core_[cpu] = core;
if (knob_verbose_ >= 1) {
std::cerr << "new cpu " << cpu << " => core " << core << "\n";
}
} else
core = exists->second;
last_cpu_ = cpu;
int core = serial_stream_->get_shard_index();
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
if (core != last_core_) {
// Track the cpuid<->ordinal relationship for our results printout.
int64_t cpu = serial_stream_->get_output_cpuid();
if (cpu2core_.find(cpu) == cpu2core_.end())
cpu2core_[cpu] = core;
}
last_core_ = core;
return core;
}
Expand Down
3 changes: 1 addition & 2 deletions clients/drcachesim/simulator/simulator.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* **********************************************************
* Copyright (c) 2015-2023 Google, Inc. All rights reserved.
* Copyright (c) 2015-2024 Google, Inc. All rights reserved.
* **********************************************************/

/*
Expand Down Expand Up @@ -114,7 +114,6 @@ class simulator_t : public analysis_tool_t {
shard_type_t shard_type_ = SHARD_BY_THREAD;
memtrace_stream_t *serial_stream_ = nullptr;
memref_tid_t last_thread_; // Only used for SHARD_BY_THREAD.
int64_t last_cpu_ = -1;
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
int last_core_;

// For thread mapping to cores:
Expand Down
Loading
Loading