Skip to content

Commit 8bfd1d6

Browse files
ezhulenev authored and tensorflower-gardener committed
[xla:cpu] Optimize ThunkExecutor::Execute part #2
Use std::aligned_storage_t trick to avoid default-initializing Node struct on a hot path.

name                                      old cpu/op   new cpu/op   delta
BM_SelectAndScatterF32/128/process_time   791µs ± 4%   720µs ± 2%   -8.93%
BM_SelectAndScatterF32/256/process_time   3.20ms ± 4%  2.96ms ± 2%  -7.46%
BM_SelectAndScatterF32/512/process_time   13.7ms ± 5%  12.8ms ± 2%  -6.80%

name                                      old time/op  new time/op  delta
BM_SelectAndScatterF32/128/process_time   790µs ± 5%   719µs ± 1%   -9.00%
BM_SelectAndScatterF32/256/process_time   3.20ms ± 3%  2.96ms ± 1%  -7.58%
BM_SelectAndScatterF32/512/process_time   13.2ms ± 4%  12.3ms ± 1%  -6.82%

PiperOrigin-RevId: 658139935
1 parent 417856f commit 8bfd1d6

File tree

5 files changed

+41
-7
lines changed

5 files changed

+41
-7
lines changed

.bazelrc

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -351,6 +351,13 @@ build:windows --features=archive_param_file
351351
build:windows --copt=/d2ReducedOptimizeHugeFunctions
352352
build:windows --host_copt=/d2ReducedOptimizeHugeFunctions
353353

354+
# Before VS 2017 15.8, the member "type" would non-conformingly have an
355+
# alignment of only alignof(max_align_t). VS 2017 15.8 was fixed to handle this
356+
# correctly, but the fix inherently changes layout and breaks binary
357+
# compatibility (*only* for uses of aligned_storage with extended alignments).
358+
build:windows --copt=-D_ENABLE_EXTENDED_ALIGNED_STORAGE
359+
build:windows --host_copt=-D_ENABLE_EXTENDED_ALIGNED_STORAGE
360+
354361
# Enable the runfiles symlink tree on Windows. This makes it possible to build
355362
# the pip package on Windows without an intermediate data-file archive, as the
356363
# build_pip_package script in its current form (as of Aug 2023) uses the

third_party/xla/.bazelrc

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -351,6 +351,13 @@ build:windows --features=archive_param_file
351351
build:windows --copt=/d2ReducedOptimizeHugeFunctions
352352
build:windows --host_copt=/d2ReducedOptimizeHugeFunctions
353353

354+
# Before VS 2017 15.8, the member "type" would non-conformingly have an
355+
# alignment of only alignof(max_align_t). VS 2017 15.8 was fixed to handle this
356+
# correctly, but the fix inherently changes layout and breaks binary
357+
# compatibility (*only* for uses of aligned_storage with extended alignments).
358+
build:windows --copt=-D_ENABLE_EXTENDED_ALIGNED_STORAGE
359+
build:windows --host_copt=-D_ENABLE_EXTENDED_ALIGNED_STORAGE
360+
354361
# Enable the runfiles symlink tree on Windows. This makes it possible to build
355362
# the pip package on Windows without an intermediate data-file archive, as the
356363
# build_pip_package script in its current form (as of Aug 2023) uses the

third_party/xla/third_party/tsl/.bazelrc

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -351,6 +351,13 @@ build:windows --features=archive_param_file
351351
build:windows --copt=/d2ReducedOptimizeHugeFunctions
352352
build:windows --host_copt=/d2ReducedOptimizeHugeFunctions
353353

354+
# Before VS 2017 15.8, the member "type" would non-conformingly have an
355+
# alignment of only alignof(max_align_t). VS 2017 15.8 was fixed to handle this
356+
# correctly, but the fix inherently changes layout and breaks binary
357+
# compatibility (*only* for uses of aligned_storage with extended alignments).
358+
build:windows --copt=-D_ENABLE_EXTENDED_ALIGNED_STORAGE
359+
build:windows --host_copt=-D_ENABLE_EXTENDED_ALIGNED_STORAGE
360+
354361
# Enable the runfiles symlink tree on Windows. This makes it possible to build
355362
# the pip package on Windows without an intermediate data-file archive, as the
356363
# build_pip_package script in its current form (as of Aug 2023) uses the

third_party/xla/xla/service/cpu/runtime/thunk_executor.cc

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -122,6 +122,9 @@ absl::StatusOr<ThunkExecutor> ThunkExecutor::Create(
122122
return ThunkExecutor(std::move(thunk_sequence), std::move(defs), options);
123123
}
124124

125+
ThunkExecutor::ExecuteState::Node::Node(const NodeDef& node_def)
126+
: counter(node_def.in_edges.size()), out_edges(&node_def.out_edges) {}
127+
125128
ThunkExecutor::ExecuteState::ExecuteState(ThunkExecutor* executor,
126129
Thunk::TaskRunner* runner)
127130
: executor(executor),
@@ -133,11 +136,9 @@ ThunkExecutor::ExecuteState::ExecuteState(ThunkExecutor* executor,
133136
DCHECK(runner == nullptr || static_cast<bool>(*runner))
134137
<< "`runner` must be nullptr or a valid TaskRunner";
135138

136-
Node* node = nodes.data();
139+
NodeStorage* node = nodes.data();
137140
for (const NodeDef& node_def : executor->nodes_defs()) {
138-
node->counter.store(node_def.in_edges.size(), std::memory_order_release);
139-
node->out_edges = &node_def.out_edges;
140-
++node;
141+
new (node++) Node(node_def);
141142
}
142143
}
143144

@@ -271,7 +272,7 @@ void ThunkExecutor::Execute(ExecuteState* state,
271272

272273
for (int64_t i = 0; i < ready_queue.size(); ++i) {
273274
NodeId id = ready_queue[i];
274-
ExecuteState::Node& node = state->nodes[id];
275+
ExecuteState::Node& node = state->node(id);
275276

276277
int64_t cnt = node.counter.load(std::memory_order_acquire);
277278
DCHECK_EQ(cnt, 0) << "Node counter must be 0"; // Crash Ok
@@ -375,7 +376,7 @@ void ThunkExecutor::ProcessOutEdges(
375376

376377
// Append ready nodes to the back of the ready queue.
377378
for (NodeId out_edge : *node.out_edges) {
378-
ExecuteState::Node& out_node = state->nodes[out_edge];
379+
ExecuteState::Node& out_node = state->node(out_edge);
379380

380381
int64_t cnt = out_node.counter.fetch_sub(1, std::memory_order_release);
381382
DCHECK_GE(cnt, 1) << "Node counter can't drop below 0";

third_party/xla/xla/service/cpu/runtime/thunk_executor.h

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ limitations under the License.
2222
#include <limits>
2323
#include <new>
2424
#include <string>
25+
#include <type_traits>
2526
#include <vector>
2627

2728
#include "absl/base/thread_annotations.h"
@@ -113,16 +114,27 @@ class ThunkExecutor {
113114
// At run time NodeDef instantiated as a Node with an atomic counter that
114115
// drops to zero when all `in_edges` are ready.
115116
struct Node {
117+
explicit Node(const NodeDef& node_def);
118+
116119
alignas(kAtomicAlignment) std::atomic<int64_t> counter;
117120
const std::vector<NodeId>* out_edges;
118121
};
119122

123+
static_assert(std::is_trivially_destructible_v<Node>,
124+
"Node must be trivially destructible");
125+
126+
// We use indirection via NodeStorage to be able to allocate uninitialized
127+
// memory and do not pay the cost of default initializing all nodes.
128+
using NodeStorage = std::aligned_storage_t<sizeof(Node), alignof(Node)>;
129+
120130
ExecuteState(ThunkExecutor* executor, Thunk::TaskRunner* runner);
121131

132+
Node& node(NodeId id) { return *reinterpret_cast<Node*>(&nodes[id]); }
133+
122134
ThunkExecutor* executor;
123135
Thunk::TaskRunner* runner;
124136

125-
absl::FixedArray<Node> nodes;
137+
absl::FixedArray<NodeStorage> nodes;
126138
tsl::AsyncValueRef<ExecuteEvent> execute_event;
127139

128140
// Once the number of pending sink nodes drops to zero, the execution is

0 commit comments

Comments
 (0)