[vm/compiler] Create leaf runtime entry for memmove.
Instead of making a StaticCall to _TypedListBase._nativeSetRange
inside _memMoveN, make a CCall to the memmove leaf runtime entry.

Rename _TypedListBase._nativeSetRange to _setClampedRange, since
it's now only used when per-element clamping is necessary.

Fix the load optimizer so that loads of unboxed fields from freshly
allocated objects do not have the tagged null value forwarded
as their initial post-allocation value.

TEST=co19{,_2}/LibTest/typed_data lib{,_2}/typed_data
     corelib{,_2}/list_test
     vm/cc/LoadOptimizer_LoadDataFieldOfNewTypedData

Issue: #42072
Change-Id: Ib82e24a5b3287fa53099fffd3b563a27d777507e
Cq-Include-Trybots: luci.dart.try:vm-aot-linux-debug-simarm_x64-try,vm-aot-linux-debug-x64-try,vm-aot-linux-debug-x64c-try,vm-kernel-linux-debug-x64-try,vm-kernel-precomp-linux-debug-x64-try,vm-linux-debug-x64-try,vm-linux-debug-x64c-try,vm-mac-debug-arm64-try,vm-aot-linux-release-simarm_x64-try,vm-aot-linux-release-x64-try,vm-aot-mac-release-arm64-try,vm-aot-msan-linux-release-x64-try,vm-msan-linux-release-x64-try,vm-aot-tsan-linux-release-x64-try,vm-tsan-linux-release-x64-try,vm-linux-release-x64-try,vm-mac-release-arm64-try,vm-kernel-precomp-linux-release-x64-try
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/324080
Reviewed-by: Alexander Markov <alexmarkov@google.com>
Commit-Queue: Tess Strickland <sstrickl@google.com>
sstrickl authored and Commit Queue committed Sep 11, 2023
1 parent 8a91749 commit 3f53d22
Showing 10 changed files with 1,172 additions and 1,061 deletions.
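As a rough summary, a typed data copy now takes one of three paths. The sketch below uses illustrative names and is not VM API; the real decision is spread across the Dart typed_data library code, the IL built in BuildTypedDataMemMove, and the renamed _setClampedRange native.

#include <cstdint>

// Illustrative only: summarizes the three copy paths after this change.
enum class CopyPath { kMemoryCopyInstr, kMemmoveCCall, kClampedNative };

CopyPath ChooseCopyPath(intptr_t count, intptr_t copy_length_for_ccall,
                        bool dst_is_clamped, bool src_is_uint8) {
  if (dst_is_clamped && !src_is_uint8) {
    // Per-element clamping is still required, so the (renamed)
    // _setClampedRange native entry is used.
    return CopyPath::kClampedNative;
  }
  if (count < copy_length_for_ccall) {
    // Small copies: the inlined MemoryCopy instruction.
    return CopyPath::kMemoryCopyInstr;
  }
  // Large copies: a CCall to the memmove leaf runtime entry.
  return CopyPath::kMemmoveCCall;
}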
16 changes: 3 additions & 13 deletions runtime/lib/typed_data.cc
@@ -88,7 +88,7 @@ static bool IsUint8(intptr_t cid) {
cid <= kUnmodifiableTypedDataUint8ClampedArrayViewCid;
}

DEFINE_NATIVE_ENTRY(TypedDataBase_setRange, 0, 5) {
DEFINE_NATIVE_ENTRY(TypedDataBase_setClampedRange, 0, 5) {
// This is called after bounds checking, so the numeric inputs are
// guaranteed to be Smis, and the length is guaranteed to be non-zero.
const TypedDataBase& dst =
@@ -128,19 +128,9 @@ DEFINE_NATIVE_ENTRY(TypedDataBase_setRange, 0, 5) {
ASSERT(length_in_bytes <= src_length_in_bytes - src_start_in_bytes);
#endif

if (!IsClamped(dst.ptr()->GetClassId()) || IsUint8(src.ptr()->GetClassId())) {
// TODO(dartbug.com/42072): We do this when the copy length gets large
// enough that a native call to invoke memmove is faster than the generated
// code from MemoryCopy. Replace the static call to _nativeSetRange with
// a CCall() to a memmove leaf runtime entry and remove the possibility of
// calling _nativeSetRange except in the clamping case.
NoSafepointScope no_safepoint;
memmove(dst.DataAddr(dst_start_in_bytes), src.DataAddr(src_start_in_bytes),
length_in_bytes);
return Object::null();
}

ASSERT_EQUAL(element_size_in_bytes, 1);
ASSERT(IsClamped(dst.ptr()->GetClassId()));
ASSERT(!IsUint8(src.ptr()->GetClassId()));

NoSafepointScope no_safepoint;
uint8_t* dst_data =
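Since the fast paths no longer reach this native entry, only the clamping copy remains. A minimal sketch of that per-element copy under the assertions above (one-byte elements, non-Uint8 source), with illustrative names rather than the VM's typed data accessors:

#include <cstddef>
#include <cstdint>

// Sketch only: the native entry works on VM typed data objects, not raw
// pointers. With one-byte elements and a non-Uint8 source, negative source
// values clamp to 0 in the Uint8Clamped destination.
void CopyClampedBytes(uint8_t* dst, const int8_t* src, size_t length) {
  for (size_t i = 0; i < length; ++i) {
    const int8_t value = src[i];
    dst[i] = value < 0 ? 0 : static_cast<uint8_t>(value);
  }
}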
2 changes: 1 addition & 1 deletion runtime/vm/bootstrap_natives.h
@@ -173,7 +173,7 @@ namespace dart {
V(TypedData_Int32x4Array_new, 2) \
V(TypedData_Float64x2Array_new, 2) \
V(TypedDataBase_length, 1) \
V(TypedDataBase_setRange, 5) \
V(TypedDataBase_setClampedRange, 5) \
V(TypedData_GetInt8, 2) \
V(TypedData_SetInt8, 3) \
V(TypedData_GetUint8, 2) \
5 changes: 5 additions & 0 deletions runtime/vm/compiler/backend/redundancy_elimination.cc
@@ -2244,6 +2244,11 @@ class LoadOptimizer : public ValueObject {
const intptr_t pos = alloc->InputForSlot(*slot);
if (pos != -1) {
forward_def = alloc->InputAt(pos)->definition();
} else if (slot->is_unboxed()) {
// Unboxed fields that are not provided as an input should not
// have a tagged null value forwarded for them, similar to
// payloads of typed data arrays.
continue;
} else {
// Fields not provided as an input to the instruction are
// initialized to null during allocation.
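The forwarding rule added here: an allocation input is forwarded when the slot has one, tagged null is forwarded for ordinary tagged fields, and nothing is forwarded for unboxed fields. A simplified sketch with assumed stand-in types (not the VM's Slot or allocation classes):

#include <cstdint>
#include <optional>

// Stand-in types; names are illustrative only.
struct SlotInfo {
  bool provided_as_allocation_input;
  bool is_unboxed;
};

// Value to forward for a load from a freshly allocated object, or nothing if
// the load must be left in place.
std::optional<intptr_t> InitialValueToForward(const SlotInfo& slot,
                                              intptr_t input_value,
                                              intptr_t tagged_null) {
  if (slot.provided_as_allocation_input) {
    return input_value;
  }
  if (slot.is_unboxed) {
    // Unboxed fields (e.g. PointerBase.data) hold untagged values and are
    // initialized by the allocation stub; forwarding tagged null would be
    // wrong, so leave the load alone.
    return std::nullopt;
  }
  return tagged_null;  // Tagged fields not passed to the stub start out null.
}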
98 changes: 98 additions & 0 deletions runtime/vm/compiler/backend/redundancy_elimination_test.cc
@@ -665,6 +665,104 @@ ISOLATE_UNIT_TEST_CASE(LoadOptimizer_AliasingViaTypedDataAndUntaggedTypedData) {
}
}

// This test ensures that a LoadNativeField of the PointerBase data field for
// a newly allocated TypedData object does not have tagged null forwarded to it,
// as that's wrong for two reasons: it's an unboxed field, and it is initialized
// during the allocation stub.
ISOLATE_UNIT_TEST_CASE(LoadOptimizer_LoadDataFieldOfNewTypedData) {
using compiler::BlockBuilder;
CompilerState S(thread, /*is_aot=*/false, /*is_optimizing=*/true);
FlowGraphBuilderHelper H;

auto zone = H.flow_graph()->zone();

// We are going to build the following graph:
//
// B0[graph_entry] {
// vc42 <- Constant(42)
// }
//
// B1[function_entry] {
// }
// array <- AllocateTypedData(kTypedDataUint8ArrayCid, vc42)
// view <- AllocateObject(kTypedDataUint8ArrayViewCid)
// v1 <- LoadNativeField(array, Slot::PointerBase_data())
// StoreNativeField(Slot::PointerBase_data(), view, v1, kNoStoreBarrier,
// kInitializing)
// return view
// }

const auto& lib = Library::Handle(zone, Library::TypedDataLibrary());
EXPECT(!lib.IsNull());
const Class& view_cls = Class::ZoneHandle(
zone, lib.LookupClassAllowPrivate(Symbols::_Uint8ArrayView()));
EXPECT(!view_cls.IsNull());
const Error& err = Error::Handle(zone, view_cls.EnsureIsFinalized(thread));
EXPECT(err.IsNull());

auto vc42 = H.flow_graph()->GetConstant(Integer::Handle(Integer::New(42)));
auto b1 = H.flow_graph()->graph_entry()->normal_entry();

AllocateTypedDataInstr* array;
AllocateObjectInstr* view;
LoadFieldInstr* v1;
StoreFieldInstr* store;
ReturnInstr* ret;

{
BlockBuilder builder(H.flow_graph(), b1);

// array <- AllocateTypedData(kTypedDataUint8ArrayCid, vc42)
array = builder.AddDefinition(
new AllocateTypedDataInstr(InstructionSource(), kTypedDataUint8ArrayCid,
new (zone) Value(vc42), DeoptId::kNone));

// view <- AllocateObject(kTypedDataUint8ArrayViewCid)
view = builder.AddDefinition(
new AllocateObjectInstr(InstructionSource(), view_cls, DeoptId::kNone));

// v1 <- LoadNativeField(array, Slot::PointerBase_data())
v1 = builder.AddDefinition(new LoadFieldInstr(new (zone) Value(array),
Slot::PointerBase_data(),
InstructionSource()));

// StoreNativeField(Slot::PointerBase_data(), view, v1, kNoStoreBarrier,
// kInitializing)
store = builder.AddInstruction(new StoreFieldInstr(
Slot::PointerBase_data(), new (zone) Value(view), new (zone) Value(v1),
kNoStoreBarrier, InstructionSource(),
StoreFieldInstr::Kind::kInitializing));

// return view
ret = builder.AddInstruction(new ReturnInstr(
InstructionSource(), new Value(view), S.GetNextDeoptId()));
}
H.FinishGraph();

DominatorBasedCSE::Optimize(H.flow_graph());
{
Instruction* alloc_array = nullptr;
Instruction* alloc_view = nullptr;
Instruction* lf = nullptr;
Instruction* sf = nullptr;
Instruction* r = nullptr;
ILMatcher cursor(H.flow_graph(), b1, true);
RELEASE_ASSERT(cursor.TryMatch({
kMatchAndMoveFunctionEntry,
{kMatchAndMoveAllocateTypedData, &alloc_array},
{kMatchAndMoveAllocateObject, &alloc_view},
{kMatchAndMoveLoadField, &lf},
{kMatchAndMoveStoreField, &sf},
{kMatchReturn, &r},
}));
EXPECT(array == alloc_array);
EXPECT(view == alloc_view);
EXPECT(v1 == lf);
EXPECT(store == sf);
EXPECT(ret == r);
}
}

// This test verifies that we correctly alias load/stores into typed array
// which use different element sizes. This is a regression test for
// a fix in 836c04f.
3 changes: 2 additions & 1 deletion runtime/vm/compiler/backend/slot.cc
@@ -111,7 +111,8 @@ Slot* SlotCache::CreateNativeSlot(Slot::Kind kind) {
case Slot::Kind::k##ClassName##_##FieldName: \
return new (zone_) \
Slot(Slot::Kind::k##ClassName##_##FieldName, \
Slot::IsImmutableBit::encode(FIELD_##mutability), \
Slot::IsImmutableBit::encode(FIELD_##mutability) | \
Slot::IsUnboxedBit::encode(true), \
compiler::target::ClassName::FieldName##_offset(), \
#ClassName "." #FieldName, \
CompileType::FromUnboxedRepresentation(kUnboxed##representation), \
77 changes: 42 additions & 35 deletions runtime/vm/compiler/frontend/kernel_to_il.cc
@@ -1763,28 +1763,27 @@ Fragment FlowGraphBuilder::BuildTypedDataMemMove(const Function& function,
LocalVariable* arg_from_start = parsed_function_->RawParameterVariable(4);

Fragment body;
// If we're copying at least this many elements, calling _nativeSetRange,
// which calls memmove via a native call, is faster than using the code
// currently emitted by the MemoryCopy instruction.
// If we're copying at least this many elements, calling memmove via CCall
// is faster than using the code currently emitted by MemoryCopy.
#if defined(TARGET_ARCH_X64) || defined(TARGET_ARCH_IA32)
// On X86, the breakpoint for using a native call instead of generating a
// loop via MemoryCopy() is around the same as the largest benchmark
// (1048576 elements) on the machines we use.
const intptr_t kCopyLengthForNativeCall = 1024 * 1024;
// On X86, the breakpoint for using CCall instead of generating a loop via
// MemoryCopy() is around the same as the largest benchmark (1048576 elements)
// on the machines we use.
const intptr_t kCopyLengthForCCall = 1024 * 1024;
#else
// On other architectures, when the element size is less than a word,
// we copy in word-sized chunks when possible to get back some speed without
// increasing the number of emitted instructions for MemoryCopy too much, but
// memmove is even more aggressive, copying in 64-byte chunks when possible.
// Thus, the breakpoint for a native call being faster is much lower for our
// benchmarks than for X86.
const intptr_t kCopyLengthForNativeCall = 1024;
// Thus, the breakpoint for a call to memmove being faster is much lower for
// our benchmarks than for X86.
const intptr_t kCopyLengthForCCall = 1024;
#endif

JoinEntryInstr* done = BuildJoinEntry();
TargetEntryInstr *is_small_enough, *is_too_large;
body += LoadLocal(arg_count);
body += IntConstant(kCopyLengthForNativeCall);
body += IntConstant(kCopyLengthForCCall);
body += SmiRelationalOp(Token::kLT);
body += BranchIfTrue(&is_small_enough, &is_too_large);

@@ -1798,30 +1797,38 @@ Fragment FlowGraphBuilder::BuildTypedDataMemMove(const Function& function,
/*unboxed_inputs=*/false, /*can_overlap=*/true);
use_instruction += Goto(done);

// TODO(dartbug.com/42072): Instead of doing a static call to a native
// method, make a leaf runtime entry for memmove and use CCall.
const Library& lib = Library::Handle(Z, Library::TypedDataLibrary());
ASSERT(!lib.IsNull());
const Class& typed_list_base =
Class::Handle(Z, lib.LookupClassAllowPrivate(Symbols::_TypedListBase()));
ASSERT(!typed_list_base.IsNull());
const auto& error = typed_list_base.EnsureIsFinalized(H.thread());
ASSERT(error == Error::null());
const Function& native_set_range = Function::ZoneHandle(
Z,
typed_list_base.LookupFunctionAllowPrivate(Symbols::_nativeSetRange()));
ASSERT(!native_set_range.IsNull());

Fragment call_native(is_too_large);
call_native += LoadLocal(arg_to);
call_native += LoadLocal(arg_to_start);
call_native += LoadLocal(arg_count);
call_native += LoadLocal(arg_from);
call_native += LoadLocal(arg_from_start);
call_native += StaticCall(TokenPosition::kNoSource, native_set_range, 5,
ICData::kNoRebind);
call_native += Drop();
call_native += Goto(done);
const intptr_t element_size = Instance::ElementSizeFor(cid);
Fragment call_memmove(is_too_large);
call_memmove += LoadLocal(arg_to);
call_memmove += LoadUntagged(compiler::target::PointerBase::data_offset());
call_memmove += ConvertUntaggedToUnboxed(kUnboxedIntPtr);
call_memmove += LoadLocal(arg_to_start);
call_memmove += IntConstant(element_size);
call_memmove += SmiBinaryOp(Token::kMUL, /*is_truncating=*/true);
call_memmove += UnboxTruncate(kUnboxedIntPtr);
call_memmove +=
BinaryIntegerOp(Token::kADD, kUnboxedIntPtr, /*is_truncating=*/true);
call_memmove += LoadLocal(arg_from);
call_memmove += LoadUntagged(compiler::target::PointerBase::data_offset());
call_memmove += ConvertUntaggedToUnboxed(kUnboxedIntPtr);
call_memmove += LoadLocal(arg_from_start);
call_memmove += IntConstant(element_size);
call_memmove += SmiBinaryOp(Token::kMUL, /*is_truncating=*/true);
call_memmove += UnboxTruncate(kUnboxedIntPtr);
call_memmove +=
BinaryIntegerOp(Token::kADD, kUnboxedIntPtr, /*is_truncating=*/true);
call_memmove += LoadLocal(arg_count);
call_memmove += IntConstant(element_size);
call_memmove += SmiBinaryOp(Token::kMUL, /*is_truncating=*/true);
call_memmove += UnboxTruncate(kUnboxedIntPtr);
call_memmove += LoadThread();
call_memmove += LoadUntagged(
compiler::target::Thread::OffsetFromThread(&kMemoryMoveRuntimeEntry));
call_memmove +=
ConvertUntaggedToUnboxed(kUnboxedFfiIntPtr); // function address.
call_memmove += CCall(3);
call_memmove += Drop();
call_memmove += Goto(done);

body.current = done;
body += NullConstant();
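The call_memmove fragment above computes each argument by hand: the untagged PointerBase data address plus the start index scaled to bytes for both pointers, and the element count scaled to bytes for the length, then invokes the entry address loaded from the Thread via CCall(3). Written out as plain C++ with illustrative names (not VM API), that amounts to:

#include <cstdint>
#include <cstring>

// dst_data/src_data stand for the untagged PointerBase.data addresses of the
// destination and source typed data objects; names are illustrative.
void MemmoveLikeTheFragment(uint8_t* dst_data, intptr_t to_start,
                            const uint8_t* src_data, intptr_t from_start,
                            intptr_t count, intptr_t element_size) {
  uint8_t* dst = dst_data + to_start * element_size;          // first CCall arg
  const uint8_t* src = src_data + from_start * element_size;  // second CCall arg
  const size_t length = static_cast<size_t>(count * element_size);  // third arg
  // In the fragment, the callee is the memmove leaf runtime entry whose
  // address is loaded from the Thread; here it is simply the libc call.
  memmove(dst, src, length);
}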