
Commit 9dfe467

Authored Jul 28, 2022
Merge pull request #474 from Demindiro/x86_64-mem-align-dest
2 parents bbf14f3 + ef37a23 · commit 9dfe467

File tree

1 file changed: +71 -33 lines changed

src/mem/x86_64.rs

+71 -33
@@ -16,6 +16,7 @@
 // feature is present at compile-time. We don't bother detecting other features.
 // Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
 
+use core::arch::asm;
 use core::intrinsics;
 use core::mem;
 
@@ -34,16 +35,26 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
 
 #[inline(always)]
 #[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
-        "repe movsq (%rsi), (%rdi)",
-        "mov {byte_count:e}, %ecx",
-        "repe movsb (%rsi), (%rdi)",
-        byte_count = in(reg) byte_count,
+pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // Separating the blocks gives the compiler more freedom to reorder instructions.
+    asm!(
+        "rep movsb",
+        inout("ecx") pre_byte_count => _,
+        inout("rdi") dest => dest,
+        inout("rsi") src => src,
+        options(att_syntax, nostack, preserves_flags)
+    );
+    asm!(
+        "rep movsq",
         inout("rcx") qword_count => _,
+        inout("rdi") dest => dest,
+        inout("rsi") src => src,
+        options(att_syntax, nostack, preserves_flags)
+    );
+    asm!(
+        "rep movsb",
+        inout("ecx") byte_count => _,
         inout("rdi") dest => _,
         inout("rsi") src => _,
         options(att_syntax, nostack, preserves_flags)
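The new copy_forward splits the copy into three `rep` blocks: a byte-wise head that runs until `dest` reaches an 8-byte boundary, a qword-wise body, and a byte-wise tail. A rough pure-Rust model of that shape is sketched below; the helper name `copy_forward_model` and its structure are only illustrative, not part of this patch:

```rust
/// Pure-Rust model of the three-stage forward copy: a byte-wise head until
/// `dest` is 8-byte aligned, then 8-byte chunks, then a byte-wise tail.
/// Illustrative only; the real routine drives this with three `rep` blocks.
unsafe fn copy_forward_model(mut dest: *mut u8, mut src: *const u8, count: usize) {
    let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    let rest = count - pre_byte_count;
    let (qword_count, byte_count) = (rest >> 3, rest & 0b111);

    for _ in 0..pre_byte_count {
        // Head: copy single bytes until `dest` hits an 8-byte boundary.
        *dest = *src;
        dest = dest.add(1);
        src = src.add(1);
    }
    for _ in 0..qword_count {
        // Body: `dest` is now aligned; `src` may not be, hence read_unaligned.
        (dest as *mut u64).write((src as *const u64).read_unaligned());
        dest = dest.add(8);
        src = src.add(8);
    }
    for _ in 0..byte_count {
        // Tail: the remaining 0..=7 bytes.
        *dest = *src;
        dest = dest.add(1);
        src = src.add(1);
    }
}

fn main() {
    let src = *b"hello, rep movs!";
    let mut dst = [0u8; 16];
    unsafe { copy_forward_model(dst.as_mut_ptr(), src.as_ptr(), src.len()) };
    assert_eq!(&dst, &src);
}
```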
@@ -52,22 +63,28 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
 
 #[inline(always)]
 pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // We can't separate this block due to std/cld
+    asm!(
         "std",
-        "repe movsq (%rsi), (%rdi)",
-        "movl {byte_count:e}, %ecx",
-        "addq $7, %rdi",
-        "addq $7, %rsi",
-        "repe movsb (%rsi), (%rdi)",
+        "rep movsb",
+        "sub $7, %rsi",
+        "sub $7, %rdi",
+        "mov {qword_count}, %rcx",
+        "rep movsq",
+        "test {pre_byte_count:e}, {pre_byte_count:e}",
+        "add $7, %rsi",
+        "add $7, %rdi",
+        "mov {pre_byte_count:e}, %ecx",
+        "rep movsb",
         "cld",
-        byte_count = in(reg) byte_count,
-        inout("rcx") qword_count => _,
-        inout("rdi") dest.add(count).wrapping_sub(8) => _,
-        inout("rsi") src.add(count).wrapping_sub(8) => _,
-        options(att_syntax, nostack)
+        pre_byte_count = in(reg) pre_byte_count,
+        qword_count = in(reg) qword_count,
+        inout("ecx") byte_count => _,
+        inout("rdi") dest.add(count - 1) => _,
+        inout("rsi") src.add(count - 1) => _,
+        // We modify flags, but we restore it afterwards
+        options(att_syntax, nostack, preserves_flags)
     );
 }
 
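copy_backward stays in a single `asm!` block because `std` sets the direction flag (DF=1), making every `rep movs` walk downward from `dest.add(count - 1)` and `src.add(count - 1)`, and `cld` must restore the forward direction before any other code runs. As a semantic reference only (not this patch's code), a backward byte copy can be modelled in plain Rust like this:

```rust
/// Pure-Rust model of a backward byte copy: walk from the last byte down to
/// the first, which is what the DF=1 (`std`) `rep movs` sequence does before
/// `cld` restores the forward direction. Illustrative only; it skips the
/// qword-sized middle stage the real code uses.
unsafe fn copy_backward_model(dest: *mut u8, src: *const u8, count: usize) {
    for i in (0..count).rev() {
        *dest.add(i) = *src.add(i);
    }
}

fn main() {
    // Overlapping move where dest sits above src: copying backward keeps the
    // not-yet-copied source bytes intact.
    let mut buf = *b"abcdef__";
    unsafe { copy_backward_model(buf.as_mut_ptr().add(2), buf.as_ptr(), 6) };
    assert_eq!(&buf, b"ababcdef");
}
```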
@@ -86,18 +103,29 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
 
 #[inline(always)]
 #[cfg(not(target_feature = "ermsb"))]
-pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
-    let qword_count = count >> 3;
-    let byte_count = count & 0b111;
-    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
-    core::arch::asm!(
-        "repe stosq %rax, (%rdi)",
-        "mov {byte_count:e}, %ecx",
-        "repe stosb %al, (%rdi)",
-        byte_count = in(reg) byte_count,
+pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
+    let c = c as u64 * 0x0101_0101_0101_0101;
+    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
+    // Separating the blocks gives the compiler more freedom to reorder instructions.
+    asm!(
+        "rep stosb",
+        inout("ecx") pre_byte_count => _,
+        inout("rdi") dest => dest,
+        in("rax") c,
+        options(att_syntax, nostack, preserves_flags)
+    );
+    asm!(
+        "rep stosq",
         inout("rcx") qword_count => _,
+        inout("rdi") dest => dest,
+        in("rax") c,
+        options(att_syntax, nostack, preserves_flags)
+    );
+    asm!(
+        "rep stosb",
+        inout("ecx") byte_count => _,
         inout("rdi") dest => _,
-        in("rax") (c as u64) * 0x0101010101010101,
+        in("rax") c,
         options(att_syntax, nostack, preserves_flags)
     );
 }
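set_bytes now broadcasts the fill byte once, up front: multiplying a u8 by 0x0101_0101_0101_0101 repeats it in every byte lane of a u64, so `rep stosq` stores the same pattern that eight single-byte stores would. A small standalone check of that identity (illustrative only, not part of the patch):

```rust
// Multiplying a byte by 0x0101_0101_0101_0101 repeats it in every byte lane
// of a u64, so one `rep stosq` store writes the same pattern as eight
// single-byte stores.
fn broadcast(c: u8) -> u64 {
    c as u64 * 0x0101_0101_0101_0101
}

fn main() {
    assert_eq!(broadcast(0xAB), 0xABAB_ABAB_ABAB_ABAB);
    assert_eq!(broadcast(0xAB).to_ne_bytes(), [0xAB; 8]);
}
```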
@@ -156,3 +184,13 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
         c16(a.cast(), b.cast(), n)
     }
 }
+
+/// Determine optimal parameters for a `rep` instruction.
+fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
+    // Unaligned writes are still slow on modern processors, so align the destination address.
+    let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
+    count -= pre_byte_count;
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    (pre_byte_count, qword_count, byte_count)
+}
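As a worked example of the arithmetic in rep_param (the values below are hypothetical, chosen only for illustration): a destination whose address is 5 mod 8 with count = 100 gets 3 head bytes to reach the next 8-byte boundary, then 12 qwords (96 bytes), then 1 tail byte. A standalone sketch of the same calculation, with a plain usize standing in for the pointer:

```rust
// Standalone sketch of the `rep_param` arithmetic, taking the destination
// address as a usize instead of a pointer (hypothetical helper, not part of
// the patch).
fn rep_param(dest: usize, mut count: usize) -> (usize, usize, usize) {
    let pre_byte_count = ((8 - (dest & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    (pre_byte_count, count >> 3, count & 0b111)
}

fn main() {
    // Address 0x1005 is 5 mod 8: 3 head bytes, 12 qwords (96 bytes), 1 tail byte.
    assert_eq!(rep_param(0x1005, 100), (3, 12, 1));
    // An already-aligned destination needs no head bytes.
    assert_eq!(rep_param(0x2000, 32), (0, 4, 0));
}
```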
