Skip to content

Commit ef37a23

Browse files
committed
Remove branches around rep movsb/stosb
While keeping the branches is measurably faster on older CPUs, removing them keeps the code smaller and is likely more beneficial for newer CPUs.
1 parent ae557bd commit ef37a23

File tree

1 file changed

+28
-45
lines changed

1 file changed

+28
-45
lines changed

src/mem/x86_64.rs

+28-45
Original file line numberDiff line numberDiff line change
@@ -38,33 +38,27 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
3838
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
3939
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
4040
// Separating the blocks gives the compiler more freedom to reorder instructions.
41-
// It also allows us to trivially skip the rep movsb, which is faster when memcpying
42-
// aligned data.
43-
if pre_byte_count > 0 {
44-
asm!(
45-
"rep movsb",
46-
inout("ecx") pre_byte_count => _,
47-
inout("rdi") dest => dest,
48-
inout("rsi") src => src,
49-
options(att_syntax, nostack, preserves_flags)
50-
);
51-
}
41+
asm!(
42+
"rep movsb",
43+
inout("ecx") pre_byte_count => _,
44+
inout("rdi") dest => dest,
45+
inout("rsi") src => src,
46+
options(att_syntax, nostack, preserves_flags)
47+
);
5248
asm!(
5349
"rep movsq",
5450
inout("rcx") qword_count => _,
5551
inout("rdi") dest => dest,
5652
inout("rsi") src => src,
5753
options(att_syntax, nostack, preserves_flags)
5854
);
59-
if byte_count > 0 {
60-
asm!(
61-
"rep movsb",
62-
inout("ecx") byte_count => _,
63-
inout("rdi") dest => _,
64-
inout("rsi") src => _,
65-
options(att_syntax, nostack, preserves_flags)
66-
);
67-
}
55+
asm!(
56+
"rep movsb",
57+
inout("ecx") byte_count => _,
58+
inout("rdi") dest => _,
59+
inout("rsi") src => _,
60+
options(att_syntax, nostack, preserves_flags)
61+
);
6862
}
6963

7064
#[inline(always)]
@@ -73,21 +67,16 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
7367
// We can't separate this block due to std/cld
7468
asm!(
7569
"std",
76-
"test %ecx, %ecx",
77-
"jz 1f",
7870
"rep movsb",
79-
"1:",
8071
"sub $7, %rsi",
8172
"sub $7, %rdi",
8273
"mov {qword_count}, %rcx",
8374
"rep movsq",
8475
"test {pre_byte_count:e}, {pre_byte_count:e}",
85-
"jz 1f",
8676
"add $7, %rsi",
8777
"add $7, %rdi",
8878
"mov {pre_byte_count:e}, %ecx",
8979
"rep movsb",
90-
"1:",
9180
"cld",
9281
pre_byte_count = in(reg) pre_byte_count,
9382
qword_count = in(reg) qword_count,
@@ -118,33 +107,27 @@ pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
118107
let c = c as u64 * 0x0101_0101_0101_0101;
119108
let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
120109
// Separating the blocks gives the compiler more freedom to reorder instructions.
121-
// It also allows us to trivially skip the rep stosb, which is faster when memcpying
122-
// aligned data.
123-
if pre_byte_count > 0 {
124-
asm!(
125-
"rep stosb",
126-
inout("ecx") pre_byte_count => _,
127-
inout("rdi") dest => dest,
128-
in("rax") c,
129-
options(att_syntax, nostack, preserves_flags)
130-
);
131-
}
110+
asm!(
111+
"rep stosb",
112+
inout("ecx") pre_byte_count => _,
113+
inout("rdi") dest => dest,
114+
in("rax") c,
115+
options(att_syntax, nostack, preserves_flags)
116+
);
132117
asm!(
133118
"rep stosq",
134119
inout("rcx") qword_count => _,
135120
inout("rdi") dest => dest,
136121
in("rax") c,
137122
options(att_syntax, nostack, preserves_flags)
138123
);
139-
if byte_count > 0 {
140-
asm!(
141-
"rep stosb",
142-
inout("ecx") byte_count => _,
143-
inout("rdi") dest => _,
144-
in("rax") c,
145-
options(att_syntax, nostack, preserves_flags)
146-
);
147-
}
124+
asm!(
125+
"rep stosb",
126+
inout("ecx") byte_count => _,
127+
inout("rdi") dest => _,
128+
in("rax") c,
129+
options(att_syntax, nostack, preserves_flags)
130+
);
148131
}
149132

150133
#[inline(always)]

0 commit comments

Comments
 (0)