16
16
// feature is present at compile-time. We don't bother detecting other features.
17
17
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
18
18
19
+ use core:: arch:: asm;
19
20
use core:: intrinsics;
20
21
use core:: mem;
21
22
@@ -34,16 +35,26 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
34
35
35
36
#[ inline( always) ]
36
37
#[ cfg( not( target_feature = "ermsb" ) ) ]
37
- pub unsafe fn copy_forward ( dest : * mut u8 , src : * const u8 , count : usize ) {
38
- let qword_count = count >> 3 ;
39
- let byte_count = count & 0b111 ;
40
- // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
41
- core:: arch:: asm!(
42
- "repe movsq (%rsi), (%rdi)" ,
43
- "mov {byte_count:e}, %ecx" ,
44
- "repe movsb (%rsi), (%rdi)" ,
45
- byte_count = in( reg) byte_count,
38
+ pub unsafe fn copy_forward ( mut dest : * mut u8 , mut src : * const u8 , count : usize ) {
39
+ let ( pre_byte_count, qword_count, byte_count) = rep_param ( dest, count) ;
40
+ // Separating the blocks gives the compiler more freedom to reorder instructions.
41
+ asm ! (
42
+ "rep movsb" ,
43
+ inout( "ecx" ) pre_byte_count => _,
44
+ inout( "rdi" ) dest => dest,
45
+ inout( "rsi" ) src => src,
46
+ options( att_syntax, nostack, preserves_flags)
47
+ ) ;
48
+ asm ! (
49
+ "rep movsq" ,
46
50
inout( "rcx" ) qword_count => _,
51
+ inout( "rdi" ) dest => dest,
52
+ inout( "rsi" ) src => src,
53
+ options( att_syntax, nostack, preserves_flags)
54
+ ) ;
55
+ asm ! (
56
+ "rep movsb" ,
57
+ inout( "ecx" ) byte_count => _,
47
58
inout( "rdi" ) dest => _,
48
59
inout( "rsi" ) src => _,
49
60
options( att_syntax, nostack, preserves_flags)
@@ -52,22 +63,28 @@ pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
52
63
53
64
#[ inline( always) ]
54
65
pub unsafe fn copy_backward ( dest : * mut u8 , src : * const u8 , count : usize ) {
55
- let qword_count = count >> 3 ;
56
- let byte_count = count & 0b111 ;
57
- // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
58
- core:: arch:: asm!(
66
+ let ( pre_byte_count, qword_count, byte_count) = rep_param ( dest, count) ;
67
+ // We can't separate this block due to std/cld
68
+ asm ! (
59
69
"std" ,
60
- "repe movsq (%rsi), (%rdi)" ,
61
- "movl {byte_count:e}, %ecx" ,
62
- "addq $7, %rdi" ,
63
- "addq $7, %rsi" ,
64
- "repe movsb (%rsi), (%rdi)" ,
70
+ "rep movsb" ,
71
+ "sub $7, %rsi" ,
72
+ "sub $7, %rdi" ,
73
+ "mov {qword_count}, %rcx" ,
74
+ "rep movsq" ,
75
+ "test {pre_byte_count:e}, {pre_byte_count:e}" ,
76
+ "add $7, %rsi" ,
77
+ "add $7, %rdi" ,
78
+ "mov {pre_byte_count:e}, %ecx" ,
79
+ "rep movsb" ,
65
80
"cld" ,
66
- byte_count = in( reg) byte_count,
67
- inout( "rcx" ) qword_count => _,
68
- inout( "rdi" ) dest. add( count) . wrapping_sub( 8 ) => _,
69
- inout( "rsi" ) src. add( count) . wrapping_sub( 8 ) => _,
70
- options( att_syntax, nostack)
81
+ pre_byte_count = in( reg) pre_byte_count,
82
+ qword_count = in( reg) qword_count,
83
+ inout( "ecx" ) byte_count => _,
84
+ inout( "rdi" ) dest. add( count - 1 ) => _,
85
+ inout( "rsi" ) src. add( count - 1 ) => _,
86
+ // We modify flags, but we restore it afterwards
87
+ options( att_syntax, nostack, preserves_flags)
71
88
) ;
72
89
}
73
90
@@ -86,18 +103,29 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
86
103
87
104
#[ inline( always) ]
88
105
#[ cfg( not( target_feature = "ermsb" ) ) ]
89
- pub unsafe fn set_bytes ( dest : * mut u8 , c : u8 , count : usize ) {
90
- let qword_count = count >> 3 ;
91
- let byte_count = count & 0b111 ;
92
- // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
93
- core:: arch:: asm!(
94
- "repe stosq %rax, (%rdi)" ,
95
- "mov {byte_count:e}, %ecx" ,
96
- "repe stosb %al, (%rdi)" ,
97
- byte_count = in( reg) byte_count,
106
+ pub unsafe fn set_bytes ( mut dest : * mut u8 , c : u8 , count : usize ) {
107
+ let c = c as u64 * 0x0101_0101_0101_0101 ;
108
+ let ( pre_byte_count, qword_count, byte_count) = rep_param ( dest, count) ;
109
+ // Separating the blocks gives the compiler more freedom to reorder instructions.
110
+ asm ! (
111
+ "rep stosb" ,
112
+ inout( "ecx" ) pre_byte_count => _,
113
+ inout( "rdi" ) dest => dest,
114
+ in( "rax" ) c,
115
+ options( att_syntax, nostack, preserves_flags)
116
+ ) ;
117
+ asm ! (
118
+ "rep stosq" ,
98
119
inout( "rcx" ) qword_count => _,
120
+ inout( "rdi" ) dest => dest,
121
+ in( "rax" ) c,
122
+ options( att_syntax, nostack, preserves_flags)
123
+ ) ;
124
+ asm ! (
125
+ "rep stosb" ,
126
+ inout( "ecx" ) byte_count => _,
99
127
inout( "rdi" ) dest => _,
100
- in( "rax" ) ( c as u64 ) * 0x0101010101010101 ,
128
+ in( "rax" ) c ,
101
129
options( att_syntax, nostack, preserves_flags)
102
130
) ;
103
131
}
@@ -156,3 +184,13 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
156
184
c16 ( a. cast ( ) , b. cast ( ) , n)
157
185
}
158
186
}
187
+
188
/// Determine optimal parameters for a `rep` instruction.
///
/// Splits a `count`-byte operation on `dest` into `(pre_byte_count, qword_count, byte_count)`:
///
/// * `pre_byte_count` - bytes needed to advance `dest` to the next 8-byte boundary
///   (`0..=7`), capped at `count`;
/// * `qword_count` - number of whole 8-byte words in what remains;
/// * `byte_count` - bytes (`0..=7`) left over after the qwords.
///
/// The parts always satisfy `pre_byte_count + 8 * qword_count + byte_count == count`.
fn rep_param(dest: *mut u8, count: usize) -> (usize, usize, usize) {
    // Unaligned writes are still slow on modern processors, so align the destination address.
    let misalignment = dest as usize & 0b111;
    // The outer mask maps an already-aligned address (8 - 0 = 8) to 0 leading bytes.
    let pre_byte_count = ((8 - misalignment) & 0b111).min(count);
    let remaining = count - pre_byte_count;
    (pre_byte_count, remaining >> 3, remaining & 0b111)
}
0 commit comments