@@ -41,30 +41,43 @@ unsafe fn read_usize_unaligned(x: *const usize) -> usize {
     core::mem::transmute(x_read)
 }
 
+/// Loads a `T`-sized chunk from `src` into `dst` at offset `offset`, if that does not exceed
+/// `load_sz`. The offset pointers must both be `T`-aligned. Returns the new offset, advanced by the
+/// chunk size if a load happened.
+#[cfg(not(feature = "mem-unaligned"))]
+#[inline(always)]
+unsafe fn load_chunk_aligned<T: Copy>(
+    src: *const usize,
+    dst: *mut usize,
+    load_sz: usize,
+    offset: usize,
+) -> usize {
+    let chunk_sz = core::mem::size_of::<T>();
+    if (load_sz & chunk_sz) != 0 {
+        *dst.wrapping_byte_add(offset).cast::<T>() = *src.wrapping_byte_add(offset).cast::<T>();
+        offset | chunk_sz
+    } else {
+        offset
+    }
+}
+
 /// Load `load_sz` many bytes from `src`, which must be usize-aligned. Acts as if we did a `usize`
 /// read with the out-of-bounds part filled with 0s.
 /// `load_sz` must be strictly less than `WORD_SIZE`.
 #[cfg(not(feature = "mem-unaligned"))]
 #[inline(always)]
 unsafe fn load_aligned_partial(src: *const usize, load_sz: usize) -> usize {
     debug_assert!(load_sz < WORD_SIZE);
+    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
+    // (since `load_sz < WORD_SIZE`).
+    const { assert!(WORD_SIZE <= 8) };
 
     let mut i = 0;
     let mut out = 0usize;
-    macro_rules! load_prefix {
-        ($($ty:ty)+) => {$(
-            let chunk_sz = core::mem::size_of::<$ty>();
-            if (load_sz & chunk_sz) != 0 {
-                // Since we are doing the large reads first, this must still be aligned to `chunk_sz`.
-                *(&raw mut out).wrapping_byte_add(i).cast::<$ty>() = *src.wrapping_byte_add(i).cast::<$ty>();
-                i |= chunk_sz;
-            }
-        )+};
-    }
-    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
-    // (since `load_size < WORD_SIZE`).
-    const { assert!(WORD_SIZE <= 8) };
-    load_prefix!(u32 u16 u8);
+    // We load in decreasing order, so the pointers remain sufficiently aligned for the next step.
+    i = load_chunk_aligned::<u32>(src, &raw mut out, load_sz, i);
+    i = load_chunk_aligned::<u16>(src, &raw mut out, load_sz, i);
+    i = load_chunk_aligned::<u8>(src, &raw mut out, load_sz, i);
     debug_assert!(i == load_sz);
     out
 }
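
To make the bit trick in `load_chunk_aligned` concrete: the chunk sizes are powers of two, so `load_sz & chunk_sz` tests exactly one bit of `load_sz`, and because the larger chunks run first, `offset | chunk_sz` is equivalent to `offset + chunk_sz`. Below is a minimal safe sketch, not part of the patch (the helper name `chunks_for` is ours, and it assumes `WORD_SIZE == 8`, i.e. a 64-bit target), that mirrors the chunk selection of `load_aligned_partial` without the unsafe loads:

```rust
/// Hypothetical helper mirroring `load_aligned_partial`'s chunk selection.
/// Returns the (offset, size) of each access it would perform.
fn chunks_for(load_sz: usize) -> Vec<(usize, usize)> {
    const WORD_SIZE: usize = 8; // assumption: 64-bit target, as the `const` assert permits
    assert!(load_sz < WORD_SIZE);
    let mut offset = 0;
    let mut loads = Vec::new();
    // Decreasing chunk order: u32, u16, u8, like the three `load_chunk_aligned` calls above.
    for chunk_sz in [4usize, 2, 1] {
        // `chunk_sz` is a power of two, so this tests a single bit of `load_sz`.
        if (load_sz & chunk_sz) != 0 {
            // `offset` so far is a sum of strictly larger powers of two, so it is
            // `chunk_sz`-aligned and `|` acts as `+`.
            loads.push((offset, chunk_sz));
            offset |= chunk_sz;
        }
    }
    // The set bits of `load_sz` account for every byte, hence `debug_assert!(i == load_sz)`.
    assert_eq!(offset, load_sz);
    loads
}

fn main() {
    // load_sz = 5 = 0b101: one 4-byte load at offset 0, then one 1-byte load at offset 4.
    assert_eq!(chunks_for(5), vec![(0, 4), (4, 1)]);
    // load_sz = 7 = 0b111: loads at offsets 0, 4, and 6.
    assert_eq!(chunks_for(7), vec![(0, 4), (4, 2), (6, 1)]);
}
```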
@@ -77,25 +90,19 @@ unsafe fn load_aligned_partial(src: *const usize, load_sz: usize) -> usize {
 #[inline(always)]
 unsafe fn load_aligned_end_partial(src: *const usize, load_sz: usize) -> usize {
     debug_assert!(load_sz < WORD_SIZE);
+    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
+    // (since `load_sz < WORD_SIZE`).
+    const { assert!(WORD_SIZE <= 8) };
 
     let mut i = 0;
     let mut out = 0usize;
-    let start_shift = WORD_SIZE - load_sz;
-    macro_rules! load_prefix {
-        ($($ty:ty)+) => {$(
-            let chunk_sz = core::mem::size_of::<$ty>();
-            if (load_sz & chunk_sz) != 0 {
-                // Since we are doing the small reads first, `start_shift + i` has in the mean
-                // time become aligned to `chunk_sz`.
-                *(&raw mut out).wrapping_byte_add(start_shift + i).cast::<$ty>() = *src.wrapping_byte_add(start_shift + i).cast::<$ty>();
-                i |= chunk_sz;
-            }
-        )+};
-    }
-    // We can read up to 7 bytes here, which is enough for WORD_SIZE of 8
-    // (since `load_size < WORD_SIZE`).
-    const { assert!(WORD_SIZE <= 8) };
-    load_prefix!(u8 u16 u32);
+    // Obtain pointers pointing to the beginning of the range we want to load.
+    let src_shifted = src.wrapping_byte_add(WORD_SIZE - load_sz);
+    let out_shifted = (&raw mut out).wrapping_byte_add(WORD_SIZE - load_sz);
+    // We load in increasing order, so by the time we reach `u16` things are 2-aligned etc.
+    i = load_chunk_aligned::<u8>(src_shifted, out_shifted, load_sz, i);
+    i = load_chunk_aligned::<u16>(src_shifted, out_shifted, load_sz, i);
+    i = load_chunk_aligned::<u32>(src_shifted, out_shifted, load_sz, i);
     debug_assert!(i == load_sz);
     out
 }
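
The end-partial case relies on a symmetric alignment argument: the load starts at byte `WORD_SIZE - load_sz`, which is in general only 1-aligned, so the chunks must run in increasing order; after the `u8` read the cursor is 2-aligned, and after the `u16` read it is 4-aligned. Here is a small safe sketch (the helper `end_chunk_offsets` is hypothetical, again assuming `WORD_SIZE == 8`) that checks this invariant for every valid `load_sz`:

```rust
/// Hypothetical helper mirroring `load_aligned_end_partial`'s access pattern.
/// Returns the (absolute offset, size) of each access and asserts each one is aligned.
fn end_chunk_offsets(load_sz: usize) -> Vec<(usize, usize)> {
    const WORD_SIZE: usize = 8; // assumption: 64-bit target
    assert!(load_sz < WORD_SIZE);
    let start = WORD_SIZE - load_sz; // where `src_shifted` points, relative to `src`
    let mut i = 0;
    let mut loads = Vec::new();
    // Increasing chunk order: u8, u16, u32, matching the hunk above.
    for chunk_sz in [1usize, 2, 4] {
        if (load_sz & chunk_sz) != 0 {
            let abs = start + i; // absolute byte offset of this access within the word
            // `start + load_sz == WORD_SIZE` and `i` holds the low bits of `load_sz`
            // already handled, so `abs` is always a multiple of `chunk_sz`.
            assert_eq!(abs % chunk_sz, 0, "every access stays `chunk_sz`-aligned");
            loads.push((abs, chunk_sz));
            i |= chunk_sz;
        }
    }
    assert_eq!(i, load_sz);
    loads
}

fn main() {
    // load_sz = 5: start at byte 3; the 1-byte load at 3 leaves the cursor 4-aligned
    // for the 4-byte load at 4.
    assert_eq!(end_chunk_offsets(5), vec![(3, 1), (4, 4)]);
    // load_sz = 3: 1-byte load at byte 5, then a 2-aligned 2-byte load at byte 6.
    assert_eq!(end_chunk_offsets(3), vec![(5, 1), (6, 2)]);
    // Exhaustively check the alignment invariant for all valid sizes.
    for sz in 0..8 {
        end_chunk_offsets(sz);
    }
}
```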