@@ -81,8 +81,8 @@ cfg_select! {
8181                // use `loadu`, which supports unaligned loading. 
8282                let  chunk = unsafe  {  _mm_loadu_si128( chunk. as_ptr( )  as  * const  __m128i)  } ; 
8383
84-                 // For character in the chunk, see if its byte value is < 0, which  
85-                 // indicates that it's part of a UTF-8 char. 
84+                 // For each  character in the chunk, see if its byte value is < 0, 
85+                 // which  indicates that it's part of a UTF-8 char. 
8686                let  multibyte_test = _mm_cmplt_epi8( chunk,  _mm_set1_epi8( 0 ) ) ; 
8787                // Create a bit mask from the comparison results. 
8888                let  multibyte_mask = _mm_movemask_epi8( multibyte_test) ; 
@@ -132,8 +132,111 @@ cfg_select! {
132132            } 
133133        } 
134134    } 
135+     target_arch = "loongarch64"  => { 
136+         fn  analyze_source_file_dispatch( 
137+             src:  & str , 
138+             lines:  & mut  Vec <RelativeBytePos >, 
139+             multi_byte_chars:  & mut  Vec <MultiByteChar >, 
140+         )  { 
141+             use  std:: arch:: is_loongarch_feature_detected; 
142+ 
143+             if  is_loongarch_feature_detected!( "lsx" )  { 
144+                 unsafe  { 
145+                     analyze_source_file_lsx( src,  lines,  multi_byte_chars) ; 
146+                 } 
147+             }  else { 
148+                 analyze_source_file_generic( 
149+                     src, 
150+                     src. len( ) , 
151+                     RelativeBytePos :: from_u32( 0 ) , 
152+                     lines, 
153+                     multi_byte_chars, 
154+                 ) ; 
155+             } 
156+         } 
157+ 
158+         /// Checks 16 byte chunks of text at a time. If the chunk contains 
159+          /// something other than printable ASCII characters and newlines, the 
160+          /// function falls back to the generic implementation. Otherwise it uses 
161+          /// LSX intrinsics to quickly find all newlines. 
162+          #[ target_feature( enable = "lsx" ) ] 
163+         unsafe  fn  analyze_source_file_lsx( 
164+             src:  & str , 
165+             lines:  & mut  Vec <RelativeBytePos >, 
166+             multi_byte_chars:  & mut  Vec <MultiByteChar >, 
167+         )  { 
168+             use  std:: arch:: loongarch64:: * ; 
169+ 
170+             const  CHUNK_SIZE :  usize  = 16 ; 
171+ 
172+             let  ( chunks,  tail)  = src. as_bytes( ) . as_chunks:: <CHUNK_SIZE >( ) ; 
173+ 
174+             // This variable keeps track of where we should start decoding a 
175+             // chunk. If a multi-byte character spans across chunk boundaries, 
176+             // we need to skip that part in the next chunk because we already 
177+             // handled it. 
178+             let  mut  intra_chunk_offset = 0 ; 
179+ 
180+             for  ( chunk_index,  chunk)  in chunks. iter( ) . enumerate( )  { 
181+                 // All LSX memory instructions support unaligned access, so using 
182+                 // vld is fine. 
183+                 let  chunk = unsafe  {  lsx_vld:: <0 >( chunk. as_ptr( )  as  * const  i8 )  } ; 
184+ 
185+                 // For each character in the chunk, see if its byte value is < 0, 
186+                 // which indicates that it's part of a UTF-8 char. 
187+                 let  multibyte_mask = lsx_vmskltz_b( chunk) ; 
188+                 // Create a bit mask from the comparison results. 
189+                 let  multibyte_mask = lsx_vpickve2gr_w:: <0 >( multibyte_mask) ; 
190+ 
191+                 // If the bit mask is all zero, we only have ASCII chars here: 
192+                 if  multibyte_mask == 0  { 
193+                     assert!( intra_chunk_offset == 0 ) ; 
194+ 
195+                     // Check for newlines in the chunk 
196+                     let  newlines_test = lsx_vseqi_b:: <{ b'\n'  as  i32 } >( chunk) ; 
197+                     let  newlines_mask = lsx_vmskltz_b( newlines_test) ; 
198+                     let  mut  newlines_mask = lsx_vpickve2gr_w:: <0 >( newlines_mask) ; 
199+ 
200+                     let  output_offset = RelativeBytePos :: from_usize( chunk_index *  CHUNK_SIZE  + 1 ) ; 
201+ 
202+                     while  newlines_mask != 0  { 
203+                         let  index = newlines_mask. trailing_zeros( ) ; 
204+ 
205+                         lines. push( RelativeBytePos ( index)  + output_offset) ; 
206+ 
207+                         // Clear the bit, so we can find the next one. 
208+                         newlines_mask &= newlines_mask - 1 ; 
209+                     } 
210+                 }  else { 
211+                     // The slow path. 
212+                     // There are multibyte chars in here, fallback to generic decoding. 
213+                     let  scan_start = chunk_index *  CHUNK_SIZE  + intra_chunk_offset; 
214+                     intra_chunk_offset = analyze_source_file_generic( 
215+                         & src[ scan_start..] , 
216+                         CHUNK_SIZE  - intra_chunk_offset, 
217+                         RelativeBytePos :: from_usize( scan_start) , 
218+                         lines, 
219+                         multi_byte_chars, 
220+                     ) ; 
221+                 } 
222+             } 
223+ 
224+             // There might still be a tail left to analyze 
225+             let  tail_start = src. len( )  - tail. len( )  + intra_chunk_offset; 
226+             if  tail_start < src. len( )  { 
227+                 analyze_source_file_generic( 
228+                     & src[ tail_start..] , 
229+                     src. len( )  - tail_start, 
230+                     RelativeBytePos :: from_usize( tail_start) , 
231+                     lines, 
232+                     multi_byte_chars, 
233+                 ) ; 
234+             } 
235+         } 
236+     } 
135237    _ => { 
136-         // The target (or compiler version) does not support SSE2 ... 
238+         // The target (or compiler version) does not support vector instructions 
239+         // our specialized implementations need (x86 SSE2, loongarch64 LSX)... 
137240        fn  analyze_source_file_dispatch( 
138241            src:  & str , 
139242            lines:  & mut  Vec <RelativeBytePos >, 
0 commit comments