@@ -6529,7 +6529,6 @@ PP(pp_unshift)
     return NORMAL;
 }
 
-
 PP_wrapped(pp_reverse, 0, 1)
 {
     dSP; dMARK;
@@ -6679,10 +6678,50 @@ PP_wrapped(pp_reverse, 0, 1)
                 }
             }
         } else {
+            STRLEN i = 0;
+            STRLEN j = len;
             char * outp = SvPVX(TARG);
-            const char * p = src + len;
-            while (p != src)
-                *outp++ = *--p;
+            /* Take a chunk of bytes from the front and from the
+             * back, reverse the bytes in each and swap the
+             * chunks over. This should generally have good
+             * performance, and is also likely to be optimised
+             * into bswap instructions by the compiler.
+             */
+#ifdef HAS_QUAD
+            while (j - i >= 16) {
+                *(U64 *)(outp + i) = _swab_64_(*(U64 *)(src + j - 8));
+                *(U64 *)(outp + j - 8) = _swab_64_(*(U64 *)(src + i));
+                i += 8;
+                j -= 8;
+            }
+
+            if (j - i >= 8) {
+                *(U32 *)(outp + i) = _swab_32_(*(U32 *)(src + j - 4));
+                *(U32 *)(outp + j - 4) = _swab_32_(*(U32 *)(src + i));
+                i += 4;
+                j -= 4;
+            }
+#else
+            while (j - i >= 8) {
+                *(U32 *)(outp + i) = _swab_32_(*(U32 *)(src + j - 4));
+                *(U32 *)(outp + j - 4) = _swab_32_(*(U32 *)(src + i));
+                i += 4;
+                j -= 4;
+            }
+#endif
+            if (j - i >= 4) {
+                *(U16 *)(outp + i) = _swab_16_(*(U16 *)(src + j - 2));
+                *(U16 *)(outp + j - 2) = _swab_16_(*(U16 *)(src + i));
+                i += 2;
+                j -= 2;
+            }
+
+            /* Swap any remaining bytes one by one. */
+            while (i < j) {
+                outp[i] = src[j - 1];
+                outp[j - 1] = src[i];
+                i++; j--;
+            }
         }
         RETURN;
     }
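To see the chunk-swap idea from this hunk in isolation: the sketch below is a minimal standalone C program, not the Perl build. `bswap64()` and `copy_reverse()` are illustrative names standing in for Perl's internal `_swab_64_` macro and the pp.c logic above, and `memcpy` replaces the `U64` casts so the sketch stays well-defined on platforms that fault on unaligned loads.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Portable stand-in for Perl's _swab_64_ macro: swap adjacent
 * bytes, then adjacent 16-bit words, then the 32-bit halves. */
static uint64_t bswap64(uint64_t v) {
    v = ((v & 0x00ff00ff00ff00ffULL) << 8)  | ((v >> 8)  & 0x00ff00ff00ff00ffULL);
    v = ((v & 0x0000ffff0000ffffULL) << 16) | ((v >> 16) & 0x0000ffff0000ffffULL);
    return (v << 32) | (v >> 32);
}

/* Copy the len bytes at src into dst in reversed order. */
static void copy_reverse(char *dst, const char *src, size_t len) {
    size_t i = 0, j = len;

    /* Peel one 8-byte chunk off each end per iteration, byte-swap
     * each chunk, and store it at the opposite end of dst. */
    while (j - i >= 16) {
        uint64_t front, back;
        memcpy(&front, src + i, 8);
        memcpy(&back, src + j - 8, 8);
        front = bswap64(front);
        back = bswap64(back);
        memcpy(dst + i, &back, 8);      /* reversed back chunk -> front */
        memcpy(dst + j - 8, &front, 8); /* reversed front chunk -> back */
        i += 8;
        j -= 8;
    }

    /* Copy the remaining middle bytes one by one. */
    while (i < j) {
        dst[i] = src[j - 1];
        dst[j - 1] = src[i];
        i++;
        j--;
    }
}

int main(void) {
    const char *s = "Hello, chunked reversal!";
    char out[32] = {0};
    copy_reverse(out, s, strlen(s));
    printf("%s\n", out); /* prints "!lasrever deknuhc ,olleH" */
    return 0;
}
```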
@@ -6695,8 +6734,8 @@ PP_wrapped(pp_reverse, 0, 1)
 
     if (len > 1) {
         /* The traditional way, operate on the current byte buffer */
-        char * down;
         if (DO_UTF8(TARG)) { /* first reverse each character */
+            char * down;
             U8* s = (U8*)SvPVX(TARG);
             const U8* send = (U8*)(s + len);
             while (s < send) {
@@ -6720,11 +6759,53 @@ PP_wrapped(pp_reverse, 0, 1)
             }
             up = SvPVX(TARG);
         }
-        down = SvPVX(TARG) + len - 1;
-        while (down > up) {
-            const char tmp = *up;
-            *up++ = *down;
-            *down-- = tmp;
+        STRLEN i = 0;
+        STRLEN j = len;
+        /* Reverse the buffer in place, in chunks where possible */
+#ifdef HAS_QUAD
+        while (j - i >= 16) {
+            U64 lchunk = _swab_64_(*(U64 *)(up + j - 8));
+            U64 rchunk = _swab_64_(*(U64 *)(up + i));
+            *(U64 *)(up + i) = lchunk;
+            *(U64 *)(up + j - 8) = rchunk;
+            i += 8;
+            j -= 8;
+        }
+
+        if (j - i >= 8) {
+            U32 lchunk = _swab_32_(*(U32 *)(up + j - 4));
+            U32 rchunk = _swab_32_(*(U32 *)(up + i));
+            *(U32 *)(up + i) = lchunk;
+            *(U32 *)(up + j - 4) = rchunk;
+            i += 4;
+            j -= 4;
+        }
+#else
+        while (j - i >= 8) {
+            U32 lchunk = _swab_32_(*(U32 *)(up + j - 4));
+            U32 rchunk = _swab_32_(*(U32 *)(up + i));
+            *(U32 *)(up + i) = lchunk;
+            *(U32 *)(up + j - 4) = rchunk;
+            i += 4;
+            j -= 4;
+        }
+#endif
+        if (j - i >= 4) {
+            U16 lchunk = _swab_16_(*(U16 *)(up + j - 2));
+            U16 rchunk = _swab_16_(*(U16 *)(up + i));
+            *(U16 *)(up + i) = lchunk;
+            *(U16 *)(up + j - 2) = rchunk;
+            i += 2;
+            j -= 2;
+        }
+
+        /* Finally, swap any remaining bytes one-by-one. */
+        while (i < j) {
+            unsigned char tmp = up[i];
+            up[i] = up[j - 1];
+            up[j - 1] = tmp;
+            i++;
+            j--;
         }
     }
     (void)SvPOK_only_UTF8(TARG);
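The in-place hunk above loads both `lchunk` and `rchunk` before either store because source and destination are the same buffer; storing first would overwrite bytes the second load still needs. Below is a minimal sketch of that pattern, reduced to 16-bit chunks for brevity. `swab16()` and `reverse_inplace()` are illustrative stand-ins for `_swab_16_` and the pp.c logic, not Perl API.

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative stand-in for Perl's _swab_16_ macro. */
static uint16_t swab16(uint16_t v) {
    return (uint16_t)((v << 8) | (v >> 8));
}

/* Reverse buf[0..len) in place, two bytes at a time. */
static void reverse_inplace(char *buf, size_t len) {
    size_t i = 0, j = len;

    while (j - i >= 4) {
        uint16_t front, back;
        memcpy(&front, buf + i, 2);     /* load BOTH chunks first...   */
        memcpy(&back, buf + j - 2, 2);
        front = swab16(front);
        back = swab16(back);
        memcpy(buf + i, &back, 2);      /* ...then store, because the  */
        memcpy(buf + j - 2, &front, 2); /* buffer is its own source    */
        i += 2;
        j -= 2;
    }

    /* At most 3 bytes remain in the middle; swap them one by one. */
    while (i < j) {
        char tmp = buf[i];
        buf[i] = buf[j - 1];
        buf[j - 1] = tmp;
        i++;
        j--;
    }
}
```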