Skip to content

Commit

Permalink
Implement Data.Text.toTitle directly, without streaming
Browse files Browse the repository at this point in the history
  • Loading branch information
Bodigrim committed Aug 28, 2024
1 parent c1cabeb commit e954933
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 38 deletions.
20 changes: 10 additions & 10 deletions cbits/aarch64/measure_off.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,16 @@ static inline const ssize_t measure_off_neon(const uint8_t *src, const uint8_t *
while (src < srcend - 63){
uint8x16_t w128[4] = {vld1q_u8(src), vld1q_u8(src + 16), vld1q_u8(src + 32), vld1q_u8(src + 48)};
// Which bytes are either < 128 or >= 192?
uint8x16_t mask0 = vcgtq_s8(w128[0], vdupq_n_u8(0xBF));
uint8x16_t mask1 = vcgtq_s8(w128[1], vdupq_n_u8(0xBF));
uint8x16_t mask2 = vcgtq_s8(w128[2], vdupq_n_u8(0xBF));
uint8x16_t mask3 = vcgtq_s8(w128[3], vdupq_n_u8(0xBF));
uint8x16_t mask0 = vcgtq_s8((int8x16_t)w128[0], vdupq_n_s8(0xBF));
uint8x16_t mask1 = vcgtq_s8((int8x16_t)w128[1], vdupq_n_s8(0xBF));
uint8x16_t mask2 = vcgtq_s8((int8x16_t)w128[2], vdupq_n_s8(0xBF));
uint8x16_t mask3 = vcgtq_s8((int8x16_t)w128[3], vdupq_n_s8(0xBF));

uint8x16_t mask01 = vaddq_s8(mask0, mask1);
uint8x16_t mask23 = vaddq_s8(mask2, mask3);
uint8x16_t mask = vaddq_s8(mask01, mask23);
uint8x16_t mask01 = vaddq_u8(mask0, mask1);
uint8x16_t mask23 = vaddq_u8(mask2, mask3);
uint8x16_t mask = vaddq_u8(mask01, mask23);

size_t leads = (size_t)(-vaddvq_s8(mask));
size_t leads = (size_t)(-vaddvq_s8((int8x16_t)mask));

if (cnt < leads) break;
cnt-= leads;
Expand All @@ -79,8 +79,8 @@ static inline const ssize_t measure_off_neon(const uint8_t *src, const uint8_t *
while (src < srcend - 15){
uint8x16_t w128 = vld1q_u8(src);
// Which bytes are either < 128 or >= 192?
uint8x16_t mask = vcgtq_s8(w128, vdupq_n_u8(0xBF));
size_t leads = (size_t)(-vaddvq_s8(mask));
uint8x16_t mask = vcgtq_s8((int8x16_t)w128, vdupq_n_s8(0xBF));
size_t leads = (size_t)(-vaddvq_s8((int8x16_t)mask));
if (cnt < leads) break;
cnt-= leads;
src+= 16;
Expand Down
6 changes: 4 additions & 2 deletions src/Data/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ import qualified Prelude as P
import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter,
reverseIter_, unsafeHead, unsafeTail, iterArray, reverseIterArray)
import Data.Text.Internal.Search (indices)
import Data.Text.Internal.Transformation (mapNonEmpty, toCaseFoldNonEmpty, toLowerNonEmpty, toUpperNonEmpty, filter_)
import Data.Text.Internal.Transformation (mapNonEmpty, toCaseFoldNonEmpty, toLowerNonEmpty, toUpperNonEmpty, toTitleNonEmpty, filter_)
#if defined(__HADDOCK__)
import Data.ByteString (ByteString)
import qualified Data.Text.Lazy as L
Expand Down Expand Up @@ -900,7 +900,9 @@ toUpper = \t ->
--
-- @since 1.0.0.0
toTitle :: Text -> Text
toTitle t = unstream (S.toTitle (stream t))
toTitle = \t ->
if null t then empty
else toTitleNonEmpty t
{-# INLINE toTitle #-}

-- | /O(n)/ Left-justify a string to the given length, using the
Expand Down
152 changes: 126 additions & 26 deletions src/Data/Text/Internal/Transformation.hs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
{-# LANGUAGE BangPatterns, CPP, MagicHash #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE PartialTypeSignatures #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE UnboxedTuples #-}
{-# LANGUAGE UnliftedFFITypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE PartialTypeSignatures #-}

{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# OPTIONS_GHC -Wno-partial-type-signatures #-}
Expand All @@ -25,24 +28,26 @@ module Data.Text.Internal.Transformation
, toCaseFoldNonEmpty
, toLowerNonEmpty
, toUpperNonEmpty
, toTitleNonEmpty
, filter_
) where

import Prelude (Char, Bool(..), Int,

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘/=, const, fromIntegral’
Ord(..),
Monad(..), pure,
(+), (-), ($),
not, return, otherwise)
(+), (-), ($), (&&), (||), (==),
not, return, otherwise, fromIntegral, (/=), const)
import Data.Bits ((.&.), shiftR, shiftL)
import Data.Char (isLetter, isSpace, ord)

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant
import Control.Monad.ST (ST, runST)
import qualified Data.Text.Array as A
import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader, chr2, chr3, chr4)
import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping)
import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping, titleMapping)
import Data.Text.Internal (Text(..), safe)
import Data.Text.Internal.Unsafe.Char (unsafeWrite, unsafeChr8)
import qualified Prelude as P
import Data.Text.Unsafe (Iter(..), iterArray)
import Data.Word (Word8)
import Data.Word (Word8, Word)

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant
import qualified GHC.Exts as Exts
import GHC.Int (Int64(..))

Expand Down Expand Up @@ -113,7 +118,7 @@ caseConvert ascii remap (Text src o l) = runST $ do
A.unsafeWrite dst dstOff m0
A.unsafeWrite dst (dstOff + 1) m1
pure $ dstOff + 2
i -> writeMapping i dstOff
i -> writeMapping dst i dstOff
inner (srcOff + 2) dstOff'
3 -> do
let !(Exts.C# c) = chr3 m0 m1 m2
Expand All @@ -123,7 +128,7 @@ caseConvert ascii remap (Text src o l) = runST $ do
A.unsafeWrite dst (dstOff + 1) m1
A.unsafeWrite dst (dstOff + 2) m2
pure $ dstOff + 3
i -> writeMapping i dstOff
i -> writeMapping dst i dstOff
inner (srcOff + 3) dstOff'
_ -> do
let !(Exts.C# c) = chr4 m0 m1 m2 m3
Expand All @@ -134,45 +139,140 @@ caseConvert ascii remap (Text src o l) = runST $ do
A.unsafeWrite dst (dstOff + 2) m2
A.unsafeWrite dst (dstOff + 3) m3
pure $ dstOff + 4
i -> writeMapping i dstOff
i -> writeMapping dst i dstOff
inner (srcOff + 4) dstOff'

writeMapping :: Int64 -> Int -> ST s Int
writeMapping 0 dstOff = pure dstOff
writeMapping i dstOff = do
let (ch, j) = chopOffChar i
d <- unsafeWrite dst dstOff ch
writeMapping j (dstOff + d)

chopOffChar :: Int64 -> (Char, Int64)
chopOffChar ab = (chr a, ab `shiftR` 21)
where
chr (Exts.I# n) = Exts.C# (Exts.chr# n)
mask = (1 `shiftL` 21) - 1
a = P.fromIntegral $ ab .&. mask
{-# INLINE caseConvert #-}

writeMapping :: A.MArray s -> Int64 -> Int -> ST s Int
writeMapping !_ 0 !dstOff = pure dstOff
writeMapping dst i dstOff = do
let (ch, j) = chopOffChar i
d <- unsafeWrite dst dstOff ch
writeMapping dst j (dstOff + d)

chopOffChar :: Int64 -> (Char, Int64)
chopOffChar ab = (chr a, ab `shiftR` 21)
where
chr (Exts.I# n) = Exts.C# (Exts.chr# n)
mask = (1 `shiftL` 21) - 1
a = P.fromIntegral $ ab .&. mask

-- | /O(n)/ Convert a string to folded case.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
toCaseFoldNonEmpty :: Text -> Text
toCaseFoldNonEmpty = \xs -> caseConvert (\w -> if w - 65 <= 25 then w + 32 else w) foldMapping xs
toCaseFoldNonEmpty = \xs -> caseConvert asciiToLower foldMapping xs
{-# INLINE toCaseFoldNonEmpty #-}

-- | /O(n)/ Convert a string to lower case, using simple case
-- conversion.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
toLowerNonEmpty :: Text -> Text
toLowerNonEmpty = \xs -> caseConvert (\w -> if w - 65 <= 25 then w + 32 else w) lowerMapping xs
toLowerNonEmpty = \xs -> caseConvert asciiToLower lowerMapping xs
{-# INLINE toLowerNonEmpty #-}

-- | /O(n)/ Convert a string to upper case, using simple case
-- conversion.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
toUpperNonEmpty :: Text -> Text
toUpperNonEmpty = \xs -> caseConvert (\w -> if w - 97 <= 25 then w - 32 else w) upperMapping xs
toUpperNonEmpty = \xs -> caseConvert asciiToUpper upperMapping xs
{-# INLINE toUpperNonEmpty #-}

asciiToLower :: Word8 -> Word8
asciiToLower w = if w - 65 <= 25 then w + 32 else w

asciiToUpper :: Word8 -> Word8
asciiToUpper w = if w - 97 <= 25 then w - 32 else w

isAsciiLetter :: Word8 -> Bool
isAsciiLetter w = w - 65 <= 25 || w - 97 <= 25

isAsciiSpace :: Word8 -> Bool
isAsciiSpace w = w .&. 0x50 == 0 && w < 0x80 && (w == 0x20 || w - 0x09 < 5)

-- | /O(n)/ Convert a string to title case, see 'Data.Text.toTitle' for discussion.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
toTitleNonEmpty :: Text -> Text
toTitleNonEmpty (Text src o l) = runST $ do
-- Case conversion a single code point may produce up to 3 code-points,
-- each up to 4 bytes, so 12 in total.
dst <- A.new (l + 12)
outer dst l o 0 False
where
outer :: forall s. A.MArray s -> Int -> Int -> Int -> Bool -> ST s Text
outer !dst !dstLen = inner
where
inner !srcOff !dstOff !mode
| srcOff >= o + l = do
A.shrinkM dst dstOff
arr <- A.unsafeFreeze dst
return (Text arr 0 dstOff)
| dstOff + 12 > dstLen = do
-- Ensure to extend the buffer by at least 12 bytes.
let !dstLen' = dstLen + max 12 (l + o - srcOff)
dst' <- A.resizeM dst dstLen'
outer dst' dstLen' srcOff dstOff mode
-- If a character is to remain unchanged, no need to decode Char back into UTF8,
-- just copy bytes from input.
| otherwise = do
let m0 = A.unsafeIndex src srcOff
m1 = A.unsafeIndex src (srcOff + 1)
m2 = A.unsafeIndex src (srcOff + 2)
m3 = A.unsafeIndex src (srcOff + 3)
!d = utf8LengthByLeader m0

case d of
1 -> do
let (mode', m0') = asciiAdvance mode m0
A.unsafeWrite dst dstOff m0'
inner (srcOff + 1) (dstOff + 1) mode'
2 -> do
let !(Exts.C# c) = chr2 m0 m1
!(# mode', c' #) = advance (\_ -> m0 == 0xC2 && m1 == 0xA0) mode c
dstOff' <- case I64# c' of
0 -> do
A.unsafeWrite dst dstOff m0
A.unsafeWrite dst (dstOff + 1) m1
pure $ dstOff + 2
i -> writeMapping dst i dstOff
inner (srcOff + 2) dstOff' mode'
3 -> do
let !(Exts.C# c) = chr3 m0 m1 m2
isSpace3 ch
= m0 == 0xE1 && m1 == 0x9A && m2 == 0x80
|| m0 == 0xE2 && (m1 == 0x80 && isSpace (Exts.C# ch) || m1 == 0x81 && m2 == 0x9F)
|| m0 == 0xE3 && m1 == 0x80 && m2 == 0x80
!(# mode', c' #) = advance isSpace3 mode c
dstOff' <- case I64# c' of
0 -> do
A.unsafeWrite dst dstOff m0
A.unsafeWrite dst (dstOff + 1) m1
A.unsafeWrite dst (dstOff + 2) m2
pure $ dstOff + 3
i -> writeMapping dst i dstOff
inner (srcOff + 3) dstOff' mode'
_ -> do
let !(Exts.C# c) = chr4 m0 m1 m2 m3
!(# mode', c' #) = advance (\_ -> False) mode c
dstOff' <- case I64# c' of
0 -> do
A.unsafeWrite dst dstOff m0
A.unsafeWrite dst (dstOff + 1) m1
A.unsafeWrite dst (dstOff + 2) m2
A.unsafeWrite dst (dstOff + 3) m3
pure $ dstOff + 4
i -> writeMapping dst i dstOff
inner (srcOff + 4) dstOff' mode'

asciiAdvance :: Bool -> Word8 -> (Bool, Word8)
asciiAdvance False w = (isAsciiLetter w, asciiToUpper w)
asciiAdvance True w = (not (isAsciiSpace w), asciiToLower w)

advance :: (Exts.Char# -> Bool) -> Bool -> Exts.Char# -> (# Bool, _ {- unboxed Int64 -} #)
advance _ False c = (# isLetter (Exts.C# c), titleMapping c #)
advance isSpaceChar True c = (# not (isSpaceChar c), lowerMapping c #)
{-# INLINE advance #-}

-- | /O(n)/ 'filter_', applied to a continuation, a predicate and a @Text@,
-- calls the continuation with the @Text@ containing only the characters satisfying the predicate.
filter_ :: forall a. (A.Array -> Int -> Int -> a) -> (Char -> Bool) -> Text -> a
Expand Down

0 comments on commit e954933

Please sign in to comment.