Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Data.Text.unpack and Data.Text.toTitle directly, without streaming #611

Merged
merged 5 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions benchmarks/haskell/Benchmarks/Pure.hs
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,10 @@ benchmark kind ~Env{..} =
[ benchT $ nf T.toUpper ta
, benchTL $ nf TL.toUpper tla
]
, bgroup "toTitle"
[ benchT $ nf T.toTitle ta
, benchTL $ nf TL.toTitle tla
]
, bgroup "uncons"
[ benchT $ nf T.uncons ta
, benchTL $ nf TL.uncons tla
Expand Down Expand Up @@ -269,6 +273,10 @@ benchmark kind ~Env{..} =
[ benchT $ nf (T.length . T.toUpper) ta
, benchTL $ nf (TL.length . TL.toUpper) tla
]
, bgroup "toTitle"
[ benchT $ nf (T.length . T.toTitle) ta
, benchTL $ nf (TL.length . TL.toTitle) tla
]
, bgroup "words"
[ benchT $ nf (L.length . T.words) ta
, benchTL $ nf (L.length . TL.words) tla
Expand Down
6 changes: 4 additions & 2 deletions src/Data/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ import qualified Prelude as P
import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter,
reverseIter_, unsafeHead, unsafeTail, iterArray, reverseIterArray)
import Data.Text.Internal.Search (indices)
import Data.Text.Internal.Transformation (mapNonEmpty, toCaseFoldNonEmpty, toLowerNonEmpty, toUpperNonEmpty, filter_)
import Data.Text.Internal.Transformation (mapNonEmpty, toCaseFoldNonEmpty, toLowerNonEmpty, toUpperNonEmpty, toTitleNonEmpty, filter_)
#if defined(__HADDOCK__)
import Data.ByteString (ByteString)
import qualified Data.Text.Lazy as L
Expand Down Expand Up @@ -900,7 +900,9 @@ toUpper = \t ->
--
-- @since 1.0.0.0
toTitle :: Text -> Text
toTitle t = unstream (S.toTitle (stream t))
toTitle = \t ->
if null t then empty
else toTitleNonEmpty t
{-# INLINE toTitle #-}

-- | /O(n)/ Left-justify a string to the given length, using the
Expand Down
152 changes: 126 additions & 26 deletions src/Data/Text/Internal/Transformation.hs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
{-# LANGUAGE BangPatterns, CPP, MagicHash #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE PartialTypeSignatures #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE UnboxedTuples #-}
{-# LANGUAGE UnliftedFFITypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE PartialTypeSignatures #-}

{-# OPTIONS_GHC -fno-warn-orphans #-}
{-# OPTIONS_GHC -Wno-partial-type-signatures #-}
Expand All @@ -25,24 +28,26 @@
, toCaseFoldNonEmpty
, toLowerNonEmpty
, toUpperNonEmpty
, toTitleNonEmpty
, filter_
) where

import Prelude (Char, Bool(..), Int,

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

The import of ‘/=, const, fromIntegral’

Check warning on line 35 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘/=, const, fromIntegral’
Ord(..),
Monad(..), pure,
(+), (-), ($),
not, return, otherwise)
(+), (-), ($), (&&), (||), (==),
not, return, otherwise, fromIntegral, (/=), const)
import Data.Bits ((.&.), shiftR, shiftL)
import Data.Char (isLetter, isSpace, ord)

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant

Check warning on line 41 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘ord’ from module ‘Data.Char’ is redundant
import Control.Monad.ST (ST, runST)
import qualified Data.Text.Array as A
import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader, chr2, chr3, chr4)
import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping)
import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping, titleMapping)
import Data.Text.Internal (Text(..), safe)
import Data.Text.Internal.Unsafe.Char (unsafeWrite, unsafeChr8)
import qualified Prelude as P
import Data.Text.Unsafe (Iter(..), iterArray)
import Data.Word (Word8)
import Data.Word (Word8, Word)

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (ubuntu-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant

Check warning on line 50 in src/Data/Text/Internal/Transformation.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

The import of ‘Word’ from module ‘Data.Word’ is redundant
import qualified GHC.Exts as Exts
import GHC.Int (Int64(..))

Expand Down Expand Up @@ -113,7 +118,7 @@
A.unsafeWrite dst dstOff m0
A.unsafeWrite dst (dstOff + 1) m1
pure $ dstOff + 2
i -> writeMapping i dstOff
i -> writeMapping dst i dstOff
inner (srcOff + 2) dstOff'
3 -> do
let !(Exts.C# c) = chr3 m0 m1 m2
Expand All @@ -123,7 +128,7 @@
A.unsafeWrite dst (dstOff + 1) m1
A.unsafeWrite dst (dstOff + 2) m2
pure $ dstOff + 3
i -> writeMapping i dstOff
i -> writeMapping dst i dstOff
inner (srcOff + 3) dstOff'
_ -> do
let !(Exts.C# c) = chr4 m0 m1 m2 m3
Expand All @@ -134,45 +139,140 @@
A.unsafeWrite dst (dstOff + 2) m2
A.unsafeWrite dst (dstOff + 3) m3
pure $ dstOff + 4
i -> writeMapping i dstOff
i -> writeMapping dst i dstOff
inner (srcOff + 4) dstOff'

writeMapping :: Int64 -> Int -> ST s Int
writeMapping 0 dstOff = pure dstOff
writeMapping i dstOff = do
let (ch, j) = chopOffChar i
d <- unsafeWrite dst dstOff ch
writeMapping j (dstOff + d)

chopOffChar :: Int64 -> (Char, Int64)
chopOffChar ab = (chr a, ab `shiftR` 21)
where
chr (Exts.I# n) = Exts.C# (Exts.chr# n)
mask = (1 `shiftL` 21) - 1
a = P.fromIntegral $ ab .&. mask
{-# INLINE caseConvert #-}

writeMapping :: A.MArray s -> Int64 -> Int -> ST s Int
writeMapping !_ 0 !dstOff = pure dstOff
writeMapping dst i dstOff = do
let (ch, j) = chopOffChar i
d <- unsafeWrite dst dstOff ch
writeMapping dst j (dstOff + d)

chopOffChar :: Int64 -> (Char, Int64)
chopOffChar ab = (chr a, ab `shiftR` 21)
where
chr (Exts.I# n) = Exts.C# (Exts.chr# n)
mask = (1 `shiftL` 21) - 1
a = P.fromIntegral $ ab .&. mask

-- | /O(n)/ Convert a string to folded case.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
toCaseFoldNonEmpty :: Text -> Text
toCaseFoldNonEmpty = \xs -> caseConvert (\w -> if w - 65 <= 25 then w + 32 else w) foldMapping xs
toCaseFoldNonEmpty = \xs -> caseConvert asciiToLower foldMapping xs
{-# INLINE toCaseFoldNonEmpty #-}

-- | /O(n)/ Convert a string to lower case, using simple case
-- conversion.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
toLowerNonEmpty :: Text -> Text
toLowerNonEmpty = \xs -> caseConvert (\w -> if w - 65 <= 25 then w + 32 else w) lowerMapping xs
toLowerNonEmpty = \xs -> caseConvert asciiToLower lowerMapping xs
{-# INLINE toLowerNonEmpty #-}

-- | /O(n)/ Convert a string to upper case, using simple case
-- conversion.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
toUpperNonEmpty :: Text -> Text
toUpperNonEmpty = \xs -> caseConvert (\w -> if w - 97 <= 25 then w - 32 else w) upperMapping xs
toUpperNonEmpty = \xs -> caseConvert asciiToUpper upperMapping xs
{-# INLINE toUpperNonEmpty #-}

asciiToLower :: Word8 -> Word8
asciiToLower w = if w - 65 <= 25 then w + 32 else w

asciiToUpper :: Word8 -> Word8
asciiToUpper w = if w - 97 <= 25 then w - 32 else w

isAsciiLetter :: Word8 -> Bool
isAsciiLetter w = w - 65 <= 25 || w - 97 <= 25

isAsciiSpace :: Word8 -> Bool
isAsciiSpace w = w .&. 0x50 == 0 && w < 0x80 && (w == 0x20 || w - 0x09 < 5)

-- | /O(n)/ Convert a string to title case, see 'Data.Text.toTitle' for discussion.
-- Assume that the @Text@ is non-empty. The returned @Text@ is guaranteed to be non-empty.
toTitleNonEmpty :: Text -> Text
toTitleNonEmpty (Text src o l) = runST $ do
-- Case conversion a single code point may produce up to 3 code-points,
-- each up to 4 bytes, so 12 in total.
dst <- A.new (l + 12)
outer dst l o 0 False
where
outer :: forall s. A.MArray s -> Int -> Int -> Int -> Bool -> ST s Text
outer !dst !dstLen = inner
where
inner !srcOff !dstOff !mode
| srcOff >= o + l = do
A.shrinkM dst dstOff
arr <- A.unsafeFreeze dst
return (Text arr 0 dstOff)
| dstOff + 12 > dstLen = do
-- Ensure to extend the buffer by at least 12 bytes.
let !dstLen' = dstLen + max 12 (l + o - srcOff)
dst' <- A.resizeM dst dstLen'
outer dst' dstLen' srcOff dstOff mode
-- If a character is to remain unchanged, no need to decode Char back into UTF8,
-- just copy bytes from input.
| otherwise = do
let m0 = A.unsafeIndex src srcOff
m1 = A.unsafeIndex src (srcOff + 1)
m2 = A.unsafeIndex src (srcOff + 2)
m3 = A.unsafeIndex src (srcOff + 3)
!d = utf8LengthByLeader m0

case d of
1 -> do
let (mode', m0') = asciiAdvance mode m0
A.unsafeWrite dst dstOff m0'
inner (srcOff + 1) (dstOff + 1) mode'
2 -> do
let !(Exts.C# c) = chr2 m0 m1
!(# mode', c' #) = advance (\_ -> m0 == 0xC2 && m1 == 0xA0) mode c
dstOff' <- case I64# c' of
0 -> do
A.unsafeWrite dst dstOff m0
A.unsafeWrite dst (dstOff + 1) m1
pure $ dstOff + 2
i -> writeMapping dst i dstOff
inner (srcOff + 2) dstOff' mode'
3 -> do
let !(Exts.C# c) = chr3 m0 m1 m2
isSpace3 ch
= m0 == 0xE1 && m1 == 0x9A && m2 == 0x80
|| m0 == 0xE2 && (m1 == 0x80 && isSpace (Exts.C# ch) || m1 == 0x81 && m2 == 0x9F)
|| m0 == 0xE3 && m1 == 0x80 && m2 == 0x80
!(# mode', c' #) = advance isSpace3 mode c
dstOff' <- case I64# c' of
0 -> do
A.unsafeWrite dst dstOff m0
A.unsafeWrite dst (dstOff + 1) m1
A.unsafeWrite dst (dstOff + 2) m2
pure $ dstOff + 3
i -> writeMapping dst i dstOff
inner (srcOff + 3) dstOff' mode'
_ -> do
let !(Exts.C# c) = chr4 m0 m1 m2 m3
!(# mode', c' #) = advance (\_ -> False) mode c
dstOff' <- case I64# c' of
0 -> do
A.unsafeWrite dst dstOff m0
A.unsafeWrite dst (dstOff + 1) m1
A.unsafeWrite dst (dstOff + 2) m2
A.unsafeWrite dst (dstOff + 3) m3
pure $ dstOff + 4
i -> writeMapping dst i dstOff
inner (srcOff + 4) dstOff' mode'

asciiAdvance :: Bool -> Word8 -> (Bool, Word8)
asciiAdvance False w = (isAsciiLetter w, asciiToUpper w)
asciiAdvance True w = (not (isAsciiSpace w), asciiToLower w)

advance :: (Exts.Char# -> Bool) -> Bool -> Exts.Char# -> (# Bool, _ {- unboxed Int64 -} #)
advance _ False c = (# isLetter (Exts.C# c), titleMapping c #)
advance isSpaceChar True c = (# not (isSpaceChar c), lowerMapping c #)
{-# INLINE advance #-}

-- | /O(n)/ 'filter_', applied to a continuation, a predicate and a @Text@,
-- calls the continuation with the @Text@ containing only the characters satisfying the predicate.
filter_ :: forall a. (A.Array -> Int -> Int -> a) -> (Char -> Bool) -> Text -> a
Expand Down
15 changes: 10 additions & 5 deletions src/Data/Text/Show.hs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
{-# LANGUAGE CPP, MagicHash #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CApiFFI #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE ViewPatterns #-}

{-# OPTIONS_GHC -fno-warn-orphans #-}
Expand All @@ -26,12 +28,11 @@ module Data.Text.Show
import Control.Monad.ST (ST, runST)
import Data.Text.Internal (Text(..), empty, safe, pack)
import Data.Text.Internal.Encoding.Utf8 (utf8Length)
import Data.Text.Internal.Fusion (stream)
import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import Data.Text.Unsafe (Iter(..), iterArray)
import GHC.Exts (Ptr(..), Int(..), Addr#, indexWord8OffAddr#)
import GHC.Word (Word8(..))
import qualified Data.Text.Array as A
import qualified Data.Text.Internal.Fusion.Common as S
#if !MIN_VERSION_ghc_prim(0,7,0)
import Foreign.C.String (CString)
import Foreign.C.Types (CSize(..))
Expand All @@ -52,7 +53,11 @@ unpack ::
HasCallStack =>
#endif
Text -> String
unpack = S.unstreamList . stream
unpack (Text arr off len) = go off
where
go !i
| i >= off + len = []
| otherwise = let !(Iter c l) = iterArray arr i in c : go (i + l)
{-# INLINE [1] unpack #-}

-- | /O(n)/ Convert a null-terminated
Expand Down
6 changes: 5 additions & 1 deletion src/Data/Text/Unsafe.hs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,11 @@ iter (Text arr off _len) i = iterArray arr (off + i)
{-# INLINE iter #-}

-- | @since 2.0
iterArray :: A.Array -> Int -> Iter
iterArray ::
#if defined(ASSERTS)
HasCallStack =>
#endif
A.Array -> Int -> Iter
iterArray arr j = Iter chr l
where m0 = A.unsafeIndex arr j
m1 = A.unsafeIndex arr (j+1)
Expand Down
1 change: 1 addition & 0 deletions tests/Tests/Properties/Basics.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

{-# OPTIONS_GHC -Wno-missing-signatures #-}
{-# OPTIONS_GHC -Wno-warnings-deprecations #-}
{-# OPTIONS_GHC -Wno-unrecognised-warning-flags #-}
{-# OPTIONS_GHC -Wno-x-partial #-}

module Tests.Properties.Basics
Expand Down
2 changes: 2 additions & 0 deletions tests/Tests/ShareEmpty.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE BangPatterns #-}

{-# OPTIONS_GHC -Wno-unrecognised-warning-flags #-}
{-# OPTIONS_GHC -Wno-x-partial #-}

module Tests.ShareEmpty
Expand Down
Loading