Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make empty stream behavior of splitSepBySeq_ similar to splitSepBy_ #2912

Merged
merged 6 commits into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmark/Streamly/Benchmark/Data/Stream/Split.hs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ toarr = Array.fromList . map (fromIntegral . ord)
splitOn :: Handle -> IO Int
splitOn inh =
(Stream.fold Fold.length
$ Stream.splitOn (== lf) Fold.drain
$ Stream.splitSepBy_ (== lf) Fold.drain
$ Handle.read inh) -- >>= print

#ifdef INSPECTION
Expand Down
142 changes: 131 additions & 11 deletions core/src/Streamly/Internal/Data/Stream/Nesting.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2758,12 +2758,15 @@ data SplitOnSeqState mba rb rh ck w fs s b x =

| SplitOnSeqEmpty !fs s

| SplitOnSeqSingle0 !fs s x
| SplitOnSeqSingle !fs s x

| SplitOnSeqWordInit !fs s
| SplitOnSeqWordInit0 !fs s
| SplitOnSeqWordInit Int Word !fs s
| SplitOnSeqWordLoop !w s !fs
| SplitOnSeqWordDone Int !fs !w

| SplitOnSeqKRInit0 Int !fs s mba
| SplitOnSeqKRInit Int !fs s mba
| SplitOnSeqKRLoop fs s mba !rh !ck
| SplitOnSeqKRCheck fs s mba !rh
Expand All @@ -2773,14 +2776,36 @@ data SplitOnSeqState mba rb rh ck w fs s b x =

-- XXX Need to fix empty stream split behavior

-- | Like 'splitOn' but splits the stream on a sequence of elements rather than
-- | Like 'splitSepBy_' but splits the stream on a sequence of elements rather than
-- a single element. Parses a sequence of tokens separated by an infixed
-- separator e.g. @a;b;c@ is parsed as @a@, @b@, @c@. If the pattern is empty
-- then each element is a match, thus the fold is finalized on each element.
--
-- Equivalent to the following:
-- >>> splitSepBy p xs = Stream.fold Fold.toList $ Stream.splitSepBySeq_ (Array.fromList p) Fold.toList (Stream.fromList xs)
--
-- >>> splitSepBy "" ""
-- []
--
-- >>> splitSepBy "" "a...b"
-- ["a",".",".",".","b"]
--
-- >>> splitSepBy ".." ""
-- []
--
-- >>> splitSepBy ".." "a...b"
-- ["a",".b"]
--
-- >>> splitSepBy ".." "abc"
-- ["abc"]
--
-- >>> splitSepBySeq_ pat f = Stream.foldManyPost (Fold.takeEndBySeq_ pat f)
-- >>> splitSepBy ".." ".."
-- ["",""]
--
-- >>> splitSepBy "." ".a"
-- ["","a"]
--
-- >>> splitSepBy "." "a."
-- ["a",""]
--
-- Uses Rabin-Karp algorithm for substring search.
--
Expand Down Expand Up @@ -2846,13 +2871,13 @@ splitSepBySeq_ patArr (Fold fstep initial _ final) (Stream step state) =
return $ Skip $ SplitOnSeqEmpty acc state
| patLen == 1 -> do
pat <- liftIO $ A.unsafeGetIndexIO 0 patArr
return $ Skip $ SplitOnSeqSingle acc state pat
return $ Skip $ SplitOnSeqSingle0 acc state pat
| SIZE_OF(a) * patLen <= sizeOf (Proxy :: Proxy Word) ->
return $ Skip $ SplitOnSeqWordInit acc state
return $ Skip $ SplitOnSeqWordInit0 acc state
| otherwise -> do
(MutArray mba _ _ _) :: MutArray a <-
liftIO $ MutArray.emptyOf patLen
skip $ SplitOnSeqKRInit 0 acc state mba
skip $ SplitOnSeqKRInit0 0 acc state mba
FL.Done b -> skip $ SplitOnSeqYield b SplitOnSeqInit

stepOuter _ (SplitOnSeqYield x next) = return $ Yield x next
Expand Down Expand Up @@ -2892,6 +2917,23 @@ splitSepBySeq_ patArr (Fold fstep initial _ final) (Stream step state) =
-- Single Pattern
-----------------

stepOuter gst (SplitOnSeqSingle0 fs st pat) = do
res <- step (adaptState gst) st
case res of
Yield x s -> do
-- XXX This code block is duplicated in SplitOnSeqSingle state
let jump c = SplitOnSeqSingle c s pat
if pat == x
then final fs >>= yieldReinit jump
else do
r <- fstep fs x
case r of
FL.Partial fs1 ->
pure $ Skip $ SplitOnSeqSingle fs1 s pat
FL.Done b -> yieldReinit jump b
Skip s -> pure $ Skip $ SplitOnSeqSingle0 fs s pat
Stop -> final fs >> pure Stop
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See splitSepBy_ . We should have the same behavior. If we have not consumed any element we should not initialize the fold. We will be perform initialization and finalization unnecessarily.


stepOuter gst (SplitOnSeqSingle fs0 st0 pat) = do
go SPEC fs0 st0

Expand Down Expand Up @@ -2938,8 +2980,17 @@ splitSepBySeq_ patArr (Fold fstep initial _ final) (Stream step state) =
let jump c = SplitOnSeqWordDone (n - 1) c wrd
yieldReinit jump b

stepOuter gst (SplitOnSeqWordInit fs st0) =
go SPEC 0 0 st0
stepOuter gst (SplitOnSeqWordInit0 fs st) = do
res <- step (adaptState gst) st
case res of
Yield x s ->
let wrd1 = addToWord 0 x
in pure $ Skip $ SplitOnSeqWordInit 1 wrd1 fs s
Skip s -> pure $ Skip $ SplitOnSeqWordInit0 fs s
Stop -> final fs >> pure Stop

stepOuter gst (SplitOnSeqWordInit idx0 wrd0 fs st0) =
go SPEC idx0 wrd0 st0

where

Expand All @@ -2953,7 +3004,7 @@ splitSepBySeq_ patArr (Fold fstep initial _ final) (Stream step state) =
then do
if wrd1 .&. wordMask == wordPat
then do
let jump c = SplitOnSeqWordInit c s
let jump c = SplitOnSeqWordInit 0 0 c s
final fs >>= yieldReinit jump
else skip $ SplitOnSeqWordLoop wrd1 s fs
else go SPEC (idx + 1) wrd1 s
Expand All @@ -2977,7 +3028,7 @@ splitSepBySeq_ patArr (Fold fstep initial _ final) (Stream step state) =
res <- step (adaptState gst) st
case res of
Yield x s -> do
let jump c = SplitOnSeqWordInit c s
let jump c = SplitOnSeqWordInit 0 0 c s
wrd1 = addToWord wrd x
old = (wordMask .&. wrd)
`shiftR` (elemBits * (patLen - 1))
Expand All @@ -3000,6 +3051,15 @@ splitSepBySeq_ patArr (Fold fstep initial _ final) (Stream step state) =
-- manipulated locally e.g. we are passing only mba, here and build an
-- array using patLen and arrStart from the surrounding context.

stepOuter gst (SplitOnSeqKRInit0 offset fs st mba) = do
res <- step (adaptState gst) st
case res of
Yield x s -> do
liftIO $ pokeAt offset mba x
skip $ SplitOnSeqKRInit (offset + SIZE_OF(a)) fs s mba
Skip s -> skip $ SplitOnSeqKRInit0 offset fs s mba
Stop -> final fs >> pure Stop

stepOuter gst (SplitOnSeqKRInit offset fs st mba) = do
res <- step (adaptState gst) st
case res of
Expand Down Expand Up @@ -3517,6 +3577,37 @@ splitOnSuffixSeq withSep patArr (Fold fstep initial _ final) (Stream step state)
--
-- >>> splitEndBySeq pat f = Stream.foldMany (Fold.takeEndBySeq pat f)
--
-- Usage:
--
-- >>> f p = Stream.splitEndBySeq (Array.fromList p) Fold.toList
-- >>> splitEndBy p xs = Stream.fold Fold.toList $ f p (Stream.fromList xs)
--
-- >>> splitEndBy "" ""
-- []
--
-- >>> splitEndBy "" "a...b"
-- ["a",".",".",".","b"]
--
-- >>> splitEndBy ".." ""
-- []
--
--
-- >>> splitEndBy ".." "a...b"
-- ["a..",".b"]
--
--
-- >>> splitEndBy ".." "abc"
-- ["abc"]
--
-- >>> splitEndBy ".." ".."
-- [".."]
--
-- >>> splitEndBy "." ".a"
-- [".","a"]
--
-- >>> splitEndBy "." "a."
-- ["a."]
--
-- Uses Rabin-Karp algorithm for substring search.
--
{-# INLINE_NORMAL splitEndBySeq #-}
Expand All @@ -3534,6 +3625,35 @@ splitEndBySeq = splitOnSuffixSeq True
--
-- >>> splitEndBySeq_ pat f = Stream.foldMany (Fold.takeEndBySeq_ pat f)
--
-- Usage:
--
-- >>> f p = Stream.splitEndBySeq_ (Array.fromList p) Fold.toList
-- >>> splitEndBy_ p xs = Stream.fold Fold.toList $ f p (Stream.fromList xs)
--
-- >>> splitEndBy_ "" ""
-- []
--
-- >>> splitEndBy_ "" "a...b"
-- ["a",".",".",".","b"]
--
-- >>> splitEndBy_ ".." ""
-- []
--
-- >>> splitEndBy_ ".." "a...b"
-- ["a",".b"]
--
-- >>> splitEndBy_ ".." "abc"
-- ["abc"]
--
-- >>> splitEndBy_ ".." ".."
-- [""]
--
-- >>> splitEndBy_ "." ".a"
-- ["","a"]
--
-- >>> splitEndBy_ "." "a."
-- ["a"]
--
-- Uses Rabin-Karp algorithm for substring search.
--
{-# INLINE_NORMAL splitEndBySeq_ #-}
Expand Down
37 changes: 23 additions & 14 deletions core/src/Streamly/Internal/Data/Stream/Transform.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2153,7 +2153,7 @@ catEithers = fmap (either id id)
data SplitSepBy s fs b a
= SplitSepByInit s
| SplitSepByInitFold0 s
| SplitSepByInitFold1 s a
| SplitSepByInitFold1 s fs
| SplitSepByCheck s a fs
| SplitSepByNext s fs
| SplitSepByYield b (SplitSepBy s fs b a)
Expand Down Expand Up @@ -2224,29 +2224,38 @@ splitSepBy_ predicate (Fold fstep initial _ final) (Stream step1 state1) =

where

-- Note: there is a question of whether we should initialize the fold
-- before we run the stream or only after the stream yields an element. If
-- we initialize it before then we may have to discard an effect if the
-- stream does not yield anything. If we initialize it after then we may
-- have to discard the stream element if the fold terminates without
-- consuming anything. Though the state machine is simpler if we initialize
-- the fold first. Also, in most common cases the fold is not effectful.
-- On the other hand, in most cases the fold will not terminate without
-- consuming anything. So both ways are similar.
{-# INLINE_LATE step #-}
step gst (SplitSepByInit st) = do
r <- step1 (adaptState gst) st
case r of
Yield x s -> return $ Skip $ SplitSepByInitFold1 s x
Skip s -> return $ Skip (SplitSepByInit s)
Stop -> return Stop

step _ (SplitSepByInitFold0 st) = do
step _ (SplitSepByInit st) = do
fres <- initial
return
$ Skip
$ case fres of
FL.Done b -> SplitSepByYield b (SplitSepByInitFold0 st)
FL.Partial fs -> SplitSepByNext st fs
FL.Done b -> SplitSepByYield b (SplitSepByInit st)
FL.Partial fs -> SplitSepByInitFold1 st fs

step _ (SplitSepByInitFold1 st x) = do
step _ (SplitSepByInitFold0 st) = do
fres <- initial
return
$ Skip
$ case fres of
FL.Done b -> SplitSepByYield b (SplitSepByInitFold1 st x)
FL.Partial fs -> SplitSepByCheck st x fs
FL.Done b -> SplitSepByYield b (SplitSepByInitFold0 st)
FL.Partial fs -> SplitSepByNext st fs

step gst (SplitSepByInitFold1 st fs) = do
r <- step1 (adaptState gst) st
case r of
Yield x s -> return $ Skip $ SplitSepByCheck s x fs
Skip s -> return $ Skip (SplitSepByInitFold1 s fs)
Stop -> final fs >> return Stop

step _ (SplitSepByCheck st x fs) = do
if predicate x
Expand Down
2 changes: 1 addition & 1 deletion src/Streamly/Internal/Data/Stream/IsStream/Common.hs
Original file line number Diff line number Diff line change
Expand Up @@ -662,7 +662,7 @@ foldManyPost f m = fromStreamD $ D.foldManyPost f (toStreamD m)
-- ["h","e","l","l","o"]
--
-- >>> splitOnSeq' "hello" ""
-- [""]
-- []
--
-- >>> splitOnSeq' "hello" "hello"
-- ["",""]
Expand Down
Loading
Loading