diff --git a/benchmark/Streamly/Benchmark/Data/Stream/Split.hs b/benchmark/Streamly/Benchmark/Data/Stream/Split.hs index 45a6a5102e..2180b450c2 100644 --- a/benchmark/Streamly/Benchmark/Data/Stream/Split.hs +++ b/benchmark/Streamly/Benchmark/Data/Stream/Split.hs @@ -61,7 +61,7 @@ toarr = Array.fromList . map (fromIntegral . ord) splitOn :: Handle -> IO Int splitOn inh = (Stream.fold Fold.length - $ Stream.splitOn (== lf) Fold.drain + $ Stream.splitSepBy_ (== lf) Fold.drain $ Handle.read inh) -- >>= print #ifdef INSPECTION diff --git a/core/src/Streamly/Internal/Data/Stream/Nesting.hs b/core/src/Streamly/Internal/Data/Stream/Nesting.hs index 013ec07d41..ce15394233 100644 --- a/core/src/Streamly/Internal/Data/Stream/Nesting.hs +++ b/core/src/Streamly/Internal/Data/Stream/Nesting.hs @@ -2758,12 +2758,15 @@ data SplitOnSeqState mba rb rh ck w fs s b x = | SplitOnSeqEmpty !fs s + | SplitOnSeqSingle0 !fs s x | SplitOnSeqSingle !fs s x - | SplitOnSeqWordInit !fs s + | SplitOnSeqWordInit0 !fs s + | SplitOnSeqWordInit Int Word !fs s | SplitOnSeqWordLoop !w s !fs | SplitOnSeqWordDone Int !fs !w + | SplitOnSeqKRInit0 Int !fs s mba | SplitOnSeqKRInit Int !fs s mba | SplitOnSeqKRLoop fs s mba !rh !ck | SplitOnSeqKRCheck fs s mba !rh @@ -2773,14 +2776,36 @@ data SplitOnSeqState mba rb rh ck w fs s b x = -- XXX Need to fix empty stream split behavior --- | Like 'splitOn' but splits the stream on a sequence of elements rather than +-- | Like 'splitSepBy_' but splits the stream on a sequence of elements rather than -- a single element. Parses a sequence of tokens separated by an infixed -- separator e.g. @a;b;c@ is parsed as @a@, @b@, @c@. If the pattern is empty -- then each element is a match, thus the fold is finalized on each element. -- --- Equivalent to the following: +-- >>> splitSepBy p xs = Stream.fold Fold.toList $ Stream.splitSepBySeq_ (Array.fromList p) Fold.toList (Stream.fromList xs) +-- +-- >>> splitSepBy "" "" +-- [] +-- +-- >>> splitSepBy "" "a...b" +-- ["a",".",".",".","b"] +-- +-- >>> splitSepBy ".." "" +-- [] +-- +-- >>> splitSepBy ".." "a...b" +-- ["a",".b"] +-- +-- >>> splitSepBy ".." "abc" +-- ["abc"] -- --- >>> splitSepBySeq_ pat f = Stream.foldManyPost (Fold.takeEndBySeq_ pat f) +-- >>> splitSepBy ".." ".." +-- ["",""] +-- +-- >>> splitSepBy "." ".a" +-- ["","a"] +-- +-- >>> splitSepBy "." "a." +-- ["a",""] -- -- Uses Rabin-Karp algorithm for substring search. -- @@ -2846,13 +2871,13 @@ splitSepBySeq_ patArr (Fold fstep initial _ final) (Stream step state) = return $ Skip $ SplitOnSeqEmpty acc state | patLen == 1 -> do pat <- liftIO $ A.unsafeGetIndexIO 0 patArr - return $ Skip $ SplitOnSeqSingle acc state pat + return $ Skip $ SplitOnSeqSingle0 acc state pat | SIZE_OF(a) * patLen <= sizeOf (Proxy :: Proxy Word) -> - return $ Skip $ SplitOnSeqWordInit acc state + return $ Skip $ SplitOnSeqWordInit0 acc state | otherwise -> do (MutArray mba _ _ _) :: MutArray a <- liftIO $ MutArray.emptyOf patLen - skip $ SplitOnSeqKRInit 0 acc state mba + skip $ SplitOnSeqKRInit0 0 acc state mba FL.Done b -> skip $ SplitOnSeqYield b SplitOnSeqInit stepOuter _ (SplitOnSeqYield x next) = return $ Yield x next @@ -2892,6 +2917,23 @@ splitSepBySeq_ patArr (Fold fstep initial _ final) (Stream step state) = -- Single Pattern ----------------- + stepOuter gst (SplitOnSeqSingle0 fs st pat) = do + res <- step (adaptState gst) st + case res of + Yield x s -> do + -- XXX This code block is duplicated in SplitOnSeqSingle state + let jump c = SplitOnSeqSingle c s pat + if pat == x + then final fs >>= yieldReinit jump + else do + r <- fstep fs x + case r of + FL.Partial fs1 -> + pure $ Skip $ SplitOnSeqSingle fs1 s pat + FL.Done b -> yieldReinit jump b + Skip s -> pure $ Skip $ SplitOnSeqSingle0 fs s pat + Stop -> final fs >> pure Stop + stepOuter gst (SplitOnSeqSingle fs0 st0 pat) = do go SPEC fs0 st0 @@ -2938,8 +2980,17 @@ splitSepBySeq_ patArr (Fold fstep initial _ final) (Stream step state) = let jump c = SplitOnSeqWordDone (n - 1) c wrd yieldReinit jump b - stepOuter gst (SplitOnSeqWordInit fs st0) = - go SPEC 0 0 st0 + stepOuter gst (SplitOnSeqWordInit0 fs st) = do + res <- step (adaptState gst) st + case res of + Yield x s -> + let wrd1 = addToWord 0 x + in pure $ Skip $ SplitOnSeqWordInit 1 wrd1 fs s + Skip s -> pure $ Skip $ SplitOnSeqWordInit0 fs s + Stop -> final fs >> pure Stop + + stepOuter gst (SplitOnSeqWordInit idx0 wrd0 fs st0) = + go SPEC idx0 wrd0 st0 where @@ -2953,7 +3004,7 @@ splitSepBySeq_ patArr (Fold fstep initial _ final) (Stream step state) = then do if wrd1 .&. wordMask == wordPat then do - let jump c = SplitOnSeqWordInit c s + let jump c = SplitOnSeqWordInit 0 0 c s final fs >>= yieldReinit jump else skip $ SplitOnSeqWordLoop wrd1 s fs else go SPEC (idx + 1) wrd1 s @@ -2977,7 +3028,7 @@ splitSepBySeq_ patArr (Fold fstep initial _ final) (Stream step state) = res <- step (adaptState gst) st case res of Yield x s -> do - let jump c = SplitOnSeqWordInit c s + let jump c = SplitOnSeqWordInit 0 0 c s wrd1 = addToWord wrd x old = (wordMask .&. wrd) `shiftR` (elemBits * (patLen - 1)) @@ -3000,6 +3051,15 @@ splitSepBySeq_ patArr (Fold fstep initial _ final) (Stream step state) = -- manipulated locally e.g. we are passing only mba, here and build an -- array using patLen and arrStart from the surrounding context. + stepOuter gst (SplitOnSeqKRInit0 offset fs st mba) = do + res <- step (adaptState gst) st + case res of + Yield x s -> do + liftIO $ pokeAt offset mba x + skip $ SplitOnSeqKRInit (offset + SIZE_OF(a)) fs s mba + Skip s -> skip $ SplitOnSeqKRInit0 offset fs s mba + Stop -> final fs >> pure Stop + stepOuter gst (SplitOnSeqKRInit offset fs st mba) = do res <- step (adaptState gst) st case res of @@ -3517,6 +3577,37 @@ splitOnSuffixSeq withSep patArr (Fold fstep initial _ final) (Stream step state) -- -- >>> splitEndBySeq pat f = Stream.foldMany (Fold.takeEndBySeq pat f) -- +-- Usage: +-- +-- >>> f p = Stream.splitEndBySeq (Array.fromList p) Fold.toList +-- >>> splitEndBy p xs = Stream.fold Fold.toList $ f p (Stream.fromList xs) +-- +-- >>> splitEndBy "" "" +-- [] +-- +-- >>> splitEndBy "" "a...b" +-- ["a",".",".",".","b"] +-- +-- >>> splitEndBy ".." "" +-- [] +-- +-- +-- >>> splitEndBy ".." "a...b" +-- ["a..",".b"] +-- +-- +-- >>> splitEndBy ".." "abc" +-- ["abc"] +-- +-- >>> splitEndBy ".." ".." +-- [".."] +-- +-- >>> splitEndBy "." ".a" +-- [".","a"] +-- +-- >>> splitEndBy "." "a." +-- ["a."] +-- -- Uses Rabin-Karp algorithm for substring search. -- {-# INLINE_NORMAL splitEndBySeq #-} @@ -3534,6 +3625,35 @@ splitEndBySeq = splitOnSuffixSeq True -- -- >>> splitEndBySeq_ pat f = Stream.foldMany (Fold.takeEndBySeq_ pat f) -- +-- Usage: +-- +-- >>> f p = Stream.splitEndBySeq_ (Array.fromList p) Fold.toList +-- >>> splitEndBy_ p xs = Stream.fold Fold.toList $ f p (Stream.fromList xs) +-- +-- >>> splitEndBy_ "" "" +-- [] +-- +-- >>> splitEndBy_ "" "a...b" +-- ["a",".",".",".","b"] +-- +-- >>> splitEndBy_ ".." "" +-- [] +-- +-- >>> splitEndBy_ ".." "a...b" +-- ["a",".b"] +-- +-- >>> splitEndBy_ ".." "abc" +-- ["abc"] +-- +-- >>> splitEndBy_ ".." ".." +-- [""] +-- +-- >>> splitEndBy_ "." ".a" +-- ["","a"] +-- +-- >>> splitEndBy_ "." "a." +-- ["a"] +-- -- Uses Rabin-Karp algorithm for substring search. -- {-# INLINE_NORMAL splitEndBySeq_ #-} diff --git a/core/src/Streamly/Internal/Data/Stream/Transform.hs b/core/src/Streamly/Internal/Data/Stream/Transform.hs index e7e0586c05..85b0ac59fb 100644 --- a/core/src/Streamly/Internal/Data/Stream/Transform.hs +++ b/core/src/Streamly/Internal/Data/Stream/Transform.hs @@ -2153,7 +2153,7 @@ catEithers = fmap (either id id) data SplitSepBy s fs b a = SplitSepByInit s | SplitSepByInitFold0 s - | SplitSepByInitFold1 s a + | SplitSepByInitFold1 s fs | SplitSepByCheck s a fs | SplitSepByNext s fs | SplitSepByYield b (SplitSepBy s fs b a) @@ -2224,29 +2224,38 @@ splitSepBy_ predicate (Fold fstep initial _ final) (Stream step1 state1) = where + -- Note: there is a question of whether we should initialize the fold + -- before we run the stream or only after the stream yields an element. If + -- we initialize it before then we may have to discard an effect if the + -- stream does not yield anything. If we initialize it after then we may + -- have to discard the stream element if the fold terminates without + -- consuming anything. Though the state machine is simpler if we initialize + -- the fold first. Also, in most common cases the fold is not effectful. + -- On the other hand, in most cases the fold will not terminate without + -- consuming anything. So both ways are similar. {-# INLINE_LATE step #-} - step gst (SplitSepByInit st) = do - r <- step1 (adaptState gst) st - case r of - Yield x s -> return $ Skip $ SplitSepByInitFold1 s x - Skip s -> return $ Skip (SplitSepByInit s) - Stop -> return Stop - - step _ (SplitSepByInitFold0 st) = do + step _ (SplitSepByInit st) = do fres <- initial return $ Skip $ case fres of - FL.Done b -> SplitSepByYield b (SplitSepByInitFold0 st) - FL.Partial fs -> SplitSepByNext st fs + FL.Done b -> SplitSepByYield b (SplitSepByInit st) + FL.Partial fs -> SplitSepByInitFold1 st fs - step _ (SplitSepByInitFold1 st x) = do + step _ (SplitSepByInitFold0 st) = do fres <- initial return $ Skip $ case fres of - FL.Done b -> SplitSepByYield b (SplitSepByInitFold1 st x) - FL.Partial fs -> SplitSepByCheck st x fs + FL.Done b -> SplitSepByYield b (SplitSepByInitFold0 st) + FL.Partial fs -> SplitSepByNext st fs + + step gst (SplitSepByInitFold1 st fs) = do + r <- step1 (adaptState gst) st + case r of + Yield x s -> return $ Skip $ SplitSepByCheck s x fs + Skip s -> return $ Skip (SplitSepByInitFold1 s fs) + Stop -> final fs >> return Stop step _ (SplitSepByCheck st x fs) = do if predicate x diff --git a/src/Streamly/Internal/Data/Stream/IsStream/Common.hs b/src/Streamly/Internal/Data/Stream/IsStream/Common.hs index 1752820499..852ffd2afa 100644 --- a/src/Streamly/Internal/Data/Stream/IsStream/Common.hs +++ b/src/Streamly/Internal/Data/Stream/IsStream/Common.hs @@ -662,7 +662,7 @@ foldManyPost f m = fromStreamD $ D.foldManyPost f (toStreamD m) -- ["h","e","l","l","o"] -- -- >>> splitOnSeq' "hello" "" --- [""] +-- [] -- -- >>> splitOnSeq' "hello" "hello" -- ["",""] diff --git a/test/Streamly/Test/Data/Stream.hs b/test/Streamly/Test/Data/Stream.hs index c19338c565..557f075461 100644 --- a/test/Streamly/Test/Data/Stream.hs +++ b/test/Streamly/Test/Data/Stream.hs @@ -103,8 +103,12 @@ splitOnSeq :: splitOnSeq op = do describe "Tests for splitOnSeq" $ do -- Empty pattern case + it "splitOnSeq_ \"\" \"\" = []" + $ splitOnSeq_ "" "" `shouldReturn` [] -- Single element pattern cases + it "splitOnSeq_ \"x\" \"\" = []" + $ splitOnSeq_ "x" "" `shouldReturn` [] it "splitOnSeq_ \"x\" \"hello\" = [\"hello\"]" $ splitOnSeq_ "x" "hello" `shouldReturn` ["hello"] it "splitOnSeq_ \"h\" \"hello\" = [\"\", \"ello\"]" @@ -117,6 +121,8 @@ splitOnSeq op = do $ splitOnSeq_ "o" "hello" `shouldReturn` ["hell", ""] -- multi-element pattern fitting in a Word + it "splitOnSeq_ \"he\" \"\" = []" + $ splitOnSeq_ "he" "" `shouldReturn` [] it "splitOnSeq_ \"he\" \"hello\" = [\"\", \"llo\"]" $ splitOnSeq_ "he" "hello" `shouldReturn` ["", "llo"] it "splitOnSeq_ \"ll\" \"hello\" = [\"he\", \"o\"]" @@ -125,8 +131,8 @@ splitOnSeq op = do $ splitOnSeq_ "lo" "hello" `shouldReturn` ["hel", ""] -- multi-element pattern - Rabin-Karp cases - it "splitOnSeq_ \"hello\" \"\" = [\"\"]" - $ splitOnSeq_ "hello" "" `shouldReturn` [""] + it "splitOnSeq_ \"hello\" \"\" = []" + $ splitOnSeq_ "hello" "" `shouldReturn` [] it "splitOnSeq_ \"hel\" \"hello\" = [\"\", \"lo\"]" $ splitOnSeq_ "hel" "hello" `shouldReturn` ["", "lo"] it "splitOnSeq_ \"ell\" \"hello\" = [\"h\", \"o\"]" @@ -403,7 +409,12 @@ intercalateSplitOnId x desc = groupSplitOps :: String -> Spec groupSplitOps desc = do -- splitting - splitOnSeq splitOnSeqFold + + -- The foldManyPost implementation on an empty stream produces a single + -- value. The behaviour of foldManyPost implementation and the direct stream + -- implementation is not different. + -- splitOnSeq splitOnSeqFold + splitOnSeq splitOnSeqStream splitOnSuffixSeq splitOnSuffixSeqFold @@ -411,17 +422,80 @@ groupSplitOps desc = do splitOnSuffixSeq splitOnSuffixSeqStream -- Some ad-hoc tests it "splitEndBySeq word hash cases" $ do - let f input result = + let f sep input result = + Stream.toList + ( Stream.splitEndBySeq (Array.fromList sep) Fold.toList + $ Stream.fromList input + ) `shouldReturn` result + + f "ab" "a" ["a"] + f "ab" "ab" ["ab"] + f "ab" "aba" ["ab","a"] + f "ab" "abab" ["ab","ab"] + f "ab" "abc" ["ab","c"] + f "ab" "xab" ["xab"] + f "" "" [] + f "." "" [] + f ".." "" [] + f "..." "" [] + f "" "a...b" ["a",".",".",".","b"] + f "." "a...b" ["a.",".",".","b"] + f ".." "a...b" ["a..",".b"] + f "..." "a...b" ["a...","b"] + f "." "abc" ["abc"] + f ".." "abc" ["abc"] + f "..." "abc" ["abc"] + f "." "." ["."] + f ".." ".." [".."] + f "..." "..." ["..."] + f "." ".a" [".","a"] + f "." "a." ["a."] + + it "splitEndBySeq_ word hash cases" $ do + let f sep input result = + Stream.toList + ( Stream.splitEndBySeq_ (Array.fromList sep) Fold.toList + $ Stream.fromList input + ) `shouldReturn` result + f "" "" [] + f "." "" [] + f ".." "" [] + f "..." "" [] + f "" "a...b" ["a",".",".",".","b"] + f "." "a...b" ["a","","","b"] + f ".." "a...b" ["a",".b"] + f "..." "a...b" ["a","b"] + f "." "abc" ["abc"] + f ".." "abc" ["abc"] + f "..." "abc" ["abc"] + f "." "." [""] + f ".." ".." [""] + f "..." "..." [""] + f "." ".a" ["","a"] + f "." "a." ["a"] + + it "splitSepBySeq_ word hash cases" $ do + let f sep input result = Stream.toList - ( Stream.splitEndBySeq (Array.fromList "ab") Fold.toList + ( Stream.splitSepBySeq_ (Array.fromList sep) Fold.toList $ Stream.fromList input ) `shouldReturn` result - f "a" ["a"] - f "ab" ["ab"] - f "aba" ["ab","a"] - f "abab" ["ab","ab"] - f "abc" ["ab","c"] - f "xab" ["xab"] + f "" "" [] + f "." "" [] + f ".." "" [] + f "..." "" [] + f "" "a...b" ["a",".",".",".","b"] + f "." "a...b" ["a","","","b"] + f ".." "a...b" ["a",".b"] + f "..." "a...b" ["a","b"] + f "." "abc" ["abc"] + f ".." "abc" ["abc"] + f "..." "abc" ["abc"] + f "." "." ["",""] + f ".." ".." ["",""] + f "..." "..." ["",""] + f "." ".a" ["","a"] + f "." "a." ["a",""] let takeEndBySeq pat input result = Stream.toList