Skip to content

Commit

Permalink
Poking at the performance of loop microbenchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
rrnewton committed Dec 8, 2013
1 parent d430007 commit 045826c
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 11 deletions.
8 changes: 6 additions & 2 deletions haskell/par-collections/Data/Par/Range.hs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@ instance Generator Range where
foldMP fn !inita (InclusiveRange st en _thresh) =
forAcc_ st en inita (flip fn)

{-# INLINE forM_ #-}
forM_ (InclusiveRange st en _thresh) fn = for_ st en fn

{-# INLINE forMP_ #-}
forMP_ (InclusiveRange st en _thresh) fn = for_ st en fn

-- | Enumerate the elements in a Range.
toList :: Range -> [Int]
Expand Down Expand Up @@ -327,13 +332,12 @@ for_ (start, end) fn = loop start


-- My own forM for numeric ranges (not requiring deforestation optimizations).
-- Inclusive start, exclusive end.
{-# INLINE for_ #-}
for_ :: Monad m => Int -> Int -> (Int -> m ()) -> m ()
for_ start end _fn | start > end = error "for_: start is greater than end"
for_ start end fn = loop start
where
loop !i | i == end = return ()
loop !i | i > end = return ()
| otherwise = do fn i; loop (i+1)

-- | Inclusive / Inclusive
Expand Down
35 changes: 26 additions & 9 deletions haskell/par-collections/tests/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -64,18 +64,21 @@ case_seqfoldMP = do
-- Data.Atomic.Counter drops that time by a factor of three, i.e. 0.9s for 500M, or
-- twice as slow as the fold versions above.
--
-- (By the way, it's 2X slower to use the C.incrCounter atomic op than to use raw
-- Side note 1: it's 2X slower to use the C.incrCounter atomic op than to use raw
-- reads and writes, but some of this must be due to the fact that the
-- fetch-and-add primop is not an inline primop yet.)
-- fetch-and-add primop is not an inline primop yet.
--
-- Side note 2: Using a custom vs. default implementatino of forM_ for Ranges
-- didn't make a difference here.
case_seqfor1 :: Assertion
case_seqfor1 = do
assertEqual "For loop over a range of ints" expectedSum =<<
(timeit $ do
cnt <- C.newCounter 0
PC.forM_ (irange 1 size) $ \ i -> do
-- x <- C.readCounter cnt
-- C.writeCounter cnt $! x + fromIntegral i
C.incrCounter (fromIntegral i) cnt
x <- C.readCounter cnt
C.writeCounter cnt $! x + fromIntegral i
-- C.incrCounter (fromIntegral i) cnt
return ()
fmap fromIntegral $ C.readCounter cnt
-- cnt <- newIORef 0
Expand All @@ -86,11 +89,16 @@ case_seqfor1 = do
-- readIORef cnt
)

-- Very slow currently [2013.12.07]: 5M in 0.37s, a full 100X worse.
case_seqforMP :: Assertion
case_seqforMP = do
-- Very slow currently [2013.12.07]: 5M in 0.37s, a full 100X worse. Providing a
-- custom definition of forMP_ for Ranges (rather than the default) got this down to
-- 0.13s for 5M, but that is still abysmal.
--
-- Of course, if you perform the internalLiftIO once, outside the loop, the
-- performance is the same as seqfor1.
case_seqforMP1 :: Assertion
case_seqforMP1 = do
assertEqual "For loop over a range of ints in Par monad" expectedSum =<<
(timeit $ P.runParIO $ do
(timeit $ P.runParIO $ do
cnt <- internalLiftIO $ C.newCounter 0
PC.forMP_ (irange 1 size) $ \ i -> do
x <- internalLiftIO$ C.readCounter cnt
Expand All @@ -99,6 +107,15 @@ case_seqforMP = do
fmap fromIntegral $ internalLiftIO$ C.readCounter cnt
)

-- Do no work in this version, but run the loop in the par monad.
-- This one is reasonable speed, 0.29s for 500M.
case_seqforMP2 :: Assertion
case_seqforMP2 = do
assertEqual "For loop over a range of ints in Par monad" () =<<
(timeit $ P.runParIO $ do
PC.forMP_ (irange 1 size) $ \ i ->
return ()
)

-- --------------------------------------------------------------------------------

Expand Down

0 comments on commit 045826c

Please sign in to comment.