Poking at the performance of loop microbenchmarks

iu-parfunc · Dec 8, 2013 · 045826c · 045826c
1 parent d430007
commit 045826c
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 11 deletions.
diff --git a/haskell/par-collections/Data/Par/Range.hs b/haskell/par-collections/Data/Par/Range.hs
@@ -89,6 +89,11 @@ instance Generator Range where
   foldMP fn !inita (InclusiveRange st en _thresh) =
     forAcc_ st en inita (flip fn)
 
+  {-# INLINE forM_ #-}
+  forM_ (InclusiveRange st en _thresh) fn = for_ st en fn
+
+  {-# INLINE forMP_ #-}
+  forMP_ (InclusiveRange st en _thresh) fn = for_ st en fn
 
 -- | Enumerate the elements in a Range.
 toList :: Range -> [Int]
@@ -327,13 +332,12 @@ for_ (start, end) fn = loop start
 
 
 -- My own forM for numeric ranges (not requiring deforestation optimizations).
--- Inclusive start, exclusive end.
 {-# INLINE for_ #-}
 for_ :: Monad m => Int -> Int -> (Int -> m ()) -> m ()
 for_ start end _fn | start > end = error "for_: start is greater than end"
 for_ start end fn = loop start
   where
-   loop !i | i == end  = return ()
+   loop !i | i > end  = return ()
 	   | otherwise = do fn i; loop (i+1)
 
 -- | Inclusive / Inclusive

diff --git a/haskell/par-collections/tests/Main.hs b/haskell/par-collections/tests/Main.hs
@@ -64,18 +64,21 @@ case_seqfoldMP = do
 -- Data.Atomic.Counter drops that time by a factor of three, i.e. 0.9s for 500M, or
 -- twice as slow as the fold versions above.
 --
---   (By the way, it's 2X slower to use the C.incrCounter atomic op than to use raw
+--   Side note 1: it's 2X slower to use the C.incrCounter atomic op than to use raw
 --   reads and writes, but some of this must be due to the fact that the
---   fetch-and-add primop is not an inline primop yet.)
+--   fetch-and-add primop is not an inline primop yet.
+--
+--   Side note 2: Using a custom vs. default implementatino of forM_ for Ranges
+--   didn't make a difference here.
 case_seqfor1 :: Assertion
 case_seqfor1 = do
   assertEqual "For loop over a range of ints" expectedSum =<<
     (timeit $ do
        cnt <- C.newCounter 0
        PC.forM_ (irange 1 size) $ \ i -> do 
---         x <- C.readCounter cnt
---         C.writeCounter cnt $! x + fromIntegral i
-         C.incrCounter (fromIntegral i) cnt 
+         x <- C.readCounter cnt
+         C.writeCounter cnt $! x + fromIntegral i
+--         C.incrCounter (fromIntegral i) cnt 
          return ()
        fmap fromIntegral $ C.readCounter cnt 
        -- cnt <- newIORef 0
@@ -86,11 +89,16 @@ case_seqfor1 = do
        -- readIORef cnt        
     )
 
--- Very slow currently [2013.12.07]: 5M in 0.37s, a full 100X worse.
-case_seqforMP :: Assertion
-case_seqforMP = do
+-- Very slow currently [2013.12.07]: 5M in 0.37s, a full 100X worse.  Providing a
+-- custom definition of forMP_ for Ranges (rather than the default) got this down to
+-- 0.13s for 5M, but that is still abysmal.
+--  
+--   Of course, if you perform the internalLiftIO once, outside the loop, the
+--   performance is the same as seqfor1.
+case_seqforMP1 :: Assertion
+case_seqforMP1 = do
   assertEqual "For loop over a range of ints in Par monad" expectedSum =<<
-    (timeit $ P.runParIO $ do
+    (timeit $ P.runParIO $ do 
        cnt <- internalLiftIO $ C.newCounter 0
        PC.forMP_ (irange 1 size) $ \ i -> do 
          x <- internalLiftIO$ C.readCounter cnt
@@ -99,6 +107,15 @@ case_seqforMP = do
        fmap fromIntegral $ internalLiftIO$ C.readCounter cnt 
     )
 
+-- Do no work in this version, but run the loop in the par monad.
+-- This one is reasonable speed, 0.29s for 500M.
+case_seqforMP2 :: Assertion
+case_seqforMP2 = do
+  assertEqual "For loop over a range of ints in Par monad" () =<<
+    (timeit $ P.runParIO $ do 
+       PC.forMP_ (irange 1 size) $ \ i -> 
+         return ()
+    )
 
 -- --------------------------------------------------------------------------------