[Bench] Fix arbitrary evaluation nonsense (IntersectMBO#5867)

This unscrews another portion of benchmarks. We still have benchmarks that are screwed up (see [PLT-6541](https://input-output.atlassian.net/browse/PLT-6541)) See [this](IntersectMBO#4914 (comment)) comment for an explanation of what went wrong.
v0d1ch · Dec 6, 2024 · a0afd2a · a0afd2a
1 parent 5dd488a
commit a0afd2a
Show file tree

Hide file tree

Showing 10 changed files with 128 additions and 92 deletions.
diff --git a/plutus-benchmark/README.md b/plutus-benchmark/README.md
@@ -1,5 +1,7 @@
 ## Plutus Benchmarks
 
+When implementing evaluation benchmarks, make sure to use 'mkEvalCtx' and 'evaluateCekForBench' to mimic the behavior of the ledger. If you use something else for evaluation, the results are likely not going to be representative of what actually happens in production.
+
 This directory contains four sets of benchmarks:
 
 * `nofib`: Plutus versions of some of Haskell's `nofib` benchmarks from https://github.com/ghc/nofib.

diff --git a/plutus-benchmark/cek-calibration/Main.hs b/plutus-benchmark/cek-calibration/Main.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE BangPatterns      #-}
 {-# LANGUAGE DataKinds         #-}
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}
@@ -16,13 +17,13 @@ module Main (main) where
 
 import Prelude qualified as Haskell
 
+import PlutusBenchmark.Common (benchTermCek, mkEvalCtx)
 import PlutusCore
-import PlutusCore.Evaluation.Machine.ExBudgetingDefaults
 import PlutusCore.Pretty qualified as PP
+import PlutusLedgerApi.Common (EvaluationContext)
 import PlutusTx qualified as Tx
 import PlutusTx.Prelude as Tx
 import UntypedPlutusCore as UPLC
-import UntypedPlutusCore.Evaluation.Machine.Cek
 
 import Control.Exception
 import Control.Lens
@@ -32,13 +33,6 @@ import Criterion.Types qualified as C
 
 type PlainTerm = UPLC.Term Name DefaultUni DefaultFun ()
 
-
-benchCek :: UPLC.Term NamedDeBruijn DefaultUni DefaultFun () -> Benchmarkable
-benchCek t = case runExcept @UPLC.FreeVariableError $ runQuoteT $ UPLC.unDeBruijnTerm t of
-    Left e   -> throw e
-    Right t' -> whnf (unsafeEvaluateCekNoEmit defaultCekParameters) t'
-
-
 {-# INLINABLE rev #-}
 rev :: [()] -> [()]
 rev l0 = rev' l0 []
@@ -74,11 +68,11 @@ mkListTerm n =
   let (UPLC.Program _ _ code) = mkListProg n
   in code
 
-mkListBM :: Integer -> Benchmark
-mkListBM n = bench (Haskell.show n) $ benchCek (mkListTerm n)
+mkListBM :: EvaluationContext -> Integer -> Benchmark
+mkListBM ctx n = bench (Haskell.show n) $ benchTermCek ctx (mkListTerm n)
 
-mkListBMs :: [Integer] -> Benchmark
-mkListBMs ns = bgroup "List" [mkListBM n | n <- ns]
+mkListBMs :: EvaluationContext -> [Integer] -> Benchmark
+mkListBMs ctx ns = bgroup "List" [mkListBM ctx n | n <- ns]
 
 writePlc :: UPLC.Program NamedDeBruijn DefaultUni DefaultFun () -> Haskell.IO ()
 writePlc p =
@@ -91,10 +85,13 @@ writePlc p =
 
 
 main1 :: Haskell.IO ()
-main1 =
-  defaultMainWith (defaultConfig { C.csvFile = Just "cek-lists.csv" }) $ [mkListBMs [0,10..1000]]
+main1 = do
+  evalCtx <- evaluate mkEvalCtx
+  defaultMainWith
+      (defaultConfig { C.csvFile = Just "cek-lists.csv" })
+      [mkListBMs evalCtx [0,10..1000]]
 
-main2:: Haskell.IO ()
+main2 :: Haskell.IO ()
 main2 = writePlc (mkListProg 999)
 
 main :: Haskell.IO ()

diff --git a/plutus-benchmark/common/PlutusBenchmark/Common.hs b/plutus-benchmark/common/PlutusBenchmark/Common.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE LambdaCase   #-}
 {-# LANGUAGE ViewPatterns #-}
 
@@ -11,11 +12,14 @@ module PlutusBenchmark.Common
     , toNamedDeBruijnTerm
     , compiledCodeToTerm
     , haskellValueToTerm
-    , benchTermCek
     , benchProgramCek
     , unsafeRunTermCek
     , runTermCek
     , cekResultMatchesHaskellValue
+    , mkEvalCtx
+    , evaluateCekLikeInProd
+    , evaluateCekForBench
+    , benchTermCek
     , benchTermAgdaCek
     , benchProgramAgdaCek
     , TestSize (..)
@@ -29,17 +33,23 @@ where
 import Paths_plutus_benchmark as Export
 import PlutusBenchmark.ProtocolParameters as PP
 
+import PlutusLedgerApi.Common qualified as LedgerApi
+
+import PlutusTx qualified as Tx
+
 import PlutusCore qualified as PLC
 import PlutusCore.Default
 import PlutusCore.Evaluation.Machine.ExBudget (ExBudget (..))
 import PlutusCore.Evaluation.Machine.ExBudgetingDefaults qualified as PLC
 import PlutusCore.Evaluation.Machine.ExMemory (ExCPU (..), ExMemory (..))
-import PlutusTx qualified as Tx
+
 import UntypedPlutusCore qualified as UPLC
 import UntypedPlutusCore.Evaluation.Machine.Cek as Cek
+import UntypedPlutusCore.Evaluation.Machine.Cek qualified as UPLC
 
 import MAlonzo.Code.Evaluator.Term (runUAgda)
 
+import Control.DeepSeq (force)
 import Criterion.Main
 import Criterion.Types (Config (..))
 import Data.ByteString qualified as BS
@@ -105,12 +115,6 @@ haskellValueToTerm
     :: Tx.Lift DefaultUni a => a -> Term
 haskellValueToTerm = compiledCodeToTerm . Tx.liftCodeDef
 
-
-{- | Convert a de-Bruijn-named UPLC term to a CEK Benchmark -}
-benchTermCek :: Term -> Benchmarkable
-benchTermCek term =
-    nf unsafeRunTermCek $! term -- Or whnf?
-
 {- | Convert a de-Bruijn-named UPLC term to a CEK Benchmark -}
 benchProgramCek :: Program -> Benchmarkable
 benchProgramCek (UPLC.Program _ _ term) =
@@ -156,6 +160,47 @@ cekResultMatchesHaskellValue
 cekResultMatchesHaskellValue term matches value =
     (unsafeRunTermCek term) `matches` (unsafeRunTermCek $ haskellValueToTerm value)
 
+-- | Create the evaluation context for the benchmarks. This doesn't exactly match how it's done
+-- on-chain, but that's okay because the evaluation context is cached by the ledger, so we're
+-- deliberately not including it in the benchmarks.
+mkEvalCtx :: LedgerApi.EvaluationContext
+mkEvalCtx =
+    case PLC.defaultCostModelParams of
+        Just p ->
+            let errOrCtx =
+                    -- The validation benchmarks were all created from PlutusV1 scripts
+                    LedgerApi.mkDynEvaluationContext DefaultFunSemanticsVariant1 p
+            in case errOrCtx of
+                Right ec -> ec
+                Left err -> error $ show err
+        Nothing -> error "Couldn't get cost model params"
+
+-- | Evaluate a term as it would be evaluated using the on-chain evaluator.
+evaluateCekLikeInProd
+    :: LedgerApi.EvaluationContext
+    -> UPLC.Term PLC.NamedDeBruijn PLC.DefaultUni PLC.DefaultFun ()
+    -> Either
+            (UPLC.CekEvaluationException UPLC.NamedDeBruijn UPLC.DefaultUni UPLC.DefaultFun)
+            (UPLC.Term UPLC.NamedDeBruijn UPLC.DefaultUni UPLC.DefaultFun ())
+evaluateCekLikeInProd evalCtx term = do
+    let (getRes, _, _) =
+            let -- The validation benchmarks were all created from PlutusV1 scripts
+                pv = LedgerApi.ledgerLanguageIntroducedIn LedgerApi.PlutusV1
+            in LedgerApi.evaluateTerm UPLC.restrictingEnormous pv LedgerApi.Quiet evalCtx term
+    getRes
+
+-- | Evaluate a term and either throw if evaluation fails or discard the result and return '()'.
+-- Useful for benchmarking.
+evaluateCekForBench
+    :: LedgerApi.EvaluationContext
+    -> UPLC.Term PLC.NamedDeBruijn PLC.DefaultUni PLC.DefaultFun ()
+    -> ()
+evaluateCekForBench evalCtx = either (error . show) (\_ -> ()) . evaluateCekLikeInProd evalCtx
+
+benchTermCek :: LedgerApi.EvaluationContext -> Term -> Benchmarkable
+benchTermCek evalCtx term =
+    let !term' = force term
+    in whnf (evaluateCekForBench evalCtx) term'
 
 ---------------- Run a term or program using the plutus-metatheory CEK evaluator ----------------
 

diff --git a/plutus-benchmark/lists/bench/Bench.hs b/plutus-benchmark/lists/bench/Bench.hs
@@ -1,18 +1,23 @@
+{-# LANGUAGE BangPatterns #-}
+
 {- | Plutus benchmarks for some simple list algorithms.  See README.md for more information. -}
 
 module Main (main) where
 
 import Criterion.Main
 
-import PlutusBenchmark.Common (benchTermCek, getConfig)
-
+import PlutusBenchmark.Common (benchTermCek, getConfig, mkEvalCtx)
 import PlutusBenchmark.Lists.Sort qualified as Sort
-
 import PlutusBenchmark.Lists.Sum.Compiled qualified as Sum.Compiled
 import PlutusBenchmark.Lists.Sum.HandWritten qualified as Sum.HandWritten
+import PlutusLedgerApi.Common (EvaluationContext)
 
-benchmarks :: [Benchmark]
-benchmarks =
+import Control.DeepSeq
+import Control.Exception
+import Data.Functor
+
+benchmarks :: EvaluationContext -> [Benchmark]
+benchmarks ctx =
     [ bgroup "sort"
       [ mkBMsForSort "ghcSort"       Sort.mkWorstCaseGhcSortTerm
       , mkBMsForSort "insertionSort" Sort.mkWorstCaseInsertionSortTerm
@@ -38,15 +43,17 @@ benchmarks =
     ]
     where
       mkBMsForSort name f =
-        bgroup name $ map (\n -> bench (show n) . benchTermCek . f $ n) sizesForSort
+        bgroup name $ sizesForSort <&> \n ->
+          bench (show n) $ benchTermCek ctx (f n)
       sizesForSort = [10, 20..500]
       mkBMsForSum name f =
-        bgroup name $ map (\n -> bench (show n) . benchTermCek . f $ [1..n]) sizesForSum
+        bgroup name $ sizesForSum <&> \n ->
+          bench (show n) $ benchTermCek ctx (f [1..n])
       sizesForSum = [10, 50, 100, 500, 1000, 5000, 10000]
 
 main :: IO ()
 main = do
   -- Run each benchmark for at least 15 seconds.  Change this with -L or --timeout.
   config <- getConfig 15.0
-  defaultMainWith config benchmarks
-
+  evalCtx <- evaluate $ force mkEvalCtx
+  defaultMainWith config $ benchmarks evalCtx
diff --git a/plutus-benchmark/nofib/bench/BenchCek.hs b/plutus-benchmark/nofib/bench/BenchCek.hs
@@ -1,8 +1,14 @@
+{-# LANGUAGE BangPatterns #-}
+
 {- | Plutus benchmarks for the CEK machine based on some nofib examples. -}
 module Main where
 
-import PlutusBenchmark.Common (benchTermCek)
-import Shared (benchWith)
+import Shared (benchTermCek, benchWith, mkEvalCtx)
+
+import Control.DeepSeq (force)
+import Control.Exception (evaluate)
 
 main :: IO ()
-main = benchWith benchTermCek
+main = do
+  evalCtx <- evaluate $ force mkEvalCtx
+  benchWith $ benchTermCek evalCtx
diff --git a/plutus-benchmark/nofib/bench/Shared.hs b/plutus-benchmark/nofib/bench/Shared.hs
@@ -1,8 +1,12 @@
 {- | Shared code for benchmarking Plutus and Haskell versions of the Plutus nofib examples -}
-module Shared (benchWith, mkBenchMarks)
-where
-
-import PlutusBenchmark.Common (Term, getConfig)
+module Shared (
+    benchWith
+    , mkBenchMarks
+    , mkEvalCtx
+    , benchTermCek
+    ) where
+
+import PlutusBenchmark.Common (Term, benchTermCek, getConfig, mkEvalCtx)
 
 import PlutusBenchmark.NoFib.Clausify qualified as Clausify
 import PlutusBenchmark.NoFib.Knights qualified as Knights

diff --git a/plutus-benchmark/plutus-benchmark.cabal b/plutus-benchmark/plutus-benchmark.cabal
@@ -75,10 +75,12 @@ library plutus-benchmark-common
     , base               >=4.9   && <5
     , bytestring
     , criterion
+    , deepseq
     , directory
     , filepath
     , flat               ^>=0.6
     , plutus-core        ^>=1.25
+    , plutus-ledger-api  ^>=1.25
     , plutus-metatheory
     , plutus-tx          ^>=1.25
     , tasty
@@ -141,6 +143,7 @@ benchmark nofib
   build-depends:
     , base                     >=4.9     && <5
     , criterion                >=1.5.9.0
+    , deepseq
     , nofib-internal
     , plutus-benchmark-common
 
@@ -220,8 +223,10 @@ benchmark lists
   build-depends:
     , base                     >=4.9     && <5
     , criterion                >=1.5.9.0
+    , deepseq
     , lists-internal
     , plutus-benchmark-common
+    , plutus-ledger-api
 
 test-suite plutus-benchmark-lists-tests
   import:         lang, ghc-version-support
@@ -262,7 +267,6 @@ benchmark validation
     , optparse-applicative
     , plutus-benchmark-common
     , plutus-core              ^>=1.25
-    , plutus-ledger-api        ^>=1.25
 
 ---------------- validation-decode ----------------
 
@@ -319,13 +323,16 @@ benchmark cek-calibration
   main-is:          Main.hs
   hs-source-dirs:   cek-calibration
   build-depends:
-    , base              >=4.9     && <5
-    , criterion         >=1.5.9.0
+    , base                     >=4.9     && <5
+    , criterion                >=1.5.9.0
+    , deepseq
     , lens
     , mtl
-    , plutus-core       ^>=1.25
-    , plutus-tx         ^>=1.25
-    , plutus-tx-plugin  ^>=1.25
+    , plutus-benchmark-common
+    , plutus-core              ^>=1.25
+    , plutus-ledger-api        ^>=1.25
+    , plutus-tx                ^>=1.25
+    , plutus-tx-plugin         ^>=1.25
 
 ---------------- Signature verification throughput ----------------
 
@@ -567,7 +574,6 @@ benchmark validation-agda-cek
     , optparse-applicative
     , plutus-benchmark-common
     , plutus-core              ^>=1.25
-    , plutus-ledger-api        ^>=1.25
 
 benchmark nofib-agda-cek
   import:         lang, ghc-version-support

diff --git a/plutus-benchmark/validation/bench/BenchCek.hs b/plutus-benchmark/validation/bench/BenchCek.hs
@@ -1,16 +1,12 @@
 {- | Validation benchmarks for the CEK machine. -}
-
-{-# LANGUAGE BangPatterns #-}
 module Main where
 
-import Common (benchWith, evaluateCekLikeInProd, mkEvalCtx, unsafeUnflat)
+import Common (benchTermCek, benchWith, mkEvalCtx, unsafeUnflat)
 import Control.DeepSeq (force)
 import Control.Exception (evaluate)
 import PlutusBenchmark.Common (toNamedDeBruijnTerm)
 import UntypedPlutusCore as UPLC
 
-import Criterion (whnf)
-
 {-|
  Benchmarks only for the CEK execution time of the data/*.flat validation scripts
 
@@ -23,9 +19,5 @@ main :: IO ()
 main = do
   evalCtx <- evaluate $ force mkEvalCtx
   let mkCekBM file program =
-          -- don't count the undebruijn . unflat cost
-          -- `force` to try to ensure that deserialiation is not included in benchmarking time.
-          let !benchTerm = force . toNamedDeBruijnTerm . UPLC._progTerm $ unsafeUnflat file program
-              eval = either (error . show) (\_ -> ()) . evaluateCekLikeInProd evalCtx
-          in whnf eval benchTerm
+          benchTermCek evalCtx . toNamedDeBruijnTerm . UPLC._progTerm $ unsafeUnflat file program
   benchWith mkCekBM