diff --git a/plutus-core/changelog.d/20230809_093202_koz.ross_bitwise.md b/plutus-core/changelog.d/20230809_093202_koz.ross_bitwise.md new file mode 100644 index 00000000000..2452d730960 --- /dev/null +++ b/plutus-core/changelog.d/20230809_093202_koz.ross_bitwise.md @@ -0,0 +1,41 @@ + + + +### Added + +- Bitwise primitive operations, according to + [CIP-0058](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0058). + + + + + diff --git a/plutus-core/cost-model/create-cost-model/CreateBuiltinCostModel.hs b/plutus-core/cost-model/create-cost-model/CreateBuiltinCostModel.hs index f0fdd26dd43..43a08a00213 100644 --- a/plutus-core/cost-model/create-cost-model/CreateBuiltinCostModel.hs +++ b/plutus-core/cost-model/create-cost-model/CreateBuiltinCostModel.hs @@ -123,6 +123,18 @@ builtinCostModelNames = BuiltinCostModelBase , paramBls12_381_finalVerify = "bls12_381_finalVerifyModel" , paramBlake2b_224 = "blake2b_224Model" , paramKeccak_256 = "keccak_256Model" + , paramIntegerToByteString = "integerToByteStringModel" + , paramByteStringToInteger = "byteStringToIntegerModel" + , paramAndByteString = "andByteStringModel" + , paramIorByteString = "iorByteStringModel" + , paramXorByteString = "xorByteStringModel" + , paramComplementByteString = "complementByteStringModel" + , paramShiftByteString = "shiftByteStringModel" + , paramRotateByteString = "rotateByteStringModel" + , paramPopCountByteString = "popCountByteStringModel" + , paramTestBitByteString = "testBitByteStringModel" + , paramWriteBitByteString = "writeBitByteStringModel" + , paramFindFirstSetByteString = "findFirstSetByteStringModel" } @@ -233,6 +245,19 @@ createBuiltinCostModel bmfile rfile = do paramBls12_381_millerLoop <- getParams bls12_381_millerLoop paramBls12_381_millerLoop paramBls12_381_mulMlResult <- getParams bls12_381_mulMlResult paramBls12_381_mulMlResult paramBls12_381_finalVerify <- getParams bls12_381_finalVerify paramBls12_381_finalVerify + -- Bitwise operations + paramIntegerToByteString 
<- getParams integerToByteString paramIntegerToByteString + paramByteStringToInteger <- getParams byteStringToInteger paramByteStringToInteger + paramAndByteString <- getParams andByteString paramAndByteString + paramIorByteString <- getParams iorByteString paramIorByteString + paramXorByteString <- getParams xorByteString paramXorByteString + paramComplementByteString <- getParams complementByteString paramComplementByteString + paramShiftByteString <- getParams shiftByteString paramShiftByteString + paramRotateByteString <- getParams rotateByteString paramRotateByteString + paramPopCountByteString <- getParams popCountByteString paramPopCountByteString + paramTestBitByteString <- getParams testBitByteString paramTestBitByteString + paramWriteBitByteString <- getParams writeBitByteString paramWriteBitByteString + paramFindFirstSetByteString <- getParams findFirstSetByteString paramFindFirstSetByteString paramKeccak_256 <- getParams keccak_256 paramKeccak_256 paramBlake2b_224 <- getParams blake2b_224 paramBlake2b_224 @@ -922,9 +947,80 @@ bls12_381_mulMlResult cpuModelR = do pure $ CostingFun cpuModel memModel bls12_381_finalVerify :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelTwoArguments) -bls12_381_finalVerify cpuModelR= do +bls12_381_finalVerify cpuModelR = do cpuModel <- ModelTwoArgumentsConstantCost <$> readModelConstantCost cpuModelR let memModel = boolMemModel pure $ CostingFun cpuModel memModel +integerToByteString :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelOneArgument) +integerToByteString cpuModelR = do + cpuModel <- ModelOneArgumentLinearCost <$> readModelLinearInX cpuModelR + let memModel = ModelOneArgumentLinearCost $ ModelLinearSize 0 1 + pure $ CostingFun cpuModel memModel + +byteStringToInteger :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelOneArgument) +byteStringToInteger cpuModelR = do + cpuModel <- ModelOneArgumentLinearCost <$> readModelLinearInX cpuModelR + let memModel = ModelOneArgumentLinearCost $ 
ModelLinearSize 0 1 + pure $ CostingFun cpuModel memModel + +andByteString :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelTwoArguments) +andByteString cpuModelR = do + cpuModel <- ModelTwoArgumentsMaxSize <$> readModelMaxSize cpuModelR + let memModel = ModelTwoArgumentsMaxSize $ ModelMaxSize 0 1 + pure $ CostingFun cpuModel memModel + +iorByteString :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelTwoArguments) +iorByteString cpuModelR = do + cpuModel <- ModelTwoArgumentsMaxSize <$> readModelMaxSize cpuModelR + let memModel = ModelTwoArgumentsMaxSize $ ModelMaxSize 0 1 + pure $ CostingFun cpuModel memModel + +xorByteString :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelTwoArguments) +xorByteString cpuModelR = do + cpuModel <- ModelTwoArgumentsMaxSize <$> readModelMaxSize cpuModelR + let memModel = ModelTwoArgumentsMaxSize $ ModelMaxSize 0 1 + pure $ CostingFun cpuModel memModel + +complementByteString :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelOneArgument) +complementByteString cpuModelR = do + cpuModel <- ModelOneArgumentLinearCost <$> readModelLinearInX cpuModelR + let memModel = ModelOneArgumentLinearCost $ ModelLinearSize 0 1 + pure $ CostingFun cpuModel memModel + +shiftByteString :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelTwoArguments) +shiftByteString cpuModelR = do + cpuModel <- undefined + let memModel = undefined + pure $ CostingFun cpuModel memModel + +rotateByteString :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelTwoArguments) +rotateByteString cpuModelR = do + cpuModel <- undefined -- FIXME + let memModel = undefined -- FIXME + pure $ CostingFun cpuModel memModel + +popCountByteString :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelOneArgument) +popCountByteString cpuModelR = do + cpuModel <- ModelOneArgumentLinearCost <$> readModelLinearInX cpuModelR + let memModel = ModelOneArgumentLinearCost $ ModelLinearSize 0 1 -- FIXME + pure $ CostingFun cpuModel memModel + 
+testBitByteString :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelTwoArguments) +testBitByteString cpuModelR = do + cpuModel <- undefined -- FIXME + let memModel = ModelTwoArgumentsConstantCost 1 + pure $ CostingFun cpuModel memModel + +writeBitByteString :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelThreeArguments) +writeBitByteString cpuModelR = do + cpuModel <- undefined -- FIXME + let memModel = undefined -- FIXME + pure $ CostingFun cpuModel memModel + +findFirstSetByteString :: MonadR m => (SomeSEXP (Region m)) -> m (CostingFun ModelOneArgument) +findFirstSetByteString cpuModelR = do + cpuModel <- ModelOneArgumentLinearCost <$> readModelLinearInX cpuModelR + let memModel = undefined -- FIXME + pure $ CostingFun cpuModel memModel diff --git a/plutus-core/cost-model/data/builtinCostModel.json b/plutus-core/cost-model/data/builtinCostModel.json index e1fd262c885..8abf48f5249 100644 --- a/plutus-core/cost-model/data/builtinCostModel.json +++ b/plutus-core/cost-model/data/builtinCostModel.json @@ -899,5 +899,197 @@ "arguments": 10, "type": "constant_cost" } + }, + "integerToByteString": { + "cpu": { + "arguments": { + "intercept": 9999999999, + "slope": 9999999999 + }, + "type": "linear_cost" + }, + "memory": { + "arguments": { + "intercept": 0, + "slope": 1 + }, + "type": "linear_cost" + } + }, + "byteStringToInteger": { + "cpu": { + "arguments": { + "intercept": 9999999999, + "slope": 9999999999 + }, + "type": "linear_cost" + }, + "memory": { + "arguments": { + "intercept": 1, + "slope": 1 + }, + "type": "linear_cost" + } + }, + "andByteString": { + "cpu": { + "arguments": { + "intercept": 9999999999, + "slope": 9999999999 + }, + "type": "max_size" + }, + "memory": { + "arguments": { + "intercept": 1, + "slope": 1 + }, + "type": "max_size" + } + }, + "iorByteString": { + "cpu": { + "arguments": { + "intercept": 9999999999, + "slope": 9999999999 + }, + "type": "max_size" + }, + "memory": { + "arguments": { + "intercept": 1, + "slope": 1 + 
}, + "type": "max_size" + } + }, + "xorByteString": { + "cpu": { + "arguments": { + "intercept": 9999999999, + "slope": 9999999999 + }, + "type": "max_size" + }, + "memory": { + "arguments": { + "intercept": 1, + "slope": 1 + }, + "type": "max_size" + } + }, + "complementByteString": { + "cpu": { + "arguments": { + "intercept": 9999999999, + "slope": 9999999999 + }, + "type": "linear_cost" + }, + "memory": { + "arguments": { + "intercept": 0, + "slope": 1 + }, + "type": "linear_cost" + } + }, + "shiftByteString": { + "cpu": { + "arguments": { + "intercept": 9999999999, + "slope": 9999999999 + }, + "type": "max_size" + }, + "memory": { + "arguments": { + "intercept": 1, + "slope": 1 + }, + "type": "max_size" + } + }, + "rotateByteString": { + "cpu": { + "arguments": { + "intercept": 9999999999, + "slope": 9999999999 + }, + "type": "max_size" + }, + "memory": { + "arguments": { + "intercept": 1, + "slope": 1 + }, + "type": "max_size" + } + }, + "popCountByteString": { + "cpu": { + "arguments": { + "intercept": 9999999999, + "slope": 9999999999 + }, + "type": "linear_cost" + }, + "memory": { + "arguments": { + "intercept": 0, + "slope": 1 + }, + "type": "linear_cost" + } + }, + "testBitByteString": { + "cpu": { + "arguments": { + "intercept": 9999999999, + "slope": 9999999999 + }, + "type": "linear_in_x" + }, + "memory": { + "arguments": { + "intercept": 0, + "slope": 1 + }, + "type": "linear_in_x" + } + }, + "writeBitByteString": { + "cpu": { + "arguments": { + "intercept": 9999999999, + "slope": 9999999999 + }, + "type": "linear_in_x" + }, + "memory": { + "arguments": { + "intercept": 0, + "slope": 1 + }, + "type": "linear_in_x" + } + }, + "findFirstSetByteString": { + "cpu": { + "arguments": { + "intercept": 9999999999, + "slope": 9999999999 + }, + "type": "linear_cost" + }, + "memory": { + "arguments": { + "intercept": 0, + "slope": 1 + }, + "type": "linear_cost" + } } } diff --git a/plutus-core/cost-model/data/models.R b/plutus-core/cost-model/data/models.R index 
cd77bb0bc87..60db2d3ab22 100644 --- a/plutus-core/cost-model/data/models.R +++ b/plutus-core/cost-model/data/models.R @@ -651,6 +651,20 @@ modelFun <- function(path) { bls12_381_mulMlResultModel <- constantModel ("Bls12_381_mulMlResult") bls12_381_finalVerifyModel <- constantModel ("Bls12_381_finalVerify") + ##### Bitwise operations ##### + integerToByteStringModel <- NULL ### FIXME + byteStringToIntegerModel <- NULL + andByteStringModel <- NULL + iorByteStringModel <- NULL + xorByteStringModel <- NULL + complementByteStringModel <- NULL + shiftByteStringModel <- NULL + rotateByteStringModel <- NULL + popCountByteStringModel <- NULL + testBitByteStringModel <- NULL + writeBitByteStringModel <- NULL + findFirstSetByteStringModel <- NULL + list( addIntegerModel = addIntegerModel, subtractIntegerModel = subtractIntegerModel, @@ -724,6 +738,18 @@ modelFun <- function(path) { bls12_381_G2_uncompressModel = bls12_381_G2_uncompressModel, bls12_381_millerLoopModel = bls12_381_millerLoopModel, bls12_381_mulMlResultModel = bls12_381_mulMlResultModel, - bls12_381_finalVerifyModel = bls12_381_finalVerifyModel + bls12_381_finalVerifyModel = bls12_381_finalVerifyModel, + integerToByteStringModel = integerToByteStringModel, + byteStringToIntegerModel = byteStringToIntegerModel, + andByteStringModel = andByteStringModel, + iorByteStringModel = iorByteStringModel, + xorByteStringModel = xorByteStringModel, + complementByteStringModel = complementByteStringModel, + shiftByteStringModel = shiftByteStringModel, + rotateByteStringModel = rotateByteStringModel, + popCountByteStringModel = popCountByteStringModel, + testBitByteStringModel = testBitByteStringModel, + writeBitByteStringModel = writeBitByteStringModel, + findFirstSetByteStringModel = findFirstSetByteStringModel ) -} +} \ No newline at end of file diff --git a/plutus-core/cost-model/test/TestCostModels.hs b/plutus-core/cost-model/test/TestCostModels.hs index 2e7546b25a8..b0660327929 100644 --- 
a/plutus-core/cost-model/test/TestCostModels.hs +++ b/plutus-core/cost-model/test/TestCostModels.hs @@ -419,9 +419,22 @@ main = , $(genTest 2 "bls12_381_millerLoop") Everywhere , $(genTest 2 "bls12_381_mulMlResult") Everywhere , $(genTest 2 "bls12_381_finalVerify") Everywhere - -- Keccak_256, Blake2b_224 , $(genTest 1 "keccak_256") , $(genTest 1 "blake2b_224") - ] + -- Bitwise operations + , $(genTest 1 "integerToByteString") + , $(genTest 1 "byteStringToInteger") + , $(genTest 2 "andByteString") Everywhere + , $(genTest 2 "iorByteString") Everywhere + , $(genTest 2 "xorByteString") Everywhere + , $(genTest 1 "complementByteString") + , $(genTest 2 "shiftByteString") Everywhere + , $(genTest 2 "rotateByteString") Everywhere + , $(genTest 1 "popCountByteString") + , $(genTest 2 "testBitByteString") Everywhere + , $(genTest 3 "writeBitByteString") + , $(genTest 1 "findFirstSetByteString") + ] + diff --git a/plutus-core/plutus-core.cabal b/plutus-core/plutus-core.cabal index 94624f30a72..6d367bca1cc 100644 --- a/plutus-core/plutus-core.cabal +++ b/plutus-core/plutus-core.cabal @@ -71,6 +71,7 @@ common lang library import: lang exposed-modules: + Bitwise Data.Either.Extras Data.List.Extras Data.MultiSet.Lens @@ -281,6 +282,7 @@ library , filepath , flat ^>=0.6 , free + , ghc-bignum ^>=1.3 , ghc-prim , hashable >=1.4 , hedgehog >=1.0 @@ -383,6 +385,7 @@ test-suite untyped-plutus-core-test DeBruijn.Spec DeBruijn.UnDeBruijnify Evaluation.Builtins + Evaluation.Builtins.Bitwise Evaluation.Builtins.BLS12_381 Evaluation.Builtins.BLS12_381.TestClasses Evaluation.Builtins.BLS12_381.Utils @@ -992,3 +995,46 @@ test-suite index-envs-test , quickcheck-instances , tasty , tasty-quickcheck + +benchmark bitwise + import: lang + type: exitcode-stdio-1.0 + hs-source-dirs: plutus-core/bench/bitwise + default-language: Haskell2010 + main-is: Main.hs + other-modules: + Benches.Binary + Benches.BitRead + Benches.BitWrite + Benches.Complement + Benches.Convert + Benches.CountLeadingZeroes + 
Benches.Popcount + Benches.Rotate + Benches.Shift + DataGen + + include-dirs: plutus-core/bench/bitwise/cbits + c-sources: + plutus-core/bench/bitwise/cbits/binary.c + plutus-core/bench/bitwise/cbits/bit-access.c + plutus-core/bench/bitwise/cbits/clz.c + plutus-core/bench/bitwise/cbits/complement.c + plutus-core/bench/bitwise/cbits/popcount.c + plutus-core/bench/bitwise/cbits/rotate.c + plutus-core/bench/bitwise/cbits/shift.c + + cc-options: -O3 + + if arch(x86_64) + cc-options: -mpopcnt -mabm + + build-depends: + , base + , bytestring + , mtl + , random + , tasty + , tasty-bench + + ghc-options: -O2 -rtsopts "-with-rtsopts=-A32m --nonmoving-gc -T" diff --git a/plutus-core/plutus-core/bench/bitwise/BENCHES.md b/plutus-core/plutus-core/bench/bitwise/BENCHES.md new file mode 100644 index 00000000000..0a1dcc5ce1e --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/BENCHES.md @@ -0,0 +1,900 @@ +# Benchmarks of bitwise operations for Plutus core + +## Introduction + +## Conventions + +We use _kilobyte_ (_KiB_ for short) to mean $2^{10}$ (1024) bytes. + +We assume versions of Haskell libraries as defined by Plutus and determined by +`cabal new-freeze`. In particular, the following library versions are assumed to +be in use: + +* `bytestring-0.10.12.0` +* `wide-word-0.1.1.2` + +An operation is _bit-parallel_ if it treats its inputs as vectors of bits and +assumes no other structure. For example [bitwise logical +AND](https://en.wikipedia.org/wiki/Bitwise_operation#AND) is bit-parallel, while +a [bitwise +rotation](https://en.wikipedia.org/wiki/Bitwise_operation#Circular_shift) isn't. +This extends the definition of right-to-left computability defined in [Hacker's +Delight](https://en.wikipedia.org/wiki/Hacker%27s_Delight), but is stricter. + +We use _population count_ to mean the number of 1 bits. We use this term with +individual bytes, words, or sequences of either. 
+ +## Background + +Plutus Core, which is designed to be executed on-chain, has unusual limitations +regarding performance. Specifically, the size of its possible arguments is +_significantly_ bounded: according to +[CIP-0009](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0009), the +current on-chain transaction size limit is 16384 bytes (16KiB), which includes +both script sources and arguments. While this limit could rise in the future, +currently, it means that `ByteString` arguments larger than about 2KiB aren't +practical. This implies that: + +* Asymptotically-optimal solutions may not be worthwhile if their 'ramp-up' + requires inputs approaching, or larger than, 2KiB; and +* Small inputs must be considered the norm. + +### Spatial and temporal locality + +## Implementation + +### Bitwise binary operations + +The naive approach (and our first) uses a combination of `fromListN` and `zip` +together with the operator in question; in essence, this streams both arguments +into a list of `Word8`s, combines them with the given operator, then 'repacks' +into a `ByteString`. This approach, while easy to implement, is likely to be +fairly slow once the size of its arguments becomes even moderately large. We +enumerate the reasons for this hypothesis below; many of these are based on the +reasoning described in [the spatial and temporal locality](#spatial-and-temporal-locality) +section. + +* The `ByteString` library + [does not use the size hint](https://hackage.haskell.org/package/bytestring-0.10.12.0/docs/src/Data.ByteString.Internal.html#line-194) + given to `fromListN`. This requires a traversal of the input list, which + forces it, making any stream fusion from GHC ineffective. +* Haskell linked lists lack spatial locality, and due to the above, we have to + force spatially local data into a spatially non-local form, only to then + immediately convert it _back_ to a spatially local form again. 
This not only + requires copying the data, it also means that we lose the benefits of spatial + locality for no reason. +* Haskell linked lists full of `Word8` lack temporal locality: even though a + modern machine can fit eight `Word8`s into a single register, if the data is + in a list, this cannot be done. + +These concerns were seen as sufficient to warrant the introduction of +`packZipWith`, which [avoids creating an intermediate list](https://github.com/haskell/bytestring/pull/295). +This operation amounts to creating an empty `ByteString` of the right length, +then performing the 'zipping' in-place. This avoids the problems of spatial +locality, and in theory could also avoid problems of temporal locality if the +operation being performed is bit-parallel. However, in general, 'zipping' +operations on `Word8`s cannot be assumed to be bit-parallel, which requires a +more conservative, 'byte-at-a-time' approach. Furthermore, `packZipWith` only +became available in the `bytestring` library as of [release +0.11.1.0](https://hackage.haskell.org/package/bytestring-0.11.1.0/changelog). +Thus, we replicate it for our second approach. + +All of our bitwise binary operations are bit-parallel by their definition; this +allows an implementation where we exploit temporal locality to process eight +bytes at a time. This essentially mirrors what the second approach does, but +instead first takes a maximal number of 'big steps' (eight bytes at a time), +followed by any remaining inputs being processed one byte at a time. We take +this as our third approach. We also attempt a fourth approach, where we take +even larger steps, using the `Word256` type from `wide-word`: this amounts to a +four-way [loop unroll](https://en.wikipedia.org/wiki/Loop_unrolling), as GHC +cannot currently generate SIMD instructions, even for bit-parallel operations. 
+This can, in theory, still be beneficial, due to +[ILP](https://en.wikipedia.org/wiki/Instruction-level_parallelism) being +available on most modern CPUs. More specifically, the third approach works as +follows: + +1. Allocate an empty `ByteString` of the correct length. +2. While at least eight bytes of both inputs are remaining, perform the bitwise + operation on an eight-byte chunk of both inputs, then write the result to the + corresponding part of the empty `ByteString` from 1. +3. For the remaining bytes, perform the bitwise operation on each of the + corresponding bytes of the input, writing them to the empty `ByteString` from + 1. + +The fourth approach works as follows: + +1. Allocate an empty `ByteString` of the correct length. +2. While at least 32 bytes of both inputs are remaining, perform the bitwise + operation on a 32-byte chunk of both inputs, then write the result to the + corresponding part of the empty `ByteString` from 1. +3. While at least eight bytes of both inputs are remaining, perform the bitwise + operation on an eight-byte chunk of both inputs, then write the result to the + corresponding part of the empty `ByteString` from 1. +4. For the remaining bytes, perform the bitwise operation on each of the + corresponding bytes of the input, writing them to the empty `ByteString` from + 1. + +### Bitwise complement + +A naive (and our first) approach would use `map` from `bytestring`: this is +implemented as a [loop over a preconstructed empty +`bytestring`](https://hackage.haskell.org/package/bytestring-0.10.12.0/docs/src/Data.ByteString.html#map), +and thus has good spatial locality. In theory, if given a bit-parallel +operation, it could make use of temporal locality by operating on larger widths +(such as whole machine words), but as `Word8` operations cannot be assumed to be +bit-parallel in general, it must work 'byte-at-a-time'. 
In our case, bitwise +complement _is_ bit-parallel, so our second approach attempts to make use of +this fact, essentially doing the following: + +1. Allocate an empty `ByteString` the same length as the input. +2. While at least eight bytes of the input remains, determine the bitwise + complement of an eight-byte chunk of the input, then write it to the + corresponding part of the empty `ByteString` from 1. +3. For the remaining bytes, perform the bitwise complement on each of the + corresponding bytes of the input, writing them to the empty `ByteString` from + 1. + +We also define a third approach which takes even larger steps, using the +`Word256` type from `wide-word`. This amounts to a four-way loop unroll, as GHC +cannot currently generate SIMD instructions, even for bit-parallel operations. +This can, in theory, still be beneficial, due to ILP being available on most +modern CPUs. More specifically, the third approach works as follows: + +1. Allocate an empty `ByteString` the same length as the input. +2. While at least 32 bytes of the input remains, determine the bitwise + complement of a 32-byte chunk of the input, then write it to the + corresponding part of the empty `ByteString` from 1. +3. While at least eight bytes of the input remains, determine the bitwise + complement of an eight-byte chunk of the input, then write it to the + corresponding part of the empty `ByteString` from 1. +4. For the remaining bytes, perform the bitwise complement on each of the + corresponding bytes of the input, writing them to the empty `ByteString` from + 1. + +### Population count + +A naive approach would involve a fold over the `Word8`s in the argument, summing +the result of `popCount`. This forms our first approach, using the `foldl'` +function provided by `bytestring`. 
This approach makes good use of spatial +locality, but not particularly good use of temporal locality: each `Word8` we +load into a register to population count still requires a memory transfer, but +we only population count 8 bits, rather than the 64 bits that could fit into the +register. Moreover, x86_64 platforms have efficient instructions dedicated to +population counting, which can easily count a whole register's worth of bits. +Thus, our second approach makes use of this capability by doing two 'phases' of +counting: firstly, we count eight-byte chunks, then finish what remains one byte +at a time. Specifically, we do the following: + +1. Initialize a counter to 0. +2. While at least eight bytes of the input remains, population count an + eight-byte chunk, then add the result to the counter. +3. While any bytes of the input remain, population count one byte, then add the + result to the counter. +4. Return the counter. + +We also define a third approach which takes even larger chunks, using the +`Word256` type from `wide-word`. This amounts to a four-way loop unroll, as +there are no specialized instructions for population counting chunks larger than +eight bytes on any current architectures supported by GHC. This can, in theory, +still be beneficial, due to ILP being available on most modern CPUs. More +specifically, the third approach works as follows: + +1. Initialize a counter to 0. +2. While at least 32 bytes of input remains, population count a 32-byte chunk, + then add the result to the counter. +3. While at least eight bytes of the input remains, population count an + eight-byte chunk, then add the result to the counter. +4. While any bytes of the input remain, population count one byte, then add the + result to the counter. +5. Return the counter. 
+ +## Methodology + +### Bitwise binary operations + +We benchmark only bitwise AND, as the other operations do not differ +structurally, are also bit-parallel, and given a fixed width of inputs, are +constant time relative to that width. We implement all four approaches; for the +third and fourth approaches, we implement them both in Haskell and in C, which +is called via the FFI. This allows us to see if any overheads are being +introduced by GHC. + +We use pairs of inputs of the following lengths: + +* 1 +* 3 +* 7 +* 15 +* 31 +* 63 +* 127 +* 255 +* 511 +* 1023 +* 2047 + +We choose these values to disadvantage the third and fourth approaches as much +as possible, as values that are one less than a power of 2 in length would +require them to do the most work in their last step. + +We compare all approaches to the first (that is, naive) one. + +### Bitwise complement + +We implement all three approaches; for the second approach, we implement it both +in Haskell and in C, which is called via the FFI. This is to determine whether +any overheads are being introduced by GHC. + +We use inputs of the following lengths: + +* 1 +* 3 +* 7 +* 15 +* 31 +* 63 +* 127 +* 255 +* 511 +* 1023 +* 2047 + +We choose these values to disadvantage the second and third approaches as much +as possible, as values that are one less than a power of 2 in length would +require them to do the most work in their last step. + +We compare all approaches to the first (that is, naive) one. + +### Population count + +We implement all three approaches. We use inputs of the following lengths: + +* 1 +* 3 +* 7 +* 15 +* 31 +* 63 +* 127 +* 255 +* 511 +* 1023 +* 2047 + +We choose these values to disadvantage the second and third approaches as much +as possible, as values that are one less than a power of 2 in length would +require them to do the most work in their second-to-last step. 
+ +## Results + +Throughout, we run benchmarks (implemented with `tasty-bench`) with `--stdev=1 +--timeout=200` to ensure minimal interference and accurate readings, while +avoiding timeouts due to the increased time required to get accurate readings. + +### Bitwise binary operations + +The results of our benchmarks given the methodology we described are below. +Throughout, `zipWith` refers to the first approach, `packZipWith` to the second +approach, `chunkedZipWith (2 blocks)` to the third approach, and `chunkedZipWith +(3 blocks)` to the fourth approach. We also mark the C implementations of the +third and fourth approach. All multipliers are shows as multiples of the running +time of the first approach on the same length of data. + +``` +All + Bitwise AND + Bitwise AND, length 1 + zipWith: OK (0.86s) + 46.2 ns ± 864 ps, 254 B allocated, 0 B copied, 45 MB peak memory + packZipWith: OK (5.96s) + 177 ns ± 2.3 ns, 359 B allocated, 95 B copied, 52 MB peak memory, 3.83x + chunkedZipWith (2 blocks): OK (6.00s) + 178 ns ± 1.0 ns, 359 B allocated, 95 B copied, 52 MB peak memory, 3.85x + chunkedZipWith (2 blocks, C): OK (12.14s) + 180 ns ± 904 ps, 359 B allocated, 95 B copied, 52 MB peak memory, 3.89x + chunkedZipWith (3 blocks): OK (3.15s) + 184 ns ± 2.1 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 3.99x + chunkedZipWith (3 blocks, C): OK (0.78s) + 180 ns ± 3.5 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 3.90x + Bitwise AND, length 3 + zipWith: OK (1.85s) + 54.3 ns ± 530 ps, 350 B allocated, 0 B copied, 52 MB peak memory + packZipWith: OK (0.78s) + 181 ns ± 3.2 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 3.33x + chunkedZipWith (2 blocks): OK (6.28s) + 186 ns ± 1.3 ns, 359 B allocated, 95 B copied, 52 MB peak memory, 3.43x + chunkedZipWith (2 blocks, C): OK (0.79s) + 184 ns ± 3.5 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 3.39x + chunkedZipWith (3 blocks): OK (6.39s) + 189 ns ± 3.7 ns, 359 B allocated, 95 B copied, 52 MB peak memory, 
3.47x + chunkedZipWith (3 blocks, C): OK (100.97s) + 187 ns ± 3.1 ns, 359 B allocated, 95 B copied, 52 MB peak memory, 3.45x + Bitwise AND, length 7 + zipWith: OK (1.18s) + 68.2 ns ± 1.0 ns, 508 B allocated, 0 B copied, 52 MB peak memory + packZipWith: OK (0.78s) + 182 ns ± 3.2 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 2.67x + chunkedZipWith (2 blocks): OK (3.25s) + 191 ns ± 942 ps, 358 B allocated, 95 B copied, 52 MB peak memory, 2.80x + chunkedZipWith (2 blocks, C): OK (3.17s) + 188 ns ± 1.2 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 2.75x + chunkedZipWith (3 blocks): OK (3.27s) + 194 ns ± 966 ps, 358 B allocated, 95 B copied, 52 MB peak memory, 2.85x + chunkedZipWith (3 blocks, C): OK (3.28s) + 193 ns ± 2.4 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 2.83x + Bitwise AND, length 15 + zipWith: OK (0.91s) + 106 ns ± 1.7 ns, 844 B allocated, 0 B copied, 52 MB peak memory + packZipWith: OK (3.23s) + 191 ns ± 2.7 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 1.80x + chunkedZipWith (2 blocks): OK (1.68s) + 198 ns ± 3.4 ns, 353 B allocated, 94 B copied, 52 MB peak memory, 1.87x + chunkedZipWith (2 blocks, C): OK (1.69s) + 196 ns ± 2.6 ns, 353 B allocated, 94 B copied, 52 MB peak memory, 1.85x + chunkedZipWith (3 blocks): OK (13.48s) + 201 ns ± 1.2 ns, 359 B allocated, 95 B copied, 52 MB peak memory, 1.90x + chunkedZipWith (3 blocks, C): OK (3.40s) + 200 ns ± 1.2 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 1.88x + Bitwise AND, length 31 + zipWith: OK (1.69s) + 199 ns ± 1.9 ns, 1.5 KB allocated, 0 B copied, 52 MB peak memory + packZipWith: OK (6.10s) + 182 ns ± 1.4 ns, 359 B allocated, 95 B copied, 52 MB peak memory, 0.91x + chunkedZipWith (2 blocks): OK (3.15s) + 186 ns ± 2.0 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 0.93x + chunkedZipWith (2 blocks, C): OK (0.80s) + 184 ns ± 3.4 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.92x + chunkedZipWith (3 blocks): OK (6.55s) + 194 ns ± 1.6 ns, 359 B allocated, 95 
B copied, 52 MB peak memory, 0.97x + chunkedZipWith (3 blocks, C): OK (3.28s) + 194 ns ± 3.4 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 0.97x + Bitwise AND, length 63 + zipWith: OK (0.79s) + 371 ns ± 6.4 ns, 2.8 KB allocated, 0 B copied, 52 MB peak memory + packZipWith: OK (7.81s) + 233 ns ± 918 ps, 359 B allocated, 95 B copied, 52 MB peak memory, 0.63x + chunkedZipWith (2 blocks): OK (3.84s) + 228 ns ± 2.2 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 0.61x + chunkedZipWith (2 blocks, C): OK (0.93s) + 215 ns ± 3.2 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.58x + chunkedZipWith (3 blocks): OK (3.84s) + 226 ns ± 2.7 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 0.61x + chunkedZipWith (3 blocks, C): OK (0.96s) + 223 ns ± 3.5 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.60x + Bitwise AND, length 127 + zipWith: OK (1.43s) + 665 ns ± 11 ns, 5.4 KB allocated, 1 B copied, 52 MB peak memory + packZipWith: OK (0.91s) + 209 ns ± 3.8 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.31x + chunkedZipWith (2 blocks): OK (0.82s) + 181 ns ± 2.8 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.27x + chunkedZipWith (2 blocks, C): OK (52.40s) + 196 ns ± 2.6 ns, 359 B allocated, 95 B copied, 52 MB peak memory, 0.29x + chunkedZipWith (3 blocks): OK (0.87s) + 191 ns ± 3.0 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.29x + chunkedZipWith (3 blocks, C): OK (3.26s) + 190 ns ± 2.6 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 0.29x + Bitwise AND, length 255 + zipWith: OK (0.75s) + 1.38 μs ± 22 ns, 11 KB allocated, 3 B copied, 52 MB peak memory + packZipWith: OK (4.49s) + 263 ns ± 898 ps, 358 B allocated, 95 B copied, 52 MB peak memory, 0.19x + chunkedZipWith (2 blocks): OK (1.77s) + 204 ns ± 3.5 ns, 353 B allocated, 94 B copied, 52 MB peak memory, 0.15x + chunkedZipWith (2 blocks, C): OK (1.81s) + 208 ns ± 3.6 ns, 353 B allocated, 94 B copied, 52 MB peak memory, 0.15x + chunkedZipWith (3 blocks): OK (14.16s) + 210 ns 
± 940 ps, 359 B allocated, 95 B copied, 52 MB peak memory, 0.15x + chunkedZipWith (3 blocks, C): OK (3.41s) + 197 ns ± 2.1 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 0.14x + Bitwise AND, length 511 + zipWith: OK (1.42s) + 2.67 μs ± 43 ns, 21 KB allocated, 2 B copied, 52 MB peak memory + packZipWith: OK (1.48s) + 332 ns ± 3.1 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.12x + chunkedZipWith (2 blocks): OK (1.94s) + 219 ns ± 3.0 ns, 353 B allocated, 94 B copied, 52 MB peak memory, 0.08x + chunkedZipWith (2 blocks, C): OK (3.74s) + 217 ns ± 2.2 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 0.08x + chunkedZipWith (3 blocks): OK (1.04s) + 230 ns ± 2.7 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.09x + chunkedZipWith (3 blocks, C): OK (1.90s) + 214 ns ± 2.9 ns, 353 B allocated, 94 B copied, 52 MB peak memory, 0.08x + Bitwise AND, length 1023 + zipWith: OK (0.78s) + 5.68 μs ± 100 ns, 42 KB allocated, 55 B copied, 52 MB peak memory + packZipWith: OK (1.13s) + 499 ns ± 7.9 ns, 341 B allocated, 91 B copied, 52 MB peak memory, 0.09x + chunkedZipWith (2 blocks): OK (1.28s) + 282 ns ± 3.7 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.05x + chunkedZipWith (2 blocks, C): OK (1.29s) + 280 ns ± 3.9 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.05x + chunkedZipWith (3 blocks): OK (1.38s) + 297 ns ± 4.7 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.05x + chunkedZipWith (3 blocks, C): OK (1.29s) + 283 ns ± 3.2 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.05x + Bitwise AND, length 2047 + zipWith: OK (0.78s) + 11.2 μs ± 170 ns, 85 KB allocated, 88 B copied, 52 MB peak memory + packZipWith: OK (3.50s) + 798 ns ± 4.1 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.07x + chunkedZipWith (2 blocks): OK (1.76s) + 385 ns ± 3.5 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.03x + chunkedZipWith (2 blocks, C): OK (6.57s) + 379 ns ± 1.6 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 0.03x + 
chunkedZipWith (3 blocks): OK (1.88s) + 414 ns ± 4.8 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.04x + chunkedZipWith (3 blocks, C): OK (1.78s) + 395 ns ± 3.1 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.04x +``` + +We observe that, up to length 15, the first approach comes out ahead, especially +on smaller inputs. However, at around length 30, a 'phase transition' occurs, +where the other approaches win out, with this becoming increasingly apparent as +we get to the limits of our sizes. In particular, in the middle of the size +range (between 63 and 511 bytes inclusive), other approaches beat out the naive +one by a factor of between 2 and 10, which is not insignificant. We also note +that the first approach allocates substantially more than the others, likely due +to lists it cannot fuse away; all other approaches have fixed allocations. + +It is not clear from the above whether the second, third or fourth approaches +are better in general; to this end, we ran only these in isolation, comparing +the third and fourth approaches to the second: + +``` +All + Packed bitwise AND + Packed bitwise AND, length 31 + packZipWith: OK (3.34s) + 390 ns ± 2.8 ns, 3.0 KB allocated, 0 B copied, 45 MB peak memory + chunkedZipWith (2 blocks): OK (6.21s) + 184 ns ± 2.7 ns, 359 B allocated, 95 B copied, 52 MB peak memory, 0.47x + chunkedZipWith (2 blocks, C): OK (1.57s) + 184 ns ± 1.5 ns, 427 B allocated, 95 B copied, 52 MB peak memory, 0.47x + chunkedZipWith (3 blocks): OK (3.20s) + 189 ns ± 1.7 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 0.48x + chunkedZipWith (3 blocks, C): OK (1.62s) + 190 ns ± 2.8 ns, 427 B allocated, 95 B copied, 52 MB peak memory, 0.49x + Packed bitwise AND, length 63 + packZipWith: OK (0.74s) + 679 ns ± 12 ns, 5.8 KB allocated, 0 B copied, 52 MB peak memory + chunkedZipWith (2 blocks): OK (6.91s) + 208 ns ± 2.2 ns, 359 B allocated, 95 B copied, 52 MB peak memory, 0.31x + chunkedZipWith (2 blocks, C): OK (3.93s) + 236 ns ± 1.3 
ns, 430 B allocated, 95 B copied, 52 MB peak memory, 0.35x + chunkedZipWith (3 blocks): OK (8.05s) + 239 ns ± 796 ps, 359 B allocated, 95 B copied, 52 MB peak memory, 0.35x + chunkedZipWith (3 blocks, C): OK (8.10s) + 242 ns ± 2.2 ns, 431 B allocated, 95 B copied, 52 MB peak memory, 0.36x + Packed bitwise AND, length 127 + packZipWith: OK (0.73s) + 1.40 μs ± 22 ns, 11 KB allocated, 1 B copied, 52 MB peak memory + chunkedZipWith (2 blocks): OK (50.76s) + 188 ns ± 2.6 ns, 359 B allocated, 95 B copied, 52 MB peak memory, 0.13x + chunkedZipWith (2 blocks, C): OK (1.67s) + 191 ns ± 2.4 ns, 427 B allocated, 95 B copied, 52 MB peak memory, 0.14x + chunkedZipWith (3 blocks): OK (0.84s) + 193 ns ± 3.5 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.14x + chunkedZipWith (3 blocks, C): OK (0.84s) + 191 ns ± 3.1 ns, 424 B allocated, 94 B copied, 52 MB peak memory, 0.14x + Packed bitwise AND, length 255 + packZipWith: OK (2.85s) + 2.69 μs ± 22 ns, 23 KB allocated, 7 B copied, 52 MB peak memory + chunkedZipWith (2 blocks): OK (6.91s) + 204 ns ± 1.7 ns, 359 B allocated, 95 B copied, 52 MB peak memory, 0.08x + chunkedZipWith (2 blocks, C): OK (0.87s) + 199 ns ± 2.9 ns, 424 B allocated, 94 B copied, 52 MB peak memory, 0.07x + chunkedZipWith (3 blocks): OK (3.61s) + 213 ns ± 1.4 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 0.08x + chunkedZipWith (3 blocks, C): OK (1.73s) + 201 ns ± 2.5 ns, 427 B allocated, 95 B copied, 52 MB peak memory, 0.07x + Packed bitwise AND, length 511 + packZipWith: OK (23.36s) + 5.60 μs ± 89 ns, 45 KB allocated, 11 B copied, 52 MB peak memory + chunkedZipWith (2 blocks): OK (1.92s) + 221 ns ± 4.2 ns, 353 B allocated, 94 B copied, 52 MB peak memory, 0.04x + chunkedZipWith (2 blocks, C): OK (14.52s) + 215 ns ± 3.8 ns, 431 B allocated, 95 B copied, 52 MB peak memory, 0.04x + chunkedZipWith (3 blocks): OK (2.05s) + 233 ns ± 4.3 ns, 353 B allocated, 94 B copied, 52 MB peak memory, 0.04x + chunkedZipWith (3 blocks, C): OK (7.27s) + 214 ns ± 1.5 ns, 
431 B allocated, 95 B copied, 52 MB peak memory, 0.04x + Packed bitwise AND, length 1023 + packZipWith: OK (0.74s) + 11.2 μs ± 170 ns, 90 KB allocated, 30 B copied, 52 MB peak memory + chunkedZipWith (2 blocks): OK (4.85s) + 282 ns ± 1.8 ns, 358 B allocated, 95 B copied, 52 MB peak memory, 0.03x + chunkedZipWith (2 blocks, C): OK (4.84s) + 280 ns ± 752 ps, 430 B allocated, 95 B copied, 52 MB peak memory, 0.02x + chunkedZipWith (3 blocks): OK (1.36s) + 305 ns ± 3.0 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.03x + chunkedZipWith (3 blocks, C): OK (2.49s) + 283 ns ± 3.3 ns, 427 B allocated, 95 B copied, 52 MB peak memory, 0.03x + Packed bitwise AND, length 2047 + packZipWith: OK (2.95s) + 22.3 μs ± 350 ns, 181 KB allocated, 114 B copied, 52 MB peak memory + chunkedZipWith (2 blocks): OK (1.79s) + 394 ns ± 2.7 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.02x + chunkedZipWith (2 blocks, C): OK (3.43s) + 390 ns ± 6.4 ns, 427 B allocated, 95 B copied, 52 MB peak memory, 0.02x + chunkedZipWith (3 blocks): OK (1.87s) + 410 ns ± 5.8 ns, 347 B allocated, 92 B copied, 52 MB peak memory, 0.02x + chunkedZipWith (3 blocks, C): OK (0.94s) + 392 ns ± 5.5 ns, 405 B allocated, 90 B copied, 52 MB peak memory, 0.02x +``` + +We observe that the third and fourth approaches beat out the second by a factor +of at least 2, with said factor increasing to ~50 towards the largest inputs. +However, there doesn't appear to be much difference between the third and fourth +approaches. Additionally, the C-implemented versions do not out-perform their +Haskell equivalents by a worthwhile margin, while marginally increasing +allocations. + +### Bitwise complement + +The results of our benchmarks given the methodology we described are below. +Throughout, `map` refers to the first approach, `chunkedMap (2 blocks)` refers +to the second approach, and `chunkMap (3 blocks)` refers to the third approach. +We also mark the C implementation of the second approach. 
All multipliers are +shown as multiples of the running time of the first approach on the same length +of data. + +``` +All + Bitwise complement + Bitwise complement, length 1 + map: OK (26.39s) + 24.0 ns ± 464 ps, 111 B allocated, 0 B copied, 45 MB peak memory + chunkedMap (2 blocks): OK (26.34s) + 199 ns ± 3.7 ns, 343 B allocated, 95 B copied, 53 MB peak memory, 8.31x + chunkedMap (2 blocks, C): OK (3.47s) + 205 ns ± 1.4 ns, 341 B allocated, 95 B copied, 53 MB peak memory, 8.56x + chunkMap (3 blocks): OK (1.77s) + 206 ns ± 4.1 ns, 339 B allocated, 94 B copied, 53 MB peak memory, 8.60x + Bitwise complement, length 3 + map: OK (0.97s) + 28.0 ns ± 334 ps, 119 B allocated, 0 B copied, 53 MB peak memory + chunkedMap (2 blocks): OK (3.53s) + 209 ns ± 1.4 ns, 341 B allocated, 95 B copied, 53 MB peak memory, 7.46x + chunkedMap (2 blocks, C): OK (3.54s) + 210 ns ± 2.9 ns, 341 B allocated, 95 B copied, 53 MB peak memory, 7.47x + chunkMap (3 blocks): OK (3.62s) + 214 ns ± 2.5 ns, 341 B allocated, 95 B copied, 53 MB peak memory, 7.62x + Bitwise complement, length 7 + map: OK (1.03s) + 29.8 ns ± 410 ps, 119 B allocated, 0 B copied, 53 MB peak memory + chunkedMap (2 blocks): OK (1.76s) + 207 ns ± 1.5 ns, 339 B allocated, 94 B copied, 53 MB peak memory, 6.93x + chunkedMap (2 blocks, C): OK (7.16s) + 213 ns ± 1.2 ns, 343 B allocated, 95 B copied, 53 MB peak memory, 7.14x + chunkMap (3 blocks): OK (3.61s) + 214 ns ± 4.0 ns, 341 B allocated, 95 B copied, 53 MB peak memory, 7.18x + Bitwise complement, length 15 + map: OK (2.26s) + 33.4 ns ± 222 ps, 127 B allocated, 0 B copied, 53 MB peak memory + chunkedMap (2 blocks): OK (3.61s) + 214 ns ± 1.6 ns, 341 B allocated, 95 B copied, 53 MB peak memory, 6.40x + chunkedMap (2 blocks, C): OK (7.34s) + 219 ns ± 2.7 ns, 343 B allocated, 95 B copied, 53 MB peak memory, 6.54x + chunkMap (3 blocks): OK (123.63s) + 234 ns ± 12 ns, 343 B allocated, 96 B copied, 53 MB peak memory, 7.01x + Bitwise complement, length 31 + map: OK (1.36s) + 40.3 ns ± 
548 ps, 143 B allocated, 0 B copied, 53 MB peak memory + chunkedMap (2 blocks): OK (122.95s) + 232 ns ± 806 ps, 343 B allocated, 96 B copied, 53 MB peak memory, 5.75x + chunkedMap (2 blocks, C): OK (8.32s) + 247 ns ± 3.4 ns, 343 B allocated, 95 B copied, 53 MB peak memory, 6.13x + chunkMap (3 blocks): OK (2.08s) + 245 ns ± 4.3 ns, 339 B allocated, 94 B copied, 53 MB peak memory, 6.09x + Bitwise complement, length 63 + map: OK (0.94s) + 54.2 ns ± 748 ps, 173 B allocated, 0 B copied, 53 MB peak memory + chunkedMap (2 blocks): OK (144.17s) + 276 ns ± 6.8 ns, 343 B allocated, 96 B copied, 53 MB peak memory, 5.10x + chunkedMap (2 blocks, C): OK (1.17s) + 270 ns ± 4.2 ns, 330 B allocated, 92 B copied, 53 MB peak memory, 4.99x + chunkMap (3 blocks): OK (2.40s) + 281 ns ± 2.2 ns, 339 B allocated, 94 B copied, 53 MB peak memory, 5.19x + Bitwise complement, length 127 + map: OK (0.69s) + 78.7 ns ± 1.4 ns, 235 B allocated, 0 B copied, 53 MB peak memory + chunkedMap (2 blocks): OK (5.88s) + 174 ns ± 1.4 ns, 343 B allocated, 95 B copied, 53 MB peak memory, 2.21x + chunkedMap (2 blocks, C): OK (5.73s) + 169 ns ± 2.9 ns, 343 B allocated, 95 B copied, 53 MB peak memory, 2.15x + chunkMap (3 blocks): OK (2.89s) + 169 ns ± 2.0 ns, 341 B allocated, 95 B copied, 53 MB peak memory, 2.15x + Bitwise complement, length 255 + map: OK (0.96s) + 111 ns ± 1.4 ns, 362 B allocated, 0 B copied, 53 MB peak memory + chunkedMap (2 blocks): OK (1.55s) + 176 ns ± 2.1 ns, 339 B allocated, 94 B copied, 53 MB peak memory, 1.58x + chunkedMap (2 blocks, C): OK (2.90s) + 166 ns ± 2.5 ns, 341 B allocated, 95 B copied, 53 MB peak memory, 1.49x + chunkMap (3 blocks): OK (2.98s) + 173 ns ± 1.5 ns, 341 B allocated, 95 B copied, 53 MB peak memory, 1.56x + Bitwise complement, length 511 + map: OK (0.77s) + 175 ns ± 3.4 ns, 611 B allocated, 0 B copied, 53 MB peak memory + chunkedMap (2 blocks): OK (0.89s) + 195 ns ± 2.7 ns, 330 B allocated, 92 B copied, 53 MB peak memory, 1.11x + chunkedMap (2 blocks, C): OK 
(12.96s) + 191 ns ± 186 ps, 343 B allocated, 95 B copied, 53 MB peak memory, 1.09x + chunkMap (3 blocks): OK (1.70s) + 192 ns ± 3.7 ns, 339 B allocated, 94 B copied, 53 MB peak memory, 1.10x + Bitwise complement, length 1023 + map: OK (1.34s) + 311 ns ± 2.8 ns, 1.1 KB allocated, 0 B copied, 53 MB peak memory + chunkedMap (2 blocks): OK (2.19s) + 246 ns ± 2.0 ns, 339 B allocated, 94 B copied, 53 MB peak memory, 0.79x + chunkedMap (2 blocks, C): OK (8.14s) + 236 ns ± 2.8 ns, 343 B allocated, 95 B copied, 53 MB peak memory, 0.76x + chunkMap (3 blocks): OK (2.23s) + 248 ns ± 4.5 ns, 339 B allocated, 94 B copied, 53 MB peak memory, 0.80x + Bitwise complement, length 2047 + map: OK (0.66s) + 592 ns ± 11 ns, 2.1 KB allocated, 0 B copied, 53 MB peak memory + chunkedMap (2 blocks): OK (92.07s) + 342 ns ± 568 ps, 343 B allocated, 96 B copied, 53 MB peak memory, 0.58x + chunkedMap (2 blocks, C): OK (21.84s) + 318 ns ± 780 ps, 343 B allocated, 95 B copied, 53 MB peak memory, 0.54x + chunkMap (3 blocks): OK (6.18s) + 354 ns ± 6.3 ns, 341 B allocated, 95 B copied, 53 MB peak memory, 0.60x +``` + +These results show that until the input length becomes significantly long +(around 1KiB), the first approach is much better (as much as a factor of 8). We +do see some improvement at the upper end of our sizes, but this amounts to about +a factor of 2 at most. The C implementation of the second approach does not seem +to give significant speedups; the third approach appears slower than the second +for all tested sizes. 
+ +To establish where the 'phase transition' between the first and second +approaches happens, we ran further benchmarks, limiting our sizes to the space +between 511 and 1023 bytes: + +``` +All + Bitwise complement probe + Bitwise complement probe, length 511 + map: OK (4.99s) + 147 ns ± 2.2 ns, 621 B allocated, 0 B copied, 45 MB peak memory + chunkedMap (2 blocks): OK (1.64s) + 182 ns ± 2.5 ns, 339 B allocated, 94 B copied, 53 MB peak memory, 1.24x + chunkedMap (2 blocks, C): OK (1.61s) + 181 ns ± 1.3 ns, 339 B allocated, 94 B copied, 53 MB peak memory, 1.23x + chunkMap (3 blocks): OK (0.84s) + 183 ns ± 3.1 ns, 330 B allocated, 92 B copied, 53 MB peak memory, 1.25x + Bitwise complement probe, length 767 + map: OK (0.88s) + 205 ns ± 2.7 ns, 875 B allocated, 0 B copied, 53 MB peak memory + chunkedMap (2 blocks): OK (0.95s) + 206 ns ± 2.6 ns, 330 B allocated, 92 B copied, 53 MB peak memory, 1.00x + chunkedMap (2 blocks, C): OK (6.91s) + 202 ns ± 1.1 ns, 343 B allocated, 95 B copied, 53 MB peak memory, 0.99x + chunkMap (3 blocks): OK (1.85s) + 208 ns ± 3.1 ns, 339 B allocated, 94 B copied, 53 MB peak memory, 1.02x + Bitwise complement probe, length 1023 + map: OK (1.26s) + 295 ns ± 4.5 ns, 1.1 KB allocated, 0 B copied, 53 MB peak memory + chunkedMap (2 blocks): OK (63.98s) + 238 ns ± 292 ps, 343 B allocated, 95 B copied, 53 MB peak memory, 0.81x + chunkedMap (2 blocks, C): OK (7.89s) + 228 ns ± 3.2 ns, 343 B allocated, 95 B copied, 53 MB peak memory, 0.77x + chunkMap (3 blocks): OK (2.15s) + 242 ns ± 1.5 ns, 339 B allocated, 94 B copied, 53 MB peak memory, 0.82x +``` + +We note that at 767 bytes (exactly mid-way), the 'phase transition' has already +occurred. + +### Population count + +The results of our benchmark given the methodology we described are below. +Throughout, `foldl'` refers to the first approach, `chunkPopCount2` to the second +approach, and `chunkPopCount3` to the third approach. 
All multipliers are shown +as multiples of the running time of the first approach on the same length of +data. + +``` +All + Popcount + Popcount, length 1 + foldl': OK (49.30s) + 22.9 ns ± 348 ps, 39 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount2: OK (3.17s) + 23.3 ns ± 456 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 1.02x + chunkPopCount3: OK (0.81s) + 24.1 ns ± 330 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 1.05x + Popcount, length 3 + foldl': OK (0.92s) + 27.1 ns ± 372 ps, 39 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount2: OK (113.34s) + 26.7 ns ± 64 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 0.98x + chunkPopCount3: OK (0.91s) + 26.8 ns ± 372 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 0.99x + Popcount, length 7 + foldl': OK (4.05s) + 30.3 ns ± 226 ps, 39 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount2: OK (8.52s) + 31.9 ns ± 40 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 1.05x + chunkPopCount3: OK (1.05s) + 31.2 ns ± 322 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 1.03x + Popcount, length 15 + foldl': OK (0.69s) + 40.4 ns ± 682 ps, 39 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount2: OK (4.39s) + 32.8 ns ± 100 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 0.81x + chunkPopCount3: OK (0.57s) + 33.6 ns ± 638 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 0.83x + Popcount, length 31 + foldl': OK (2.08s) + 61.9 ns ± 420 ps, 39 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount2: OK (81.00s) + 37.8 ns ± 222 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 0.61x + chunkPopCount3: OK (0.64s) + 37.6 ns ± 688 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 0.61x + Popcount, length 63 + foldl': OK (0.83s) + 99.5 ns ± 1.8 ns, 38 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount2: OK (0.77s) + 45.2 ns ± 778 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 0.45x + chunkPopCount3: OK (0.75s) + 44.7 ns ± 666 ps, 39 B allocated, 0 B copied, 47 MB peak 
memory, 0.45x + Popcount, length 127 + foldl': OK (0.68s) + 161 ns ± 2.7 ns, 31 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount2: OK (1.09s) + 64.0 ns ± 650 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 0.40x + chunkPopCount3: OK (1.12s) + 66.7 ns ± 696 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 0.41x + Popcount, length 255 + foldl': OK (0.60s) + 287 ns ± 5.7 ns, 25 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount2: OK (0.89s) + 104 ns ± 1.7 ns, 38 B allocated, 0 B copied, 47 MB peak memory, 0.36x + chunkPopCount3: OK (0.86s) + 101 ns ± 1.6 ns, 38 B allocated, 0 B copied, 47 MB peak memory, 0.35x + Popcount, length 511 + foldl': OK (0.58s) + 538 ns ± 11 ns, 0 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount2: OK (0.77s) + 181 ns ± 2.9 ns, 31 B allocated, 0 B copied, 47 MB peak memory, 0.34x + chunkPopCount3: OK (0.72s) + 167 ns ± 2.8 ns, 31 B allocated, 0 B copied, 47 MB peak memory, 0.31x + Popcount, length 1023 + foldl': OK (1.11s) + 1.04 μs ± 11 ns, 0 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount2: OK (0.73s) + 341 ns ± 5.3 ns, 25 B allocated, 0 B copied, 47 MB peak memory, 0.33x + chunkPopCount3: OK (1.28s) + 301 ns ± 2.7 ns, 31 B allocated, 0 B copied, 47 MB peak memory, 0.29x + Popcount, length 2047 + foldl': OK (1.09s) + 2.05 μs ± 28 ns, 0 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount2: OK (0.69s) + 649 ns ± 12 ns, 0 B allocated, 0 B copied, 47 MB peak memory, 0.32x + chunkPopCount3: OK (1.20s) + 568 ns ± 6.7 ns, 25 B allocated, 0 B copied, 47 MB peak memory, 0.28x +``` + +We observe that, even for short inputs, the time required by the second and +third approach is not significantly worse than the first: the difference is at +most 5%, which at the scale being measured is barely distinct from noise. 
Once +the length reaches 15, there is about a 20% improvement in running time when +using the second and third approaches relative the first, and for lengths larger +than this, the increase only continues. Overall, the third approach does not +appear significantly better than the second until the input size reaches 511, +but isn't significantly worse at lengths above 15. To more clearly see the +difference, we also ran the same inputs, but compared the second approach to the +third: + +``` +All + Block popcount + Block popcount, length 1 + chunkPopCount2: OK (0.74s) + 20.5 ns ± 404 ps, 39 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount3: OK (3.12s) + 23.3 ns ± 312 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 1.14x + Block popcount, length 3 + chunkPopCount2: OK (3.31s) + 25.1 ns ± 134 ps, 39 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount3: OK (1.70s) + 25.4 ns ± 352 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 1.01x + Block popcount, length 7 + chunkPopCount2: OK (1.02s) + 30.2 ns ± 456 ps, 39 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount3: OK (1.08s) + 32.2 ns ± 432 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 1.07x + Block popcount, length 15 + chunkPopCount2: OK (1.09s) + 32.2 ns ± 366 ps, 39 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount3: OK (1.06s) + 31.1 ns ± 400 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 0.97x + Block popcount, length 31 + chunkPopCount2: OK (0.60s) + 35.2 ns ± 684 ps, 39 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount3: OK (0.61s) + 36.3 ns ± 642 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 1.03x + Block popcount, length 63 + chunkPopCount2: OK (0.76s) + 45.0 ns ± 772 ps, 39 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount3: OK (2.96s) + 44.1 ns ± 174 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 0.98x + Block popcount, length 127 + chunkPopCount2: OK (1.05s) + 62.0 ns ± 740 ps, 39 B allocated, 0 B copied, 47 MB peak memory + 
chunkPopCount3: OK (1.00s) + 59.3 ns ± 986 ps, 39 B allocated, 0 B copied, 47 MB peak memory, 0.96x + Block popcount, length 255 + chunkPopCount2: OK (0.85s) + 100 ns ± 1.3 ns, 38 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount3: OK (0.80s) + 93.4 ns ± 1.3 ns, 38 B allocated, 0 B copied, 47 MB peak memory, 0.93x + Block popcount, length 511 + chunkPopCount2: OK (0.75s) + 175 ns ± 2.7 ns, 31 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount3: OK (0.68s) + 160 ns ± 2.7 ns, 31 B allocated, 0 B copied, 47 MB peak memory, 0.91x + Block popcount, length 1023 + chunkPopCount2: OK (0.70s) + 330 ns ± 5.2 ns, 25 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount3: OK (0.63s) + 294 ns ± 5.5 ns, 25 B allocated, 0 B copied, 47 MB peak memory, 0.89x + Block popcount, length 2047 + chunkPopCount2: OK (1.32s) + 624 ns ± 5.6 ns, 25 B allocated, 0 B copied, 47 MB peak memory + chunkPopCount3: OK (0.60s) + 562 ns ± 10 ns, 0 B allocated, 0 B copied, 47 MB peak memory, 0.90x +``` + +We can see that the benefits of the third approach versus the second amount to +10% better performance at most, and even that only occurs at length 511 and +higher. At the same time, for lengths below 15, there can be up to a 15% penalty +for using the third approach over the second. + +## Conclusion and recommendations + +We do not believe the use of the FFI and implementing any operations in +(portable C) to be worthwhile; there appear to be no significant gains in speed, +and GHC appears able to generate code competitive with the C compiler. + +For bitwise binary operations, we recommend a 'hybrid' approach: for smaller +input lengths (less than 30 items), we use the first (naive) approach, while for +anything larger, we use the third approach (with `Word64`-width +bit-parallelism). This would give us good performance on both small and large +inputs, and would not require a significant overhead, as we have to verify that +the lengths of our inputs match anyway. 
+ +For bitwise complement, we also recommend a 'hybrid' approach: for input lengths +less than 760 bytes, we use the first (naive) approach, while for anything +larger, we use the second approach (with `Word64`-width bit-parallelism). While +this would require some overhead for a length check, we believe that it's +worthwhile, as the length of an input `ByteString` is statically known. However, +if we consider inputs of this length unlikely relative the extra code path, +using the first approach in all cases is acceptable; however, we don't believe +that the extra code represents significant maintenance or runtime overheads, and +while inputs of this size would be unlikely, they're not impossible. + +Popcount should be implemented using the second (`Word64`-width bit-parallel) +approach only. The first (naive) approach is not significantly better at any +length, and the third (`Word256`-width bit parallel) approach only out-performs +the second by a small margin for large inputs. While a 'hybrid' approach for +this operation may be possible in theory, the benefits relative the extra code +and its maintenance don't appear worthwhile. + +## Future work + +Many, if not most, of the operations here can be significantly accelerated using +[SIMD +instructions](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data). +This is because many of the operations are bit-parallel or monoidal (bitwise +binary operations and population counting) while others can benefit from +instruction-level parallelism and wider words, as well as specialized +instructions. Given that this code will be run on Cardano nodes, which are +likely to be x86-64 machines with recent instruction sets, the gains are +potentially significant; at the same time, this would require significant extra +build-time checks, as well as fallbacks when said instructions are not +available. 
diff --git a/plutus-core/plutus-core/bench/bitwise/Benches/Binary.hs b/plutus-core/plutus-core/bench/bitwise/Benches/Binary.hs new file mode 100644 index 00000000000..9150ee42436 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/Benches/Binary.hs @@ -0,0 +1,88 @@ +module Benches.Binary ( + benches + ) where + +import Control.Monad (guard) +import Data.Bits ((.&.)) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.ByteString.Internal (fromForeignPtr, mallocByteString) +import Data.ByteString.Unsafe (unsafeUseAsCStringLen) +import Data.Foldable (for_) +import Data.Word (Word8) +import DataGen (mkBinaryArgs, noCleanup, sizes) +import Foreign.C.Types (CSize (CSize), CUChar) +import Foreign.ForeignPtr (castForeignPtr, withForeignPtr) +import Foreign.Ptr (Ptr, castPtr) +import Foreign.Storable (peekByteOff, pokeByteOff) +import GHC.Exts (fromList) +import System.IO.Unsafe (unsafeDupablePerformIO) +import Test.Tasty (withResource) +import Test.Tasty.Bench (Benchmark, bcompare, bench, bgroup, nfIO) + +benches :: Benchmark +benches = bgroup "Basic AND" $ benchBasic "Basic AND" <$> sizes + +-- Helpers + +-- Benchmark a naive Haskell implementation against a clone of packZipWith and a +-- naive C one. +benchBasic :: + String -> + Int -> + Benchmark +benchBasic mainLabel len = + withResource (mkBinaryArgs len) noCleanup $ \xs -> + let naiveLabel = "zipWith" + packedLabel = "packedZipWith" + cnaiveLabel = "naive C" + testLabel = mainLabel <> ", length " <> show len + matchLabel = "$NF == \"" <> naiveLabel <> "\" && $(NF - 1) == \"" <> testLabel <> "\"" in + bgroup testLabel [ + bench naiveLabel . nfIO $ wrap usingZW <$> xs, + bcompare matchLabel . bench packedLabel . nfIO $ wrap usingPZW <$> xs, + bcompare matchLabel . bench cnaiveLabel . 
nfIO $ wrap usingCNaive <$> xs + ] + +-- Saves repeatedly doing the same thing +wrap :: + (ByteString -> ByteString -> ByteString) -> + (ByteString, ByteString) -> + Maybe ByteString +wrap f (bs1, bs2) = do + guard (BS.length bs2 == len) + pure . f bs1 $ bs2 + where + len :: Int + len = BS.length bs1 + +usingZW :: ByteString -> ByteString -> ByteString +usingZW bs = fromList . BS.zipWith (.&.) bs + +usingPZW :: ByteString -> ByteString -> ByteString +usingPZW bs1 bs2 = unsafeDupablePerformIO . + unsafeUseAsCStringLen bs1 $ \(ptr1, len) -> + unsafeUseAsCStringLen bs2 $ \(ptr2, _) -> do + fp <- mallocByteString len + withForeignPtr fp $ \dst -> for_ [0 .. len - 1] $ \i -> do + b1 :: Word8 <- peekByteOff ptr1 i + b2 <- peekByteOff ptr2 i + pokeByteOff dst i $ b1 .&. b2 + pure . fromForeignPtr (castForeignPtr fp) 0 $ len + +usingCNaive :: ByteString -> ByteString -> ByteString +usingCNaive bs1 bs2 = unsafeDupablePerformIO . + unsafeUseAsCStringLen bs1 $ \(ptr1, len) -> + unsafeUseAsCStringLen bs2 $ \(ptr2, _) -> do + fp <- mallocByteString len + withForeignPtr fp $ \dst -> + cAndNaive dst (castPtr ptr1) (castPtr ptr2) (fromIntegral len) + pure . 
fromForeignPtr (castForeignPtr fp) 0 $ len + +foreign import ccall unsafe "cbits.h c_and_naive" + cAndNaive :: + Ptr CUChar -> + Ptr CUChar -> + Ptr CUChar -> + CSize -> + IO () diff --git a/plutus-core/plutus-core/bench/bitwise/Benches/BitRead.hs b/plutus-core/plutus-core/bench/bitwise/Benches/BitRead.hs new file mode 100644 index 00000000000..19a576d83f3 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/Benches/BitRead.hs @@ -0,0 +1,63 @@ +module Benches.BitRead ( + benches + ) where + +import Control.Monad (guard) +import Data.Bits (shiftR, testBit) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.ByteString.Unsafe (unsafeUseAsCStringLen) +import DataGen (mkUnaryArg, noCleanup, sizes) +import Foreign.C.Types (CBool (CBool), CSize (CSize), CUChar) +import Foreign.Ptr (Ptr, castPtr) +import System.IO.Unsafe (unsafeDupablePerformIO) +import Test.Tasty (withResource) +import Test.Tasty.Bench (Benchmark, bcompare, bench, bgroup, nfIO) + +benches :: Benchmark +benches = bgroup "Basic bit read" $ benchBasic "Basic bit read" <$> sizes + +-- Helpers + +benchBasic :: + String -> + Int -> + Benchmark +benchBasic mainLabel len = + withResource (mkUnaryArg len) noCleanup $ \xs -> + let naiveLabel = "ByteString ops" + cLabel = "C" + testLabel = mainLabel <> ", length " <> show len + matchLabel = "$NF == \"" <> naiveLabel <> "\" && $(NF - 1) == \"" <> testLabel <> "\"" in + bgroup testLabel [ + bench naiveLabel . nfIO $ bitAt (len - 1) <$> xs, + bcompare matchLabel . bench cLabel . nfIO $ wrapper (len - 1) <$> xs + ] + +bitAt :: Int -> ByteString -> Maybe Bool +bitAt ix bs = do + guard (ix >= 0) + guard (ix < bitLength) + let (bigIx, smallIx) = ix `quotRem` 8 + let byte = BS.index bs bigIx + pure . 
testBit byte $ 7 - smallIx + where + bitLength :: Int + bitLength = BS.length bs * 8 + +wrapper :: Int -> ByteString -> Maybe Bool +wrapper ix bs = do + guard (ix >= 0) + guard (ix < bitLength) + let CBool res = unsafeDupablePerformIO . unsafeUseAsCStringLen bs $ \(ptr, _) -> + pure . cBitAt (fromIntegral ix) . castPtr $ ptr + pure $ res /= 0 + where + bitLength :: Int + bitLength = BS.length bs * 8 + +foreign import ccall unsafe "cbits.h c_bit_at" + cBitAt :: + CSize -> + Ptr CUChar -> + CBool diff --git a/plutus-core/plutus-core/bench/bitwise/Benches/BitWrite.hs b/plutus-core/plutus-core/bench/bitwise/Benches/BitWrite.hs new file mode 100644 index 00000000000..5d2180ac5d5 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/Benches/BitWrite.hs @@ -0,0 +1,99 @@ +module Benches.BitWrite ( + benches + ) where + +import Control.Monad (guard) +import Data.Bits (clearBit, setBit, shiftR) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.ByteString.Internal (fromForeignPtr, mallocByteString) +import Data.ByteString.Unsafe (unsafeUseAsCStringLen) +import Data.Word (Word8) +import DataGen (mkUnaryArg, noCleanup, sizes) +import Foreign.C.Types (CBool (CBool), CSize (CSize), CUChar) +import Foreign.ForeignPtr (castForeignPtr, withForeignPtr) +import Foreign.Ptr (Ptr, castPtr) +import GHC.Exts (fromList, toList) +import System.IO.Unsafe (unsafeDupablePerformIO) +import Test.Tasty (withResource) +import Test.Tasty.Bench (Benchmark, bcompare, bench, bgroup, nfIO) + +benches :: Benchmark +benches = bgroup "Worst-case bit write" $ benchBasic "Worst-case bit write" <$> sizes + +-- Helpers + +benchBasic :: + String -> + Int -> + Benchmark +benchBasic mainLabel len = + withResource (mkUnaryArg len) noCleanup $ \xs -> + let naiveLabel = "ByteString ops" + cnaiveLabel = "Naive C" + cmemcpyLabel = "Memcpy C" + testLabel = mainLabel <> ", length " <> show len + matchLabel = "$NF == \"" <> naiveLabel <> "\" && $(NF - 1) == \"" <> 
testLabel <> "\"" in + bgroup testLabel [ + bench naiveLabel . nfIO $ bitSet False (len - 1) <$> xs, + bcompare matchLabel . bench cnaiveLabel . nfIO + $ wrapper cBitSetNaive False (len - 1) <$> xs, + bcompare matchLabel . bench cmemcpyLabel . nfIO + $ wrapper cBitSetMemcpy False (len - 1) <$> xs + ] + +bitSet :: Bool -> Int -> ByteString -> Maybe ByteString +bitSet b ix bs = do + guard (ix >= 0) + guard (ix < bitLength) + pure . fromList . fmap (uncurry go) . zip [0 ..] . toList $ bs + where + go :: Int -> Word8 -> Word8 + go candidateIx w8 + | candidateIx /= bigIx = w8 + | b = setBit w8 $ 7 - smallIx + | otherwise = clearBit w8 $ 7 - smallIx + bitLength :: Int + bitLength = BS.length bs * 8 + bigIx :: Int + bigIx = ix `quot` 8 + smallIx :: Int + smallIx = ix `rem` 8 + +wrapper :: + (CBool -> CSize -> Ptr CUChar -> Ptr CUChar -> CSize -> IO ()) -> + Bool -> + Int -> + ByteString -> + Maybe ByteString +wrapper f b ix bs = do + guard (ix >= 0) + guard (ix < bitLength) + pure . unsafeDupablePerformIO . unsafeUseAsCStringLen bs $ \(src, len) -> do + fp <- mallocByteString len + withForeignPtr fp $ \dst -> + if b + then f (CBool 1) (fromIntegral ix) dst (castPtr src) . fromIntegral $ len + else f (CBool 0) (fromIntegral ix) dst (castPtr src) . fromIntegral $ len + pure . 
fromForeignPtr (castForeignPtr fp) 0 $ len + where + bitLength :: Int + bitLength = BS.length bs * 8 + +foreign import ccall unsafe "cbits.h c_bit_set_naive" + cBitSetNaive :: + CBool -> + CSize -> + Ptr CUChar -> + Ptr CUChar -> + CSize -> + IO () + +foreign import ccall unsafe "cbits.h c_bit_set_memcpy" + cBitSetMemcpy :: + CBool -> + CSize -> + Ptr CUChar -> + Ptr CUChar -> + CSize -> + IO () diff --git a/plutus-core/plutus-core/bench/bitwise/Benches/Complement.hs b/plutus-core/plutus-core/bench/bitwise/Benches/Complement.hs new file mode 100644 index 00000000000..edc26420ef4 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/Benches/Complement.hs @@ -0,0 +1,51 @@ +-- editorconfig-checker-disable-file +module Benches.Complement ( + benches + ) where + +import Data.Bits (complement) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.ByteString.Internal (fromForeignPtr, mallocByteString) +import Data.ByteString.Unsafe (unsafeUseAsCStringLen) +import DataGen (mkUnaryArg, noCleanup, sizes) +import Foreign.C.Types (CSize (CSize), CUChar) +import Foreign.ForeignPtr (castForeignPtr, withForeignPtr) +import Foreign.Ptr (Ptr, castPtr) +import Test.Tasty (withResource) +import Test.Tasty.Bench (Benchmark, bcompare, bench, bgroup, nfIO) + +benches :: Benchmark +benches = bgroup "Basic complement" $ benchBasic "Basic complement" <$> sizes + +-- Helpers + +-- Benchmark a naive Haskell implementation against the C one +benchBasic :: + String -> + Int -> + Benchmark +benchBasic mainLabel len = + withResource (mkUnaryArg len) noCleanup $ \xs -> + let naiveLabel = "map" + cnaiveLabel = "naive C" + testLabel = mainLabel <> ", length " <> show len + matchLabel = "$NF == \"" <> naiveLabel <> "\" && $(NF - 1) == \"" <> testLabel <> "\"" in + bgroup testLabel [ + bench naiveLabel . nfIO $ BS.map complement <$> xs, + bcompare matchLabel . bench cnaiveLabel . 
nfIO $ xs >>= wrapping cComplementNaive + ] + +-- Avoids having to rewrap C complement ops tediously each time +wrapping :: (Ptr CUChar -> Ptr CUChar -> CSize -> IO ()) -> ByteString -> IO ByteString +wrapping f bs = unsafeUseAsCStringLen bs $ \(ptr, len) -> do + fp <- mallocByteString len + withForeignPtr fp $ \dst -> f dst (castPtr ptr) (fromIntegral len) + pure . fromForeignPtr (castForeignPtr fp) 0 $ len + +foreign import ccall unsafe "cbits.h c_complement_naive" + cComplementNaive :: + Ptr CUChar -> + Ptr CUChar -> + CSize -> + IO () diff --git a/plutus-core/plutus-core/bench/bitwise/Benches/Convert.hs b/plutus-core/plutus-core/bench/bitwise/Benches/Convert.hs new file mode 100644 index 00000000000..711fa92e18e --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/Benches/Convert.hs @@ -0,0 +1,164 @@ +{-# LANGUAGE BangPatterns #-} + +module Benches.Convert ( + benchesBSToI, + benchesIToBS, + ) where + +import Control.Monad (guard) +import Data.Bits (unsafeShiftL, zeroBits) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.ByteString.Unsafe (unsafeUseAsCStringLen) +import Data.Word (Word16, Word32, Word64, Word8) +import DataGen (mkInteger, mkUnaryArg, noCleanup, sizes) +import Foreign.C.Types (CChar) +import Foreign.Ptr (Ptr) +import Foreign.Storable (peekByteOff) +import GHC.Exts (fromList) +import System.IO.Unsafe (unsafeDupablePerformIO) +import Test.Tasty (withResource) +import Test.Tasty.Bench (Benchmark, bcompare, bench, bgroup, nfIO) + +benchesBSToI :: Benchmark +benchesBSToI = bgroup "Basic ByteString to Integer conversion" $ + benchBSToI "Basic ByteString to Integer conversion" <$> sizes + +benchesIToBS :: Benchmark +benchesIToBS = bgroup "Basic Integer to ByteString conversion" $ + benchIToBS "Basic Integer to ByteString conversion" <$> sizes + +-- Helpers + +benchBSToI :: + String -> + Int -> + Benchmark +benchBSToI mainLabel len = + withResource (mkUnaryArg len) noCleanup $ \xs -> + let naiveLabel = "scan 
backwards" + forwardsLabel = "scan forwards" + shiftLabel = "scan backwards with shifts" + blockLabel = "scan backwards in blocks with shifts" + forwardsShiftLabel = "scan forwards with shifts" + testLabel = mainLabel <> ", length " <> show len + matchLabel = "$NF == \"" <> naiveLabel <> "\" && $(NF - 1) == \"" <> testLabel <> "\"" in + bgroup testLabel [ + bench naiveLabel . nfIO $ bsToI <$> xs, + bcompare matchLabel . bench forwardsLabel . nfIO $ bsToIForward <$> xs, + bcompare matchLabel . bench shiftLabel . nfIO $ bsToIShift <$> xs, + bcompare matchLabel . bench forwardsShiftLabel . nfIO $ bsToIShiftForward <$> xs, + bcompare matchLabel . bench blockLabel . nfIO $ bsToIShiftBlock <$> xs + ] + +benchIToBS :: + String -> + Int -> + Benchmark +benchIToBS mainLabel len = + withResource (mkInteger len) noCleanup $ \i -> + let naiveLabel = "naive" + testLabel = mainLabel <> ", length " <> show len in + bgroup testLabel [ + bench naiveLabel . nfIO $ iToBS <$> i + ] + +-- Implementations + +iToBS :: Integer -> Maybe ByteString +iToBS i = case signum i of + (-1) -> Nothing + 0 -> pure . BS.singleton $ zeroBits + _ -> pure $ if i < 256 + then BS.singleton . fromIntegral $ i + else fromList . go [] $ i + where + go :: [Word8] -> Integer -> [Word8] + go acc !j = case j `quotRem` 256 of + (0, r) -> fromIntegral r : acc -- we're done + (d, r) -> go (fromIntegral r : acc) d + +bsToI :: ByteString -> Maybe Integer +bsToI bs = do + guard (len > 0) + pure . go 0 1 $ len - 1 + where + len :: Int + len = BS.length bs + go :: Integer -> Integer -> Int -> Integer + go !acc !mult !ix = let limb :: Integer = fromIntegral . BS.index bs $ ix + limbValue = limb * mult + acc' = acc + limbValue in + if ix == 0 + then acc' + else go acc' (mult * 256) $ ix - 1 + +bsToIForward :: ByteString -> Maybe Integer +bsToIForward bs = do + guard (len > 0) + pure . snd . 
BS.foldl' go (256 ^ (len - 1), 0) $ bs + where + len :: Int + len = BS.length bs + go :: (Integer, Integer) -> Word8 -> (Integer, Integer) + go (mult, acc) w8 = let limbValue = fromIntegral w8 * mult in + (mult `quot` 256, acc + limbValue) + +bsToIShift :: ByteString -> Maybe Integer +bsToIShift bs = do + guard (len > 0) + pure . go 0 0 $ len - 1 + where + len :: Int + len = BS.length bs + go :: Integer -> Int -> Int -> Integer + go !acc !shift !ix = let limb :: Integer = fromIntegral . BS.index bs $ ix + limbValue = limb `unsafeShiftL` shift + acc' = acc + limbValue in + if ix == 0 + then acc' + else go acc' (shift + 8) $ ix - 1 + +bsToIShiftBlock :: ByteString -> Maybe Integer +bsToIShiftBlock bs = do + guard (len > 0) + pure . unsafeDupablePerformIO . unsafeUseAsCStringLen bs $ \(p, _) -> do + go p 0 0 $ len - 1 + where + len :: Int + len = BS.length bs + go :: Ptr CChar -> Integer -> Int -> Int -> IO Integer + go p !acc !shift !ix + | ix >= 7 = do + w64 :: Word64 <- peekByteOff p (ix - 7) + let limb :: Integer = fromIntegral w64 + let limbValue = limb `unsafeShiftL` shift + go p (acc + limbValue) (shift + 64) $ ix - 8 + | ix >= 3 = do + w32 :: Word32 <- peekByteOff p (ix - 3) + let limb :: Integer = fromIntegral w32 + let limbValue = limb `unsafeShiftL` shift + go p (acc + limbValue) (shift + 32) $ ix - 4 + | ix >= 1 = do + w16 :: Word16 <- peekByteOff p (ix - 1) + let limb :: Integer = fromIntegral w16 + let limbValue = limb `unsafeShiftL` shift + go p (acc + limbValue) (shift + 16) $ ix - 2 + | ix == 0 = do + w8 :: Word8 <- peekByteOff p ix + let limb :: Integer = fromIntegral w8 + let limbValue = limb `unsafeShiftL` shift + pure $ acc + limbValue + | otherwise = pure acc + +bsToIShiftForward :: ByteString -> Maybe Integer +bsToIShiftForward bs = do + guard (len > 0) + pure . snd . 
BS.foldl' go ((len - 1) * 8, 0) $ bs + where + len :: Int + len = BS.length bs + go :: (Int, Integer) -> Word8 -> (Int, Integer) + go (shift, acc) w8 = let limbValue = fromIntegral w8 `unsafeShiftL` shift in + (shift - 8, acc + limbValue) + diff --git a/plutus-core/plutus-core/bench/bitwise/Benches/CountLeadingZeroes.hs b/plutus-core/plutus-core/bench/bitwise/Benches/CountLeadingZeroes.hs new file mode 100644 index 00000000000..bbbaec25a13 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/Benches/CountLeadingZeroes.hs @@ -0,0 +1,87 @@ +module Benches.CountLeadingZeroes ( + benches, + cBenches, + ) where + +import Data.Bits (countLeadingZeros, zeroBits) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.ByteString.Unsafe (unsafeUseAsCStringLen) +import Data.Maybe (fromMaybe) +import DataGen (mkZeroesOne, noCleanup, sizes) +import Foreign.C.Types (CSize (CSize), CUChar) +import Foreign.Ptr (Ptr, castPtr) +import Test.Tasty (withResource) +import Test.Tasty.Bench (Benchmark, bcompare, bench, bgroup, nfIO) + +benches :: Benchmark +benches = bgroup "Basic CLZ" $ benchBasic "CLZ" <$> sizes + +cBenches :: Benchmark +cBenches = bgroup "C CLZ" $ benchC "C CLZ" <$> sizes + +-- Helpers + +benchBasic :: + String -> + Int -> + Benchmark +benchBasic mainLabel len = + withResource (mkZeroesOne len) noCleanup $ \xs -> + let naiveLabel = "ByteString ops" + cnaiveLabel = "naive C" + cblockLabel = "block C" + cunrolledLabel = "unrolled C" + testLabel = mainLabel <> ", length " <> show len + matchLabel = "$NF == \"" <> naiveLabel <> "\" && $(NF - 1) == \"" <> testLabel <> "\"" in + bgroup testLabel [ + bench naiveLabel . nfIO $ naiveClz <$> xs, + bcompare matchLabel . bench cnaiveLabel . nfIO $ xs >>= wrapping cClzNaive, + bcompare matchLabel . bench cblockLabel . nfIO $ xs >>= wrapping cClzBlock, + bcompare matchLabel . bench cunrolledLabel . 
nfIO $ xs >>= wrapping cClzBlockUnrolled + ] + +benchC :: + String -> + Int -> + Benchmark +benchC mainLabel len = + withResource (mkZeroesOne len) noCleanup $ \xs -> + let cnaiveLabel = "naive C" + cblockLabel = "block C" + cunrolledLabel = "unrolled C" + testLabel = mainLabel <> ", length " <> show len + matchLabel = "$NF == \"" <> cnaiveLabel <> "\" && $(NF - 1) == \"" <> testLabel <> "\"" in + bgroup testLabel [ + bench cnaiveLabel . nfIO $ xs >>= wrapping cClzNaive, + bcompare matchLabel . bench cblockLabel . nfIO $ xs >>= wrapping cClzBlock, + bcompare matchLabel . bench cunrolledLabel . nfIO $ xs >>= wrapping cClzBlockUnrolled + ] + +naiveClz :: ByteString -> Int +naiveClz bs = fromMaybe (BS.length bs * 8) $ do + ix <- BS.findIndex (/= zeroBits) bs + pure $ ix * 8 + countLeadingZeros (BS.index bs ix) + +-- Avoids having to rewrap C ops tediously each time +wrapping :: (Ptr CUChar -> CSize -> CSize) -> ByteString -> IO Int +wrapping f bs = unsafeUseAsCStringLen bs $ \(ptr, len) -> + pure . fromIntegral . f (castPtr ptr) . 
fromIntegral $ len + +foreign import ccall unsafe "cbits.h c_clz_naive" + cClzNaive :: + Ptr CUChar -> + CSize -> + CSize + +foreign import ccall unsafe "cbits.h c_clz_block" + cClzBlock :: + Ptr CUChar -> + CSize -> + CSize + +foreign import ccall unsafe "cbits.h c_clz_block_unrolled" + cClzBlockUnrolled :: + Ptr CUChar -> + CSize -> + CSize diff --git a/plutus-core/plutus-core/bench/bitwise/Benches/Popcount.hs b/plutus-core/plutus-core/bench/bitwise/Benches/Popcount.hs new file mode 100644 index 00000000000..71572160ac5 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/Benches/Popcount.hs @@ -0,0 +1,85 @@ +-- editorconfig-checker-disable-file +module Benches.Popcount ( + benches, + cBenches + ) where + +import Data.Bits (popCount) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.ByteString.Unsafe (unsafeUseAsCStringLen) +import DataGen (mkUnaryArg, noCleanup, sizes) +import Foreign.C.Types (CSize (CSize), CUChar) +import Foreign.Ptr (Ptr, castPtr) +import Test.Tasty (withResource) +import Test.Tasty.Bench (Benchmark, bcompare, bench, bgroup, nfIO) + +benches :: Benchmark +benches = bgroup "Basic popcount" $ benchBasic "Basic popcount" <$> sizes + +cBenches :: Benchmark +cBenches = bgroup "C popcount" $ benchC "C popcount" <$> sizes + +-- Helpers + +-- Benchmark a naive Haskell implementation against all the C ones +benchBasic :: + String -> + Int -> + Benchmark +benchBasic mainLabel len = + withResource (mkUnaryArg len) noCleanup $ \xs -> + let naiveLabel = "foldl'" + cnaiveLabel = "naive C" + cblockLabel = "block C" + cblockUnrollLabel = "block unrolled C" + testLabel = mainLabel <> ", length " <> show len + matchLabel = "$NF == \"" <> naiveLabel <> "\" && $(NF - 1) == \"" <> testLabel <> "\"" in + bgroup testLabel [ + bench naiveLabel . nfIO $ BS.foldl' (\acc w8 -> acc + popCount w8) 0 <$> xs, + bcompare matchLabel . bench cnaiveLabel . nfIO $ xs >>= wrapping cPopcountNaive, + bcompare matchLabel . 
bench cblockLabel . nfIO $ xs >>= wrapping cPopcountBlock, + bcompare matchLabel . bench cblockUnrollLabel . nfIO $ xs >>= wrapping cPopcountBlockUnroll + ] + +-- Benchmark naive C against the other C ones +benchC :: + String -> + Int -> + Benchmark +benchC mainLabel len = + withResource (mkUnaryArg len) noCleanup $ \xs -> + let cnaiveLabel = "naive C" + cblockLabel = "block C" + cblockUnrollLabel = "block unrolled C" + testLabel = mainLabel <> ", length " <> show len + matchLabel = "$NF == \"" <> cnaiveLabel <> "\" && $(NF - 1) == \"" <> testLabel <> "\"" in + bgroup testLabel [ + bench cnaiveLabel . nfIO $ xs >>= wrapping cPopcountNaive, + bcompare matchLabel . bench cblockLabel . nfIO $ xs >>= wrapping cPopcountBlock, + bcompare matchLabel . bench cblockUnrollLabel . nfIO $ xs >>= wrapping cPopcountBlockUnroll + ] + + +-- Avoids having to rewrap C popcount ops tediously each time +wrapping :: (Ptr CUChar -> CSize -> CSize) -> ByteString -> IO CSize +wrapping f bs = unsafeUseAsCStringLen bs $ \(ptr, len) -> + pure $ f (castPtr ptr) (fromIntegral len) + +foreign import ccall unsafe "cbits.h c_popcount_naive" + cPopcountNaive :: + Ptr CUChar -> + CSize -> + CSize + +foreign import ccall unsafe "cbits.h c_popcount_block" + cPopcountBlock :: + Ptr CUChar -> + CSize -> + CSize + +foreign import ccall unsafe "cbits.h c_popcount_block_unroll" + cPopcountBlockUnroll :: + Ptr CUChar -> + CSize -> + CSize diff --git a/plutus-core/plutus-core/bench/bitwise/Benches/Rotate.hs b/plutus-core/plutus-core/bench/bitwise/Benches/Rotate.hs new file mode 100644 index 00000000000..5123fb1a01d --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/Benches/Rotate.hs @@ -0,0 +1,87 @@ +module Benches.Rotate ( + benches, + ) where + +import Data.Bits (bit, shiftR, testBit, zeroBits, (.|.)) +import Data.Bool (bool) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.ByteString.Internal (fromForeignPtr, mallocByteString) +import Data.ByteString.Unsafe 
(unsafeUseAsCStringLen) +import Data.Foldable (foldl') +import Data.Word (Word8) +import DataGen (mkUnaryArg, noCleanup, sizes) +import Foreign.C.Types (CInt (CInt), CSize (CSize), CUChar) +import Foreign.ForeignPtr (castForeignPtr, withForeignPtr) +import Foreign.Ptr (Ptr, castPtr) +import GHC.Exts (fromList) +import System.IO.Unsafe (unsafeDupablePerformIO) +import Test.Tasty (withResource) +import Test.Tasty.Bench (Benchmark, bcompare, bench, bgroup, nfIO) + +benches :: Benchmark +benches = bgroup "Basic bitwise rotate" $ benchBasic "Basic bitwise rotate" <$> sizes + +-- Helpers + +benchBasic :: + String -> + Int -> + Benchmark +benchBasic mainLabel len = + withResource (mkUnaryArg len) noCleanup $ \xs -> + let naiveLabel = "ByteString ops" + cLabel = "C" + testLabel = mainLabel <> ", length " <> show len + matchLabel = "$NF == \"" <> naiveLabel <> "\" && $(NF - 1) == \"" <> testLabel <> "\"" in + bgroup testLabel [ + bench naiveLabel . nfIO $ bitRotate (len * 4) <$> xs, + bcompare matchLabel . bench cLabel . nfIO $ bitRotateC (len * 4) <$> xs + ] + +bitRotate :: Int -> ByteString -> ByteString +bitRotate i bs + | bitLen == 0 = bs + | otherwise = case i `rem` bitLen of + 0 -> bs -- nothing to do + j -> fromList $ go j <$> [0 .. BS.length bs - 1] + where + bitLen :: Int + bitLen = BS.length bs * 8 + go :: Int -> Int -> Word8 + go j byteIx = let bitIxes = (\ix -> 8 * byteIx - j + ix) <$> [0 .. 7] + bits = bitAtWraparound bs <$> bitIxes + zipped = zip [7, 6 .. 0] bits in + foldl' (\acc (pos, b) -> acc .|. bool zeroBits (bit pos) b) zeroBits zipped + +bitRotateC :: Int -> ByteString -> ByteString +bitRotateC i bs + | bitLen == 0 = bs + | otherwise = case i `rem` bitLen of + 0 -> bs -- nothing to do + j -> unsafeDupablePerformIO . unsafeUseAsCStringLen bs $ \(src, len) -> do + fp <- mallocByteString len + withForeignPtr fp $ \dst -> + cRotateBits (fromIntegral j) dst (castPtr src) . fromIntegral $ len + pure . 
fromForeignPtr (castForeignPtr fp) 0 $ len + where + bitLen :: Int + bitLen = BS.length bs * 8 + +bitAtWraparound :: ByteString -> Int -> Bool +bitAtWraparound bs i + | i < 0 = bitAtWraparound bs (i + bitLength) + | otherwise = let (bigIx, smallIx) = i `quotRem` 8 + byte = BS.index bs bigIx in + testBit byte $ shiftR 0x80 smallIx + where + bitLength :: Int + bitLength = BS.length bs * 8 + +foreign import ccall unsafe "cbits.h c_rotate_bits" + cRotateBits :: + CInt -> + Ptr CUChar -> + Ptr CUChar -> + CSize -> + IO () diff --git a/plutus-core/plutus-core/bench/bitwise/Benches/Shift.hs b/plutus-core/plutus-core/bench/bitwise/Benches/Shift.hs new file mode 100644 index 00000000000..98b204fe697 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/Benches/Shift.hs @@ -0,0 +1,79 @@ +module Benches.Shift ( + benches, + ) where + +import Data.Bits (bit, shiftR, testBit, zeroBits, (.|.)) +import Data.Bool (bool) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.ByteString.Internal (fromForeignPtr, mallocByteString) +import Data.ByteString.Unsafe (unsafeUseAsCStringLen) +import Data.Foldable (foldl') +import Data.Word (Word8) +import DataGen (mkUnaryArg, noCleanup, sizes) +import Foreign.C.Types (CInt (CInt), CSize (CSize), CUChar) +import Foreign.ForeignPtr (castForeignPtr, withForeignPtr) +import Foreign.Ptr (Ptr, castPtr) +import GHC.Exts (fromList) +import System.IO.Unsafe (unsafeDupablePerformIO) +import Test.Tasty (withResource) +import Test.Tasty.Bench (Benchmark, bcompare, bench, bgroup, nfIO) + +benches :: Benchmark +benches = bgroup "Basic bitwise shift" $ benchBasic "Basic bitwise shift" <$> sizes + +-- Helpers + +benchBasic :: + String -> + Int -> + Benchmark +benchBasic mainLabel len = + withResource (mkUnaryArg len) noCleanup $ \xs -> + let naiveLabel = "ByteString ops" + cLabel = "C" + testLabel = mainLabel <> ", length " <> show len + matchLabel = "$NF == \"" <> naiveLabel <> "\" && $(NF - 1) == \"" <> testLabel <> 
"\"" in + bgroup testLabel [ + bench naiveLabel . nfIO $ bitShift (len * 4) <$> xs, + bcompare matchLabel . bench cLabel . nfIO $ bitShiftC (len * 4) <$> xs + ] + +bitShift :: Int -> ByteString -> ByteString +bitShift i bs = case signum i of + 0 -> bs + _ -> fromList $ go <$> [0 .. BS.length bs - 1] + where + go :: Int -> Word8 + go byteIx = let bitIxes = (\ix -> 8 * byteIx - i + ix) <$> [0 .. 7] + bits = bitAtClipping bs <$> bitIxes + zipped = zip [7, 6 .. 0] bits in + foldl' (\acc (pos, b) -> acc .|. bool zeroBits (bit pos) b) zeroBits zipped + +bitShiftC :: Int -> ByteString -> ByteString +bitShiftC i bs = case signum i of + 0 -> bs + _ -> unsafeDupablePerformIO . unsafeUseAsCStringLen bs $ \(src, len) -> do + fp <- mallocByteString len + withForeignPtr fp $ \dst -> + cShiftBits (fromIntegral i) dst (castPtr src) . fromIntegral $ len + pure . fromForeignPtr (castForeignPtr fp) 0 $ len + +bitAtClipping :: ByteString -> Int -> Bool +bitAtClipping bs i + | i < 0 = False + | i >= bitLength = False + | otherwise = let (bigIx, smallIx) = i `quotRem` 8 + byte = BS.index bs bigIx in + testBit byte $ shiftR 0x80 smallIx + where + bitLength :: Int + bitLength = BS.length bs * 8 + +foreign import ccall unsafe "cbits.h c_shift_bits" + cShiftBits :: + CInt -> + Ptr CUChar -> + Ptr CUChar -> + CSize -> + IO () diff --git a/plutus-core/plutus-core/bench/bitwise/DataGen.hs b/plutus-core/plutus-core/bench/bitwise/DataGen.hs new file mode 100644 index 00000000000..21ce2cac7e9 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/DataGen.hs @@ -0,0 +1,65 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE KindSignatures #-} +{-# LANGUAGE ScopedTypeVariables #-} + +module DataGen ( + mkUnaryArg, + mkHomogenousArg, + mkBinaryArgs, + mkInteger, + mkZeroesOne, + sizes, + noCleanup, + ) where + +import Control.Monad (replicateM) +import Control.Monad.State.Strict (State) +import Data.Bits (unsafeShiftL) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS 
+import Data.Kind (Type) +import Data.Word (Word8) +import GHC.Exts (fromListN) +import System.Random.Stateful (StateGenM, StdGen, mkStdGen, randomM, runStateGen_, uniformWord8) + +-- Generate an Integer that will require a representation of this many bytes +mkInteger :: Int -> IO Integer +mkInteger len = pure . runStateGen_ (mkStdGen 42) $ \gen -> + go gen 0 0 + where + go :: StateGenM StdGen -> Int -> Integer -> State StdGen Integer + go !gen !place !acc + | place == len = pure acc + -- we generate non-zero bytes to ensure we don't get truncations + | otherwise = do + block <- uniformWord8 gen + let result = fromIntegral $ if block == 0 then block + 1 else block + go gen (place + 1) $ acc + result `unsafeShiftL` (place * 8) + +-- Generate a ByteString of a given length +mkUnaryArg :: Int -> IO ByteString +mkUnaryArg len = pure . runStateGen_ (mkStdGen 42) $ \gen -> + fromListN len <$> replicateM len (randomM gen) + +-- Generate a ByteString of a given length full of the given byte +mkHomogenousArg :: Int -> Word8 -> IO ByteString +mkHomogenousArg len = pure . BS.replicate len + +-- Generates n - 1 zeroes, followed by a one byte +mkZeroesOne :: Int -> IO ByteString +mkZeroesOne len = pure $ BS.snoc (BS.replicate (len - 1) 0x00) 0x01 + +-- Generate two ByteStrings, both of a given length +mkBinaryArgs :: Int -> IO (ByteString, ByteString) +mkBinaryArgs len = pure . runStateGen_ (mkStdGen 42) $ \gen -> + (,) <$> (fromListN len <$> replicateM len (randomM gen)) <*> + (fromListN len <$> replicateM len (randomM gen)) + +-- We work in IO only to avoid interference, so thus, a cleanup isn't needed for +-- withResource. This function is designed to indicate that fact. +noCleanup :: forall (a :: Type) . a -> IO () +noCleanup = const (pure ()) + +-- Basic set of sizes (in bytes) +sizes :: [Int] +sizes = [(2 :: Int) ^ (i :: Int) - 1 | i <- [1 .. 
15]] diff --git a/plutus-core/plutus-core/bench/bitwise/Main.hs b/plutus-core/plutus-core/bench/bitwise/Main.hs new file mode 100644 index 00000000000..3dc911b270c --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/Main.hs @@ -0,0 +1,55 @@ +-- editorconfig-checker-disable-file +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE KindSignatures #-} +{-# LANGUAGE OverloadedStrings #-} + +module Main (main) where + +import Benches.Binary qualified as Binary +import Benches.BitRead qualified as BitRead +import Benches.BitWrite qualified as BitWrite +import Benches.Complement qualified as Complement +import Benches.Convert qualified as Convert +import Benches.CountLeadingZeroes qualified as CountLeadingZeroes +import Benches.Popcount qualified as Popcount +import Benches.Rotate qualified as Rotate +import Benches.Shift qualified as Shift +import GHC.IO.Encoding (setLocaleEncoding, utf8) +import Test.Tasty (testGroup) +import Test.Tasty.Bench (defaultMain) + +main :: IO () +main = do + setLocaleEncoding utf8 + defaultMain [ + testGroup "Popcount" [ + Popcount.benches, + Popcount.cBenches + ], + testGroup "Complement" [ + Complement.benches + ], + testGroup "Binary" [ + Binary.benches + ], + testGroup "Count leading zeroes" [ + CountLeadingZeroes.benches, + CountLeadingZeroes.cBenches + ], + testGroup "Bit read" [ + BitRead.benches + ], + testGroup "Bit write" [ + BitWrite.benches + ], + testGroup "Bit shift" [ + Shift.benches + ], + testGroup "Bit rotate" [ + Rotate.benches + ], + testGroup "Conversions" [ + Convert.benchesBSToI, + Convert.benchesIToBS + ] + ] diff --git a/plutus-core/plutus-core/bench/bitwise/cbits/binary.c b/plutus-core/plutus-core/bench/bitwise/cbits/binary.c new file mode 100644 index 00000000000..aee1a1b5f83 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/cbits/binary.c @@ -0,0 +1,8 @@ +#include "cbits.h" + +void c_and_naive(unsigned char *dst, unsigned char const *src1, + unsigned char const *src2, size_t const len) { + for (size_t i 
= 0; i < len; i++) { + dst[i] = src1[i] & src2[i]; + } +} diff --git a/plutus-core/plutus-core/bench/bitwise/cbits/bit-access.c b/plutus-core/plutus-core/bench/bitwise/cbits/bit-access.c new file mode 100644 index 00000000000..41b6383cb12 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/cbits/bit-access.c @@ -0,0 +1,37 @@ +#include "cbits.h" +#include + +bool c_bit_at(size_t const ix, unsigned char *const restrict src) { + size_t big_ix = ix / 8; + size_t small_ix = ix % 8; + return src[big_ix] & (0x80 >> small_ix); +} + +void c_bit_set_naive(bool const b, size_t const ix, unsigned char *restrict dst, + unsigned char const *restrict src, size_t const len) { + size_t big_ix = ix / 8; + size_t small_ix = ix % 8; + for (size_t i = 0; i < len; i++) { + dst[i] = src[i]; + } + if (b == true) { + dst[big_ix] = src[big_ix] | (0x80 >> small_ix); + } else { + dst[big_ix] = src[big_ix] & (~(0x80 >> small_ix)); + } +} + +void c_bit_set_memcpy(bool const b, size_t const ix, + unsigned char *restrict dst, + unsigned char const *restrict src, size_t const len) { + size_t big_ix = ix / 8; + size_t small_ix = ix % 8; + // Copy entirety of src + memcpy(dst, src, len); + // Set our desired bit + if (b == true) { + dst[big_ix] = src[big_ix] | (0x80 >> small_ix); + } else { + dst[big_ix] = src[big_ix] & (~(0x80 >> small_ix)); + } +} diff --git a/plutus-core/plutus-core/bench/bitwise/cbits/cbits.h b/plutus-core/plutus-core/bench/bitwise/cbits/cbits.h new file mode 100644 index 00000000000..8c6f025f484 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/cbits/cbits.h @@ -0,0 +1,54 @@ +#ifndef CBITS_H +#define CBITS_H + +#include +#include + +// Popcount + +size_t c_popcount_naive(unsigned char const *restrict src, size_t const len); + +size_t c_popcount_block(unsigned char const *restrict src, size_t len); + +size_t c_popcount_block_unroll(unsigned char const *restrict src, size_t len); + +// Complement + +void c_complement_naive(unsigned char *restrict dst, + unsigned 
char const *restrict src, size_t const len); + +// Binary ops + +void c_and_naive(unsigned char *restrict dst, unsigned char const *src1, + unsigned char const *src2, size_t const len); + +// Bit reading and writing + +bool c_bit_at(size_t const ix, unsigned char *const restrict src); + +void c_bit_set_naive(bool const b, size_t const ix, unsigned char *restrict dst, + unsigned char const *restrict src, size_t const len); + +void c_bit_set_memcpy(bool const b, size_t const ix, + unsigned char *restrict dst, + unsigned char const *restrict src, size_t const len); + +// CLZ + +size_t c_clz_naive(unsigned char const *restrict src, size_t const len); + +size_t c_clz_block(unsigned char const *restrict src, size_t len); + +size_t c_clz_block_unrolled(unsigned char const *restrict src, size_t len); + +// Shift + +void c_shift_bits(int bit_shift, unsigned char *restrict dst, + unsigned char const *restrict src, size_t len); + +// Rotate + +void c_rotate_bits(int bit_rotation, unsigned char *restrict dst, + unsigned char const *restrict src, size_t len); + +#endif /* CBITS_H */ diff --git a/plutus-core/plutus-core/bench/bitwise/cbits/clz.c b/plutus-core/plutus-core/bench/bitwise/cbits/clz.c new file mode 100644 index 00000000000..9941c8da3a0 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/cbits/clz.c @@ -0,0 +1,70 @@ +#include "cbits.h" +#include + +size_t c_clz_naive(unsigned char const *src, size_t const len) { + size_t leading_zeroes = 0; + for (size_t i = 0; i < len; i++) { + // Necessary because __builtin_clz has an undefined outcome if its argument + // is zero. + if (src[i] != 0) { + // This is necessary because GCC will sign-extend the ith byte whether we + // like it or not. Thus, we have to compensate. 
+ size_t offset = (sizeof(unsigned int) - 1) * 8; + return leading_zeroes + (__builtin_clz(src[i]) - offset); + } + leading_zeroes += 8; + } + return leading_zeroes; +} + +size_t c_clz_block(unsigned char const *restrict src, size_t len) { + size_t leading_zeroes = 0; + while (len >= sizeof(unsigned long long)) { + unsigned long long x = *((unsigned long long const *restrict)src); + if (x != 0) { + return leading_zeroes + __builtin_clzll(x); + } + leading_zeroes += (sizeof(unsigned long long) * 8); + src += sizeof(unsigned long long); + len -= sizeof(unsigned long long); + } + while (len > 0) { + if ((*src) != 0) { + // Same necessity as before + size_t offset = (sizeof(unsigned int) - 1) * 8; + return leading_zeroes + (__builtin_clz(*src) - offset); + } + leading_zeroes += 8; + src++; + len--; + } + return leading_zeroes; +} + +size_t c_clz_block_unrolled(unsigned char const *restrict src, size_t len) { + size_t leading_zeroes = 0; + while (len >= 2 * sizeof(unsigned long long)) { + unsigned long long x = ((unsigned long long const *restrict)src)[0]; + unsigned long long y = ((unsigned long long const *restrict)src)[1]; + if (x != 0) { + return leading_zeroes + __builtin_clzll(x); + } + if (y != 0) { + return leading_zeroes + sizeof(unsigned long long) + __builtin_clzll(y); + } + leading_zeroes += (sizeof(unsigned long long) * 16); + src += 2 * sizeof(unsigned long long); + len -= 2 * sizeof(unsigned long long); + } + while (len > 0) { + if ((*src) != 0) { + // Same necessity as before + size_t offset = (sizeof(unsigned int) - 1) * 8; + return leading_zeroes + (__builtin_clz(*src) - offset); + } + leading_zeroes += 8; + src++; + len--; + } + return leading_zeroes; +} diff --git a/plutus-core/plutus-core/bench/bitwise/cbits/complement.c b/plutus-core/plutus-core/bench/bitwise/cbits/complement.c new file mode 100644 index 00000000000..3395957de6d --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/cbits/complement.c @@ -0,0 +1,9 @@ +#include "cbits.h" 
+#include + +void c_complement_naive(unsigned char *restrict dst, + unsigned char const *restrict src, size_t const len) { + for (size_t i = 0; i < len; i++) { + dst[i] = ~(src[i]); + } +} diff --git a/plutus-core/plutus-core/bench/bitwise/cbits/popcount.c b/plutus-core/plutus-core/bench/bitwise/cbits/popcount.c new file mode 100644 index 00000000000..0bcf714c8af --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/cbits/popcount.c @@ -0,0 +1,64 @@ +#include "cbits.h" +#include + +size_t c_popcount_naive(unsigned char const *restrict src, size_t const len) { + size_t total = 0; + for (size_t i = 0; i < len; i++) { + total += __builtin_popcount(src[i]); + } + return total; +} + +/* + * We take advantage of the fact that a single POPCNT instruction can count an + * entire register's worth of bits, rather than a single byte. To aid GCC in + * doing this, we do a classic strip mine: first count at unsigned long long + * width, then finish off with byte-at-a-time. + * + * Strip mining: + * http://physics.ujep.cz/~zmoravec/prga/main_for/mergedProjects/optaps_for/common/optaps_vec_mine.htm + */ +size_t c_popcount_block(unsigned char const *restrict src, size_t len) { + size_t total = 0; + while (len >= sizeof(unsigned long long)) { + total += __builtin_popcountll(*(unsigned long long const *restrict)src); + src += sizeof(unsigned long long); + len -= sizeof(unsigned long long); + } + while (len > 0) { + total += __builtin_popcount(*src); + src++; + len--; + } + return total; +} + +/* + * We further extend the popcount_block method by manually two-way unrolling + * the loop. This can take advantage of high throughput for the POPCNT + * instruction on modern x86 CPUs, as they can issue four POPCNT instructions + * simultaneously if data is available. 
+ * + * Loop unrolling: + * https://en.wikipedia.org/wiki/Loop_unrolling#Static/manual_loop_unrolling + * Instruction tables for x86: + * https://www.agner.org/optimize/instruction_tables.pdf ILP (including + * multiple-issue): https://en.wikipedia.org/wiki/Instruction-level_parallelism + * Data dependency: https://en.wikipedia.org/wiki/Data_dependency + */ +size_t c_popcount_block_unroll(unsigned char const *restrict src, size_t len) { + size_t total = 0; + while (len >= 2 * sizeof(unsigned long long)) { + total += __builtin_popcountll(*(unsigned long long const *restrict)src); + total += __builtin_popcountll(*( + unsigned long long const *restrict)(src + sizeof(unsigned long long))); + src += 2 * sizeof(unsigned long long); + len -= 2 * sizeof(unsigned long long); + } + while (len > 0) { + total += __builtin_popcount(*src); + src++; + len--; + } + return total; +} diff --git a/plutus-core/plutus-core/bench/bitwise/cbits/rotate.c b/plutus-core/plutus-core/bench/bitwise/cbits/rotate.c new file mode 100644 index 00000000000..df82cf660ab --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/cbits/rotate.c @@ -0,0 +1,25 @@ +#include "cbits.h" + +void c_rotate_bits(int bit_rotation, unsigned char *restrict dst, + unsigned char const *restrict src, size_t len) { + if (bit_rotation > 0) { + size_t read_pos = bit_rotation / 8; + size_t bit_tail_len = bit_rotation % 8; + size_t write_pos = 0; + unsigned char const tail_mask = (0x01 << bit_tail_len) - 1; + unsigned char const head_mask = ~tail_mask; + while (write_pos < len) { + if (read_pos == len - 1) { + dst[write_pos] = (head_mask & src[read_pos]) | (tail_mask & src[0]); + read_pos = 0; + } else { + dst[write_pos] = + (head_mask & src[read_pos]) | (tail_mask & src[read_pos + 1]); + read_pos++; + } + write_pos++; + } + } else { + c_rotate_bits((len * 8) + bit_rotation, dst, src, len); + } +} diff --git a/plutus-core/plutus-core/bench/bitwise/cbits/shift.c b/plutus-core/plutus-core/bench/bitwise/cbits/shift.c new file 
mode 100644 index 00000000000..9b50d18d689 --- /dev/null +++ b/plutus-core/plutus-core/bench/bitwise/cbits/shift.c @@ -0,0 +1,59 @@ +#include "cbits.h" +#include +#include + +void c_shift_bits(int bit_shift, unsigned char *restrict dst, + unsigned char const *restrict src, size_t len) { + if (bit_shift > 0) { + size_t infill_bytes = bit_shift / 8; + size_t const bit_head_len = bit_shift % 8; + if (infill_bytes > len) { + infill_bytes = len; + } + memset(dst, 0x00, infill_bytes); + if (bit_head_len == 0) { + memcpy(dst + infill_bytes, src, len - infill_bytes); + } else { + size_t read_pos = 0; + size_t write_pos = infill_bytes; + unsigned char const hi_mask = (0x01 << bit_head_len) - 1; + unsigned char const lo_mask = ~hi_mask; + while (write_pos < len) { + if (read_pos == 0) { + dst[write_pos] = hi_mask & src[read_pos]; + } else { + dst[write_pos] = + (lo_mask & src[read_pos - 1]) | (hi_mask & src[read_pos]); + } + write_pos++; + read_pos++; + } + } + } else { + size_t const abs_bit_shift = abs(bit_shift); + size_t infill_bytes = abs_bit_shift / 8; + size_t const bit_tail_len = abs_bit_shift % 8; + if (infill_bytes > len) { + infill_bytes = len; + } + if (bit_tail_len == 0) { + memcpy(dst, src + infill_bytes, len - infill_bytes); + } else { + size_t read_pos = infill_bytes; + size_t write_pos = 0; + unsigned char const hi_mask = (0x01 << bit_tail_len) - 1; + unsigned char const lo_mask = ~hi_mask; + while (read_pos < len) { + if (read_pos == (len - 1)) { + dst[write_pos] = lo_mask & src[read_pos]; + } else { + dst[write_pos] = + (lo_mask & src[read_pos]) | (hi_mask & src[read_pos + 1]); + } + write_pos++; + read_pos++; + } + } + memset(dst + (len - infill_bytes), 0x00, infill_bytes); + } +} diff --git a/plutus-core/plutus-core/src/Bitwise.hs b/plutus-core/plutus-core/src/Bitwise.hs new file mode 100644 index 00000000000..c442ca14ae7 --- /dev/null +++ b/plutus-core/plutus-core/src/Bitwise.hs @@ -0,0 +1,500 @@ +-- editorconfig-checker-disable-file +{-# LANGUAGE 
ImpredicativeTypes #-} +{-# LANGUAGE KindSignatures #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE MultiWayIf #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE UnboxedSums #-} +{-# OPTIONS_GHC -fexpose-all-unfoldings #-} + +-- FIXME: Should be its own library +module Bitwise ( + integerToByteString, + byteStringToInteger, + andByteString, + iorByteString, + xorByteString, + complementByteString, + popCountByteString, + testBitByteString, + writeBitByteString, + findFirstSetByteString, + shiftByteString, + rotateByteString, + ) where + +import Data.Bits (FiniteBits, bit, complement, popCount, rotate, shift, shiftL, xor, zeroBits, + (.&.), (.|.)) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.ByteString.Internal (toForeignPtr0) +import Data.ByteString.Short qualified as SBS +import Data.ByteString.Unsafe (unsafePackMallocCStringLen, unsafeUseAsCString, + unsafeUseAsCStringLen) +import Data.Foldable (foldl', for_) +import Data.Functor (void) +import Data.Kind (Type) +import Data.Text (Text, pack) +import Data.Word (Word64, Word8) +import Foreign.C.Types (CChar, CSize) +import Foreign.Marshal.Alloc (mallocBytes) +import Foreign.Ptr (Ptr, castPtr, plusPtr) +import Foreign.Storable (Storable (peek, poke, sizeOf)) +import GHC.ForeignPtr (ForeignPtr (ForeignPtr)) +import GHC.IO.Handle.Text (memcpy) +import GHC.Num.Integer (Integer (IN), integerFromAddr, integerToBigNatClamp#) +import GHC.Prim (int2Word#) +import GHC.Types (Int (I#)) +import PlutusCore.Builtin.Emitter (Emitter, emit) +import PlutusCore.Evaluation.Result (EvaluationResult (EvaluationFailure)) +import System.IO.Unsafe (unsafeDupablePerformIO) + +-- | See 'PlutusTx.Builtins.rotateByteString'. +rotateByteString :: ByteString -> Integer -> ByteString +rotateByteString bs i + -- If a ByteString is completely homogenous, rotating won't change it. 
This + -- also covers emptiness, since empty ByteStrings are homogenous vacuously. + | isAllZero bs || isAllOne bs = bs + -- Rotating by more than the number of bits in a ByteString 'wraps around', + -- so we're only interested in the rotation modulo the number of bits. + | otherwise = case i `mod` bitLen of + -- Means we have a multiple of the bit count, so nothing to do. + 0 -> bs + displacement -> overPtrLen bs $ \ptr len -> go ptr len displacement + where + -- not recursive! + go :: Ptr Word8 -> Int -> Integer -> IO (Ptr Word8) + go src len displacement = do + dst <- mallocBytes len + case len of + -- If we only have one byte, we can borrow from the Bits instance for + -- Word8, since it rotates in the same direction that we want. + 1 -> do + srcByte <- peek src + let srcByte' = srcByte `rotate` fromIntegral displacement + poke dst srcByte' + -- If we rotate by a multiple of 8, we only need to move around whole + -- bytes, rather than individual bits. Because we only move contiguous + -- blocks (regardless of rotation direction), we can do this using + -- memcpy, which is much faster, especially on larger ByteStrings. + _ -> case displacement `quotRem` 8 of + (bigMove, 0) -> do + let mainLen :: CSize = fromIntegral $ bigMove + let restLen :: CSize = fromIntegral len - mainLen + -- Copy the portion [..mainLen] to [restLen..], + -- and the portion [mainLen..] to [..restLen]. + _ <- memcpy (plusPtr dst (fromIntegral restLen)) src mainLen + _ <- memcpy dst (plusPtr src (fromIntegral mainLen)) restLen + pure () + -- If we don't rotate by a multiple of 8, we have to construct new + -- bytes, rather than just copying over old ones. We do this in two + -- steps: + -- + -- 1. Compute the 'read offset' into the source ByteString based on + -- the rotation magnitude and direction. + -- 2. Use that read offset to perform an (unchecked) bit lookup for an + -- entire 8-bit block, then construct the byte that results. 
+ -- + -- We can do the bytes in the result in any order using this method: + -- we choose to do it in traversal order. + _ -> for_ [0 .. len - 1] $ \j -> do + let start = (len - 1 - j) * 8 + let dstByte = foldl' (addBit start displacement) zeroBits [0 .. 7] + poke (plusPtr dst j) dstByte + pure dst + bitLen :: Integer + bitLen = fromIntegral $ BS.length bs * 8 + addBit :: Int -> Integer -> Word8 -> Integer -> Word8 + addBit start displacement acc offset = + let oldIx = (offset + fromIntegral start + bitLen - displacement) `rem` bitLen in + if dangerousRead bs oldIx + then acc .|. (bit . fromIntegral $ offset) + else acc + +-- | See 'PlutusTx.Builtins.shiftByteString. +shiftByteString :: ByteString -> Integer -> ByteString +shiftByteString bs i + -- Shifting by the number of bits, or more, would zero everything anyway, + -- regardless of direction. This also covers the empty ByteString case, as its + -- bit length is zero. + | abs i >= bitLen = BS.replicate (BS.length bs) zeroBits + -- Shifting an all-zero ByteString will not change it, regardless of + -- direction. + | isAllZero bs = bs + | otherwise = overPtrLen bs go + where + bitLen :: Integer + bitLen = fromIntegral $ BS.length bs * 8 + go :: Ptr Word8 -> Int -> IO (Ptr Word8) + go src len = do + dst <- mallocBytes len + case len of + -- If we only have one byte, we can borrow from the Bits instance for + -- Word8, since it shifts in the same direction that we want. + 1 -> do + srcByte <- peek src + let srcByte' = srcByte `shift` fromIntegral i + poke dst srcByte' + -- If we shift by a multiple of 8, we only need to move a contiguous + -- block of bytes, then clear what remains. This is much more efficient: + -- it would be nice if we had memset available, but at least the copy + -- can be done with memcpy. + _ -> case i `quotRem` 8 of + (bigMove, 0) -> do + let mainLen :: CSize = fromIntegral . abs $ bigMove + let restLen :: CSize = fromIntegral len - mainLen + case signum bigMove of + 1 -> do + void . 
memcpy dst (plusPtr src . fromIntegral $ mainLen) $ restLen + for_ [fromIntegral restLen, fromIntegral $ restLen + 1 .. len - 1] $ \j -> + poke @Word8 (plusPtr dst j) zeroBits + _ -> do + for_ [0 .. fromIntegral mainLen - 1] $ \j -> poke @Word8 (plusPtr dst j) zeroBits + void . memcpy (plusPtr dst . fromIntegral $ mainLen) src $ restLen + -- If we shift by something other than a multiple of 8, we have to + -- construct new bytes, similarly to rotations. We use the same + -- two-step process to construct new bytes, but due to not having the + -- 'wraparound' behaviour (unlike rotations), we clear any bits that + -- would be sourced 'out of bounds'. + _ -> for_ [0 .. len - 1] $ \j -> do + let start = (len - 1 - j) * 8 + let dstByte = foldl' (addBit start) zeroBits [0 .. 7] + poke (plusPtr dst j) dstByte + pure dst + addBit :: Int -> Word8 -> Integer -> Word8 + addBit start acc offset = + let possibleIx = offset + fromIntegral start - i in + if | possibleIx < 0 -> acc + | possibleIx >= bitLen -> acc + | dangerousRead bs possibleIx -> acc .|. (bit . fromIntegral $ offset) + | otherwise -> acc + +-- | See 'PlutusTx.Builtins.findFirstSetByteString'. +findFirstSetByteString :: ByteString -> Integer +findFirstSetByteString bs = foldl' go (-1) [0 .. len - 1] + where + go :: Integer -> Int -> Integer + go acc ix + | acc /= (-1) = acc -- we found one already + | otherwise = case BS.index bs (len - ix - 1) of + 0 -> (-1) -- keep looking, nothing to find here + w8 -> fromIntegral $ (ix * 8) + findPosition w8 + len :: Int + len = BS.length bs + +-- | See 'PlutusTx.Builtins.testBitByteString. +testBitByteString :: ByteString -> Integer -> Emitter (EvaluationResult Bool) +testBitByteString bs i + | i < 0 || i >= bitLen = indexOutOfBoundsError "testBitByteString" bitLen i + | otherwise = pure . pure . dangerousRead bs $ i + where + bitLen :: Integer + bitLen = fromIntegral $ BS.length bs * 8 + +-- | See 'PlutusTx.Builtins.writeBitByteString. 
+writeBitByteString :: ByteString -> Integer -> Bool -> Emitter (EvaluationResult ByteString) +writeBitByteString bs i b + | i < 0 || i >= bitLen = indexOutOfBoundsError "writeBitByteString" bitLen i + -- When we write a bit at a location, we have to return a new copy of the + -- original with the bit modified. We do this as follows: + -- + -- 1. Compute the byte that has to change. Because _byte_ indexes and _bit_ + -- indexes go in opposite directions, we have to compute the byte by a + -- combination of modulus and offset from the end. + -- 2. Use the remainder to construct a mask which 'selects' the bit within the + -- byte we want to change. + -- 3. Memcpy everything over. + -- 4. Use the mask at the computed byte index to modify the result in-place: + -- we do a different operation depending on whether we're setting or clearing. + -- + -- We use memcpy plus a single write as this is _much_ faster than going + -- byte-by-byte and checking if we've reached the index we want each time: + -- memcpy is highly-optimized using SIMD instructions on every platform, and a + -- branchy per-byte loop is absolutely horrid everywhere for speed due to the + -- branch count. + | otherwise = do + let (bigOffset, smallOffset) = i `quotRem` 8 + let bigIx = fromIntegral $ byteLen - bigOffset - 1 + let mask = bit 0 `shiftL` fromIntegral smallOffset + pure . pure . unsafeDupablePerformIO . unsafeUseAsCStringLen bs $ go bigIx mask + where + byteLen :: Integer + byteLen = fromIntegral . BS.length $ bs + bitLen :: Integer + bitLen = byteLen * 8 + go :: Int -> Word8 -> (Ptr CChar, Int) -> IO ByteString + go bigIx mask (src, len) = do + dst <- mallocBytes len + void . memcpy dst src . fromIntegral $ len + byte :: Word8 <- peek . plusPtr src $ bigIx + let byte' = if b then mask .|. byte else complement mask .&. byte + poke (castPtr . plusPtr dst $ bigIx) byte' + unsafePackMallocCStringLen (dst, len) + +-- | See 'PlutusTx.Builtins.integerToByteString. 
+{-# INLINE integerToByteString #-} +integerToByteString :: Integer -> Maybe ByteString +integerToByteString (IN _) = Nothing +integerToByteString n = Just $ fst $ BS.spanEnd (== 0) $ SBS.fromShort $ SBS.SBS (integerToBigNatClamp# n) + +-- | See 'PlutusTx.Builtins.byteStringToInteger. +{-# INLINE byteStringToInteger #-} +byteStringToInteger :: ByteString -> Integer +byteStringToInteger bs = + case toForeignPtr0 bs of + (ForeignPtr addr _, I# len) -> unsafeDupablePerformIO $ integerFromAddr (int2Word# len) addr (case 0 of I# n -> n) + +-- | See 'PlutusTx.Builtins.popCountByteString. +popCountByteString :: ByteString -> Integer +popCountByteString bs = unsafeDupablePerformIO . unsafeUseAsCStringLen bs $ go + where + -- We use a standard 'big step, small step' approach. The reason for this is + -- that bit counting (via a FiniteBits instance) is defined for much larger + -- types than Word8. We can thus read 8-blocks of bytes as 64-bit words + -- instead (as we don't care about sign and GHC ensures word alignment), + -- which gives us potentially up to an 8x speedup. + -- + -- Thus, our 'big step, small step' approach first walks as much of its + -- input as it can using steps whose size is Word64, then finishes the job + -- with steps whose size is Word8. We use a rank-2 polymorphic method to + -- avoid code duplication, since the only operation we need comes from a + -- type class, and is thus agnostic to what we're working over. Step size + -- can also be determined via Storable in a similar way. + go :: (Ptr CChar, Int) -> IO Integer + go (ptr, len) = do + let (bigSteps, smallSteps) = len `quotRem` 8 + let bigPtr :: Ptr Word64 = castPtr ptr + let smallPtr :: Ptr Word8 = castPtr . plusPtr ptr $ bigSteps * 8 + bigCount <- countBits bigPtr bigSteps + smallCount <- countBits smallPtr smallSteps + pure . fromIntegral $ bigCount + smallCount + +-- We use a standard 'big step, small step' construction for all the operators +-- below. 
The reason for this is that each of these operations are bit-parallel: +-- it doesn't matter what width of bit block you operate on, you'll have the +-- same outcome. As a result, these operations are defined for much larger +-- blocks than Word8. We can thus read 8-blocks of bytes as 64-bit words instead +-- (as we don't care about sign and GHC ensures word alignment), which gives us +-- potentially up to an 8x speedup. +-- +-- Thus, our 'big step, small step' approach processes the inputs in two stages: +-- +-- 1. Walk lockstep in blocks of Word64 size over both inputs, and set the +-- corresponding place in the output to the result of the bitwise operation on +-- those blocks. +-- 2. For whatever remains, walk lockstep in blocks of Word8 size over both +-- inputs, and set the corresponding place in the output to the result of the +-- bitwise operation on those blocks. +-- +-- We use a rank-2 polymorphic method to avoid code duplication, since all of +-- the operations over blocks we are interested in (of either size) come from a +-- type class (Bits) without caring about what specific type we're dealing with. +-- Step size can also be determined via Storable in a similar way. +-- +-- We use a mutable construction inside IO instead of something immutable to +-- avoid excessive 'sloshing': on our current version of the 'bytestring' +-- library, there is no way to 'zip together' two ByteStrings directly: your +-- only option was to 'zip out' into a list, then rebuild. This is not only +-- inefficient (as you can't do a 'big step, little step' approach to this in +-- general), it also copies too much. +-- | See 'PlutusTx.Builtins.andByteString. +andByteString :: ByteString -> ByteString -> Emitter (EvaluationResult ByteString) +andByteString bs bs' + | BS.length bs /= BS.length bs' = mismatchedLengthError "andByteString" bs bs' + | otherwise = pure . pure . unsafeDupablePerformIO . 
unsafeUseAsCStringLen bs $ \(ptr, len) -> + unsafeUseAsCString bs' $ \ptr' -> + zipBuild (.&.) ptr ptr' len >>= (unsafePackMallocCStringLen . (,len)) + +-- | See 'PlutusTx.Builtins.iorByteString. +iorByteString :: ByteString -> ByteString -> Emitter (EvaluationResult ByteString) +iorByteString bs bs' + | BS.length bs /= BS.length bs' = mismatchedLengthError "iorByteString" bs bs' + | otherwise = pure . pure . unsafeDupablePerformIO . unsafeUseAsCStringLen bs $ \(ptr, len) -> + unsafeUseAsCString bs' $ \ptr' -> + zipBuild (.|.) ptr ptr' len >>= (unsafePackMallocCStringLen . (,len)) + +-- | See 'PlutusTx.Builtins.xorByteString. +xorByteString :: ByteString -> ByteString -> Emitter (EvaluationResult ByteString) +xorByteString bs bs' + | BS.length bs /= BS.length bs' = mismatchedLengthError "xorByteString" bs bs' + | otherwise = pure . pure . unsafeDupablePerformIO . unsafeUseAsCStringLen bs $ \(ptr, len) -> + unsafeUseAsCString bs' $ \ptr' -> + zipBuild xor ptr ptr' len >>= (unsafePackMallocCStringLen . (,len)) + +-- Similarly to the above, we use a 'big step, little step' here as well. The +-- only difference is that there is only one input to read from, rather than +-- two. Similar reasoning applies to why we made this choice as to the +-- previous operations. + +-- | See 'PlutusTx.Builtins.complementByteString. +complementByteString :: ByteString -> ByteString +complementByteString bs = unsafeDupablePerformIO . unsafeUseAsCStringLen bs $ \(ptr, len) -> do + resPtr <- mallocBytes len + let (bigSteps, smallSteps) = len `quotRem` 8 + let bigDst :: Ptr Word64 = castPtr resPtr + let smallDst :: Ptr Word8 = castPtr . plusPtr resPtr $ bigSteps * 8 + let bigSrc :: Ptr Word64 = castPtr ptr + let smallSrc :: Ptr Word8 = castPtr . plusPtr ptr $ bigSteps * 8 + go bigDst bigSrc 0 bigSteps + go smallDst smallSrc 0 smallSteps + unsafePackMallocCStringLen (resPtr, len) + where + go :: forall (a :: Type) . 
+ (Storable a, FiniteBits a) => + Ptr a -> Ptr a -> Int -> Int -> IO () + go dst src offset lim + | offset == lim = pure () + | otherwise = do + let offset' = offset * sizeOf (undefined :: a) + block :: a <- peek . plusPtr src $ offset' + poke (plusPtr dst offset') . complement $ block + go dst src (offset + 1) lim + +-- Helpers + +-- We compute the read similarly to how we determine the change when we write. +-- The only difference is that the mask is used on the input to read it, rather +-- than to modify anything. +dangerousRead :: ByteString -> Integer -> Bool +dangerousRead bs i = + let (bigOffset, smallOffset) = i `quotRem` 8 + bigIx = BS.length bs - fromIntegral bigOffset - 1 + mask = bit (fromIntegral smallOffset) in + case mask .&. BS.index bs bigIx of + 0 -> False + _ -> True + +-- Important note: this function is only safe under the following conditions: +-- +-- * The IO used in the function argument only performs memory allocations using +-- malloc, as well as reads and writes via the Storable interface; +-- * The pointer argument is only read from, not written to; +-- * The result of the function argument points to freshly-allocated, malloced +-- memory; and +-- * The result of the function argument points to memory whose length matches +-- that of the input ByteString (in bytes) +-- +-- Even though a ByteString is represented as Ptr CChar, we can ignore sign (we +-- only treat them as binary data anyway), and on POSIX platforms (which GHC +-- silently assumes, even on Windows), CChar _must_ be exactly a byte. Thus, we +-- allow working over a pointer to Word8 instead, to avoid issues with signs. +overPtrLen :: ByteString -> (Ptr Word8 -> Int -> IO (Ptr Word8)) -> ByteString +overPtrLen bs f = unsafeDupablePerformIO . unsafeUseAsCStringLen bs $ + \(ptr, len) -> f (castPtr ptr) len >>= \p -> + unsafePackMallocCStringLen (castPtr p, len) + +-- Error used when lengths of inputs aren't equal. +mismatchedLengthError :: forall (a :: Type) . 
+ Text -> + ByteString -> + ByteString -> + Emitter (EvaluationResult a) +mismatchedLengthError loc bs bs' = do + emit $ loc <> " failed" + emit "Reason: mismatched argument lengths" + emit $ "Length of first argument: " <> (pack . show . BS.length $ bs) + emit $ "Length of second argument: " <> (pack . show . BS.length $ bs') + pure EvaluationFailure + +-- Error used when an out of bounds index is used to index a bytestring. +indexOutOfBoundsError :: forall (a :: Type) . + Text -> + Integer -> + Integer -> + Emitter (EvaluationResult a) +indexOutOfBoundsError loc lim i = do + emit $ loc <> " failed" + emit "Reason: out of bounds" + emit $ "Attempted access at index " <> (pack . show $ i) + emit $ "Valid indexes: from 0 to " <> (pack . show $ lim - 1) + pure EvaluationFailure + +-- A general method for 'zipping together' two ByteString inputs to produce a +-- new ByteString output, assuming the 'zipping function' is bit-parallel. This +-- uses a standard 'big step, little step' construction. We can do this because +-- bit-parallel operations don't change semantics based on the size of the block +-- read; furthermore, as GHC guarantees word alignment and we don't care about +-- sign, we can potentially get up to an 8x speedup. +-- +-- We use a mutable construction inside IO instead of something immutable to +-- avoid excessive 'sloshing': on our current version of the 'bytestring' +-- library, there is no way to 'zip together' two ByteStrings directly: your +-- only option was to 'zip out' into a list, then rebuild. This is not only +-- inefficient (as you can't do a 'big step, little step' approach to this in +-- general), it also copies too much. +-- +-- Note: the function argument must be bit-parallel. The type guarantees it to +-- some degree, but in general, we can't enforce this in the type system. +zipBuild :: + (forall (a :: Type) . 
(FiniteBits a, Storable a) => a -> a -> a) -> + Ptr CChar -> + Ptr CChar -> + Int -> + IO (Ptr CChar) +zipBuild f ptr ptr' len = do + resPtr <- mallocBytes len + let (bigSteps, smallSteps) = len `quotRem` 8 + let bigPtr :: Ptr Word64 = castPtr resPtr + let smallPtr :: Ptr Word8 = castPtr . plusPtr resPtr $ bigSteps * 8 + go bigPtr (castPtr ptr) (castPtr ptr') 0 bigSteps + let ptrRest :: Ptr Word8 = castPtr . plusPtr ptr $ bigSteps * 8 + let ptrRest' :: Ptr Word8 = castPtr . plusPtr ptr' $ bigSteps * 8 + go smallPtr ptrRest ptrRest' 0 smallSteps + pure resPtr + where + go :: forall (b :: Type) . + (FiniteBits b, Storable b) => + Ptr b -> + Ptr b -> + Ptr b -> + Int -> + Int -> + IO () + go dst src src' offset lim + | offset == lim = pure () + | otherwise = do + let offset' = sizeOf (undefined :: b) * offset + block :: b <- peek . plusPtr src $ offset' + block' :: b <- peek . plusPtr src' $ offset' + poke (plusPtr dst offset') (f block block') + go dst src src' (offset + 1) lim + +-- Check every bit position in a byte for a set bit, returning its index if we +-- find one. We default return 7, even though this index is valid, as no +-- consumer function ever looks at this value, since that can only happen on +-- zero bytes, which we ignore anyway. +findPosition :: Word8 -> Int +findPosition w8 = foldl' go 7 . fmap (\i -> (i, bit 0 `shiftL` i)) $ [0 .. 7] + where + go :: Int -> (Int, Word8) -> Int + go acc (i, mask) = case mask .&. w8 of + 0 -> acc -- nothing to see here, move along + _ -> min acc i + +-- A polymorphic bit counter in a block, which we can segment by chunks of a +-- type of arbitrary size, provided it is both Storable (so we can read at +-- offsets) and FiniteBits (so we can count it). +countBits :: forall (a :: Type) . 
+ (FiniteBits a, Storable a) => + Ptr a -> Int -> IO Int +countBits ptr len = go 0 0 + where + go :: Int -> Int -> IO Int + go total offset + | offset == len = pure total + | otherwise = do + let offset' = offset * sizeOf (undefined :: a) + block :: a <- peek . plusPtr ptr $ offset' + let total' = total + popCount block + go total' (offset + 1) + +-- Check if every byte of a ByteString is zero +isAllZero :: ByteString -> Bool +isAllZero = BS.all (== zeroBits) + +-- Check if every byte of a ByteString is one +isAllOne :: ByteString -> Bool +isAllOne = BS.all (== complement zeroBits) diff --git a/plutus-core/plutus-core/src/PlutusCore/Default/Builtins.hs b/plutus-core/plutus-core/src/PlutusCore/Default/Builtins.hs index 4bb0d601bd9..109f66661de 100644 --- a/plutus-core/plutus-core/src/PlutusCore/Default/Builtins.hs +++ b/plutus-core/plutus-core/src/PlutusCore/Default/Builtins.hs @@ -20,18 +20,15 @@ import PlutusCore.Builtin import PlutusCore.Data import PlutusCore.Default.Universe import PlutusCore.Evaluation.Machine.BuiltinCostModel +import PlutusCore.Evaluation.Machine.ExBudget (ExBudget (ExBudget)) import PlutusCore.Evaluation.Machine.ExBudgetStream import PlutusCore.Evaluation.Machine.ExMemoryUsage import PlutusCore.Evaluation.Result import PlutusCore.Pretty -import PlutusCore.Crypto.BLS12_381.G1 qualified as BLS12_381.G1 -import PlutusCore.Crypto.BLS12_381.G2 qualified as BLS12_381.G2 -import PlutusCore.Crypto.BLS12_381.Pairing qualified as BLS12_381.Pairing -import PlutusCore.Crypto.Ed25519 (verifyEd25519Signature_V1, verifyEd25519Signature_V2) -import PlutusCore.Crypto.Hash qualified as Hash -import PlutusCore.Crypto.Secp256k1 (verifyEcdsaSecp256k1Signature, verifySchnorrSecp256k1Signature) - +import Bitwise (andByteString, byteStringToInteger, complementByteString, findFirstSetByteString, + integerToByteString, iorByteString, popCountByteString, rotateByteString, + shiftByteString, testBitByteString, writeBitByteString, xorByteString) import Codec.Serialise 
(serialise) import Data.ByteString qualified as BS import Data.ByteString.Lazy qualified as BSL @@ -42,6 +39,12 @@ import Data.Text.Encoding (decodeUtf8', encodeUtf8) import Flat hiding (from, to) import Flat.Decoder import Flat.Encoder as Flat +import PlutusCore.Crypto.BLS12_381.G1 qualified as BLS12_381.G1 +import PlutusCore.Crypto.BLS12_381.G2 qualified as BLS12_381.G2 +import PlutusCore.Crypto.BLS12_381.Pairing qualified as BLS12_381.Pairing +import PlutusCore.Crypto.Ed25519 (verifyEd25519Signature_V1, verifyEd25519Signature_V2) +import PlutusCore.Crypto.Hash qualified as Hash +import PlutusCore.Crypto.Secp256k1 (verifyEcdsaSecp256k1Signature, verifySchnorrSecp256k1Signature) import Prettyprinter (viaShow) -- See Note [Pattern matching on built-in types]. @@ -147,6 +150,19 @@ data DefaultFun -- Keccak_256, Blake2b_224 | Keccak_256 | Blake2b_224 + -- Bitwise + | IntegerToByteString + | ByteStringToInteger + | AndByteString + | IorByteString + | XorByteString + | ComplementByteString + | ShiftByteString + | RotateByteString + | PopCountByteString + | TestBitByteString + | WriteBitByteString + | FindFirstSetByteString deriving stock (Show, Eq, Ord, Enum, Bounded, Generic, Ix) deriving anyclass (NFData, Hashable, PrettyBy PrettyConfigPlc) @@ -1470,6 +1486,7 @@ instance uni ~ DefaultUni => ToBuiltinMeaning uni DefaultFun where makeBuiltinMeaning BLS12_381.Pairing.finalVerify (runCostingFunTwoArguments . paramBls12_381_finalVerify) + -- Keccak_256, Blake2b_224 toBuiltinMeaning _ver Keccak_256 = makeBuiltinMeaning Hash.keccak_256 @@ -1478,6 +1495,61 @@ instance uni ~ DefaultUni => ToBuiltinMeaning uni DefaultFun where makeBuiltinMeaning Hash.blake2b_224 (runCostingFunOneArgument . 
paramBlake2b_224) + -- Bitwise + toBuiltinMeaning _ver IntegerToByteString = + makeBuiltinMeaning integerToByteStringPlc (\_ _ -> ExBudgetLast $ ExBudget 0 0) + where + integerToByteStringPlc :: SomeConstant uni Integer -> EvaluationResult BS.ByteString + integerToByteStringPlc (SomeConstant (Some (ValueOf uni n))) = do + DefaultUniInteger <- pure uni + case integerToByteString n of + Just bs -> pure $ bs + Nothing -> fail "negative integer passed to integerByteString" + {-# INLINE integerToByteStringPlc #-} + toBuiltinMeaning _ver ByteStringToInteger = + makeBuiltinMeaning + byteStringToInteger + (runCostingFunOneArgument . paramByteStringToInteger) + toBuiltinMeaning _ver AndByteString = + makeBuiltinMeaning + andByteString + (runCostingFunTwoArguments . paramAndByteString) + toBuiltinMeaning _ver IorByteString = + makeBuiltinMeaning + iorByteString + (runCostingFunTwoArguments . paramIorByteString) + toBuiltinMeaning _ver XorByteString = + makeBuiltinMeaning + xorByteString + (runCostingFunTwoArguments . paramXorByteString) + toBuiltinMeaning _ver ComplementByteString = + makeBuiltinMeaning + complementByteString + (runCostingFunOneArgument . paramComplementByteString) + toBuiltinMeaning _ver ShiftByteString = + makeBuiltinMeaning + shiftByteString + (runCostingFunTwoArguments . paramShiftByteString) + toBuiltinMeaning _ver RotateByteString = + makeBuiltinMeaning + rotateByteString + (runCostingFunTwoArguments . paramRotateByteString) + toBuiltinMeaning _ver PopCountByteString = + makeBuiltinMeaning + popCountByteString + (runCostingFunOneArgument . paramPopCountByteString) + toBuiltinMeaning _ver TestBitByteString = + makeBuiltinMeaning + testBitByteString + (runCostingFunTwoArguments . paramTestBitByteString) + toBuiltinMeaning _ver WriteBitByteString = + makeBuiltinMeaning + writeBitByteString + (runCostingFunThreeArguments . 
paramWriteBitByteString) + toBuiltinMeaning _ver FindFirstSetByteString = + makeBuiltinMeaning + findFirstSetByteString + (runCostingFunOneArgument . paramFindFirstSetByteString) -- See Note [Inlining meanings of builtins]. {-# INLINE toBuiltinMeaning #-} @@ -1585,6 +1657,18 @@ instance Flat DefaultFun where Bls12_381_finalVerify -> 70 Keccak_256 -> 71 Blake2b_224 -> 72 + IntegerToByteString -> 73 + ByteStringToInteger -> 74 + AndByteString -> 75 + IorByteString -> 76 + XorByteString -> 77 + ComplementByteString -> 78 + ShiftByteString -> 79 + RotateByteString -> 80 + PopCountByteString -> 81 + TestBitByteString -> 82 + WriteBitByteString -> 83 + FindFirstSetByteString -> 84 decode = go =<< decodeBuiltin where go 0 = pure AddInteger @@ -1660,6 +1744,19 @@ instance Flat DefaultFun where go 70 = pure Bls12_381_finalVerify go 71 = pure Keccak_256 go 72 = pure Blake2b_224 + go 73 = pure IntegerToByteString + go 74 = pure ByteStringToInteger + go 75 = pure AndByteString + go 76 = pure IorByteString + go 77 = pure XorByteString + go 78 = pure ComplementByteString + go 79 = pure ShiftByteString + go 80 = pure RotateByteString + go 81 = pure PopCountByteString + go 82 = pure TestBitByteString + go 83 = pure WriteBitByteString + go 84 = pure FindFirstSetByteString + go t = fail $ "Failed to decode builtin tag, got: " ++ show t size _ n = n + builtinTagWidth diff --git a/plutus-core/plutus-core/src/PlutusCore/Evaluation/Machine/BuiltinCostModel.hs b/plutus-core/plutus-core/src/PlutusCore/Evaluation/Machine/BuiltinCostModel.hs index dca1307c969..938d1eb3522 100644 --- a/plutus-core/plutus-core/src/PlutusCore/Evaluation/Machine/BuiltinCostModel.hs +++ b/plutus-core/plutus-core/src/PlutusCore/Evaluation/Machine/BuiltinCostModel.hs @@ -157,6 +157,19 @@ data BuiltinCostModelBase f = -- Keccak_256, Blake2b_224 , paramKeccak_256 :: f ModelOneArgument , paramBlake2b_224 :: f ModelOneArgument + -- Bitwise operations + , paramIntegerToByteString :: f ModelOneArgument + , 
paramByteStringToInteger :: f ModelOneArgument + , paramAndByteString :: f ModelTwoArguments + , paramIorByteString :: f ModelTwoArguments + , paramXorByteString :: f ModelTwoArguments + , paramComplementByteString :: f ModelOneArgument + , paramShiftByteString :: f ModelTwoArguments + , paramRotateByteString :: f ModelTwoArguments + , paramPopCountByteString :: f ModelOneArgument + , paramTestBitByteString :: f ModelTwoArguments + , paramWriteBitByteString :: f ModelThreeArguments + , paramFindFirstSetByteString :: f ModelOneArgument } deriving stock (Generic) deriving anyclass (FunctorB, TraversableB, ConstraintsB) diff --git a/plutus-core/plutus-core/src/PlutusCore/Evaluation/Machine/ExBudgetingDefaults.hs b/plutus-core/plutus-core/src/PlutusCore/Evaluation/Machine/ExBudgetingDefaults.hs index 67784d66d08..7283c46bfa5 100644 --- a/plutus-core/plutus-core/src/PlutusCore/Evaluation/Machine/ExBudgetingDefaults.hs +++ b/plutus-core/plutus-core/src/PlutusCore/Evaluation/Machine/ExBudgetingDefaults.hs @@ -198,5 +198,18 @@ unitCostBuiltinCostModel = BuiltinCostModelBase -- Keccak_256, Blake2b_224 , paramKeccak_256 = unitCostOneArgument , paramBlake2b_224 = unitCostOneArgument + -- Bitwise operations + , paramIntegerToByteString = unitCostOneArgument + , paramByteStringToInteger = unitCostOneArgument + , paramAndByteString = unitCostTwoArguments + , paramIorByteString = unitCostTwoArguments + , paramXorByteString = unitCostTwoArguments + , paramComplementByteString = unitCostOneArgument + , paramShiftByteString = unitCostTwoArguments + , paramRotateByteString = unitCostTwoArguments + , paramPopCountByteString = unitCostOneArgument + , paramTestBitByteString = unitCostTwoArguments + , paramWriteBitByteString = unitCostThreeArguments + , paramFindFirstSetByteString = unitCostOneArgument } diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/AndByteString.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/AndByteString.plc.golden new file mode 100644 
index 00000000000..01714abd65b --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/AndByteString.plc.golden @@ -0,0 +1 @@ +(fun (con bytestring) (fun (con bytestring) (con bytestring))) \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/ByteStringToInteger.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/ByteStringToInteger.plc.golden new file mode 100644 index 00000000000..6495d849f0e --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/ByteStringToInteger.plc.golden @@ -0,0 +1 @@ +(fun (con bytestring) (con integer)) \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/ComplementByteString.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/ComplementByteString.plc.golden new file mode 100644 index 00000000000..e8a4293b5b2 --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/ComplementByteString.plc.golden @@ -0,0 +1 @@ +(fun (con bytestring) (con bytestring)) \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/FindFirstSetByteString.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/FindFirstSetByteString.plc.golden new file mode 100644 index 00000000000..6495d849f0e --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/FindFirstSetByteString.plc.golden @@ -0,0 +1 @@ +(fun (con bytestring) (con integer)) \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/IntegerToByteString.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/IntegerToByteString.plc.golden new file mode 100644 index 00000000000..e0f1cb81a8e --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/IntegerToByteString.plc.golden @@ -0,0 +1 @@ +(fun (con integer) (con bytestring)) \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/IorByteString.plc.golden 
b/plutus-core/plutus-core/test/TypeSynthesis/Golden/IorByteString.plc.golden new file mode 100644 index 00000000000..01714abd65b --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/IorByteString.plc.golden @@ -0,0 +1 @@ +(fun (con bytestring) (fun (con bytestring) (con bytestring))) \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/PopCountByteString.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/PopCountByteString.plc.golden new file mode 100644 index 00000000000..6495d849f0e --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/PopCountByteString.plc.golden @@ -0,0 +1 @@ +(fun (con bytestring) (con integer)) \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/RotateByteString.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/RotateByteString.plc.golden new file mode 100644 index 00000000000..d98998e13d6 --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/RotateByteString.plc.golden @@ -0,0 +1 @@ +(fun (con bytestring) (fun (con integer) (con bytestring))) \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/ShiftByteString.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/ShiftByteString.plc.golden new file mode 100644 index 00000000000..d98998e13d6 --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/ShiftByteString.plc.golden @@ -0,0 +1 @@ +(fun (con bytestring) (fun (con integer) (con bytestring))) \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/TestBitByteString.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/TestBitByteString.plc.golden new file mode 100644 index 00000000000..a647f8f46a6 --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/TestBitByteString.plc.golden @@ -0,0 +1 @@ +(fun (con bytestring) (fun (con integer) (con bool))) \ No newline at end of 
file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/WriteBitByteString.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/WriteBitByteString.plc.golden new file mode 100644 index 00000000000..a4a93f8f64e --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/WriteBitByteString.plc.golden @@ -0,0 +1 @@ +(fun (con bytestring) (fun (con integer) (fun (con bool) (con bytestring)))) \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/XorByteString.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/XorByteString.plc.golden new file mode 100644 index 00000000000..01714abd65b --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/XorByteString.plc.golden @@ -0,0 +1 @@ +(fun (con bytestring) (fun (con bytestring) (con bytestring))) \ No newline at end of file diff --git a/plutus-core/plutus-ir/src/PlutusIR/Transform/CommuteFnWithConst.hs b/plutus-core/plutus-ir/src/PlutusIR/Transform/CommuteFnWithConst.hs index 8b2e2cfc24c..1d20f473204 100644 --- a/plutus-core/plutus-ir/src/PlutusIR/Transform/CommuteFnWithConst.hs +++ b/plutus-core/plutus-ir/src/PlutusIR/Transform/CommuteFnWithConst.hs @@ -147,3 +147,16 @@ isCommutative = \case MkPairData -> False MkNilData -> False MkNilPairData -> False + -- Bitwise operations + IntegerToByteString -> False + ByteStringToInteger -> False + AndByteString -> True + IorByteString -> True + XorByteString -> True + ComplementByteString -> False + ShiftByteString -> False + RotateByteString -> False + PopCountByteString -> False + TestBitByteString -> False + WriteBitByteString -> False + FindFirstSetByteString -> False diff --git a/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Bitwise.hs b/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Bitwise.hs new file mode 100644 index 00000000000..8f1b9cce42d --- /dev/null +++ b/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Bitwise.hs @@ -0,0 +1,1118 @@ +-- 
editorconfig-checker-disable-file +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE MultiWayIf #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TypeApplications #-} + +module Evaluation.Builtins.Bitwise ( + bitwiseAndCommutes, + bitwiseIorCommutes, + bitwiseXorCommutes, + bitwiseAndIdentity, + bitwiseIorIdentity, + bitwiseXorIdentity, + bitwiseAndAbsorbing, + bitwiseIorAbsorbing, + bitwiseXorComplement, + bitwiseAndSelf, + bitwiseIorSelf, + bitwiseXorSelf, + bitwiseAndAssociates, + bitwiseIorAssociates, + bitwiseXorAssociates, + bitwiseComplementSelfInverts, + bitwiseAndDeMorgan, + bitwiseIorDeMorgan, + popCountSingleByte, + popCountAppend, + testBitEmpty, + testBitSingleByte, + testBitAppend, + writeBitRead, + writeBitDouble, + writeBitAgreement, + ffsSingleByte, + ffsAppend, + rotateIdentity, + rotateIndexMotion, + rotateHomogenous, + rotateSum, + shiftIdentity, + shiftIndexMotion, + shiftHomogenous, + shiftSum, + iToBsRoundtrip, + bsToITrailing, + ) where + + +import Control.Lens.Fold (Fold, folding, has, hasn't, preview) +import Control.Monad (guard) +import Data.Bitraversable (bitraverse) +import Data.Bits (bit, complement, countTrailingZeros, popCount, shiftL, xor, zeroBits, (.&.), + (.|.)) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.Text (Text) +import Data.Word (Word8) +import Evaluation.Builtins.Common (typecheckEvaluateCek) +import GHC.Exts (fromListN, toList) +import Hedgehog (Gen, PropertyT, Range, annotate, annotateShow, cover, evalEither, failure, + forAllWith, success, (===)) +import Hedgehog.Gen qualified as Gen +import Hedgehog.Range qualified as Range +import PlutusCore (DefaultFun (AddInteger, AndByteString, AppendByteString, ByteStringToInteger, ComplementByteString, FindFirstSetByteString, IntegerToByteString, IorByteString, PopCountByteString, RotateByteString, ShiftByteString, TestBitByteString, WriteBitByteString, XorByteString), + DefaultUni, Error, EvaluationResult (EvaluationFailure, 
EvaluationSuccess), Name, + Term) +import PlutusCore.Evaluation.Machine.ExBudgetingDefaults (defaultBuiltinCostModel) +import PlutusCore.MkPlc (builtin, mkConstant, mkIterAppNoAnn) +import PlutusPrelude (def) +import Text.Show.Pretty (ppShow) +import UntypedPlutusCore qualified as Untyped + +bitwiseIorCommutes :: PropertyT IO () +bitwiseIorCommutes = commutative (.|.) IorByteString + +bitwiseAndCommutes :: PropertyT IO () +bitwiseAndCommutes = commutative (.&.) AndByteString + +bitwiseXorCommutes :: PropertyT IO () +bitwiseXorCommutes = commutative xor XorByteString + +bitwiseAndIdentity :: PropertyT IO () +bitwiseAndIdentity = identity (complement zeroBits) AndByteString + +bitwiseIorIdentity :: PropertyT IO () +bitwiseIorIdentity = identity zeroBits IorByteString + +bitwiseXorIdentity :: PropertyT IO () +bitwiseXorIdentity = identity zeroBits XorByteString + +bitwiseAndAbsorbing :: PropertyT IO () +bitwiseAndAbsorbing = absorbing zeroBits AndByteString + +bitwiseIorAbsorbing :: PropertyT IO () +bitwiseIorAbsorbing = absorbing (complement zeroBits) IorByteString + +bitwiseXorComplement :: PropertyT IO () +bitwiseXorComplement = do + bs <- forAllWith ppShow . Gen.bytes $ byteBoundRange + let len = BS.length bs + let allOnes = BS.replicate len . 
complement $ zeroBits + outcome1 <- goXor bs allOnes + outcome2 <- goComplement bs + case (outcome1, outcome2) of + (EvaluationSuccess res1, EvaluationSuccess res2) -> res1 === res2 + _ -> failure + where + goXor :: + ByteString -> + ByteString -> + PropertyT IO (EvaluationResult (Untyped.Term Name DefaultUni DefaultFun ())) + goXor leftArg rightArg = do + let leftArg' = mkConstant @ByteString () leftArg + let rightArg' = mkConstant @ByteString () rightArg + let comp = mkIterAppNoAnn (builtin () XorByteString) [leftArg', rightArg'] + cekEval comp + goComplement :: + ByteString -> + PropertyT IO (EvaluationResult (Untyped.Term Name DefaultUni DefaultFun ())) + goComplement bs = do + let bs' = mkConstant @ByteString () bs + let comp = mkIterAppNoAnn (builtin () ComplementByteString) [bs'] + cekEval comp + +bitwiseAndSelf :: PropertyT IO () +bitwiseAndSelf = self AndByteString + +bitwiseIorSelf :: PropertyT IO () +bitwiseIorSelf = self IorByteString + +bitwiseXorSelf :: PropertyT IO () +bitwiseXorSelf = do + bs <- forAllWith ppShow . Gen.bytes $ byteBoundRange + let len = BS.length bs + let bs' = mkConstant @ByteString () bs + let expected = mkConstant @ByteString () . BS.replicate len $ zeroBits + let comp = mkIterAppNoAnn (builtin () XorByteString) [bs', bs'] + outcome <- cekEval comp + case outcome of + EvaluationSuccess res -> res === expected + _ -> failure + +bitwiseAndAssociates :: PropertyT IO () +bitwiseAndAssociates = associative (.&.) AndByteString + +bitwiseIorAssociates :: PropertyT IO () +bitwiseIorAssociates = associative (.|.) IorByteString + +bitwiseXorAssociates :: PropertyT IO () +bitwiseXorAssociates = associative xor XorByteString + +bitwiseComplementSelfInverts :: PropertyT IO () +bitwiseComplementSelfInverts = do + bs <- forAllWith ppShow . 
Gen.bytes $ byteBoundRange + let bs' = mkConstant @ByteString () bs + let comp = mkIterAppNoAnn (builtin () ComplementByteString) [ + mkIterAppNoAnn (builtin () ComplementByteString) [bs'] + ] + outcome <- cekEval comp + case outcome of + EvaluationSuccess res -> res === mkConstant () bs + _ -> failure + +bitwiseAndDeMorgan :: PropertyT IO () +bitwiseAndDeMorgan = demorgan AndByteString IorByteString + +bitwiseIorDeMorgan :: PropertyT IO () +bitwiseIorDeMorgan = demorgan IorByteString AndByteString + +popCountSingleByte :: PropertyT IO () +popCountSingleByte = do + w8 <- forAllWith ppShow Gen.enumBounded + let bs = BS.singleton w8 + let expected :: Integer = fromIntegral . popCount $ w8 + let comp = mkIterAppNoAnn (builtin () PopCountByteString) [ + mkConstant @ByteString () bs + ] + outcome <- cekEval comp + case outcome of + EvaluationSuccess res -> res === mkConstant () expected + _ -> failure + +popCountAppend :: PropertyT IO () +popCountAppend = do + bs <- forAllWith ppShow . Gen.bytes $ byteBoundRange + bs' <- forAllWith ppShow . Gen.bytes $ byteBoundRange + let arg1 = mkConstant @ByteString () bs + let arg2 = mkConstant @ByteString () bs' + let comp1 = mkIterAppNoAnn (builtin () PopCountByteString) [ + mkIterAppNoAnn (builtin () AppendByteString) [arg1, arg2] + ] + let comp2 = mkIterAppNoAnn (builtin () AddInteger) [ + mkIterAppNoAnn (builtin () PopCountByteString) [arg1], + mkIterAppNoAnn (builtin () PopCountByteString) [arg2] + ] + outcome <- bitraverse cekEval cekEval (comp1, comp2) + case outcome of + (EvaluationSuccess res, EvaluationSuccess res') -> res === res' + _ -> failure + +testBitEmpty :: PropertyT IO () +testBitEmpty = do + ix <- forAllWith ppShow . 
Gen.integral $ indexRange + let arg = mkConstant @ByteString () "" + let comp = mkIterAppNoAnn (builtin () TestBitByteString) [ + arg, + mkConstant @Integer () ix + ] + outcome <- cekEval comp + case outcome of + EvaluationFailure -> success + _ -> failure + +testBitSingleByte :: PropertyT IO () +testBitSingleByte = do + w8 <- forAllWith ppShow Gen.enumBounded + let bs = BS.singleton w8 + ix <- forAllWith ppShow . Gen.integral . indexRangeOf $ 8 + cover 45 "out of bounds" $ ix < 0 || ix >= 8 + cover 45 "in-bounds" $ 0 <= ix && ix < 8 + let expected = bitAt w8 ix + let comp = mkIterAppNoAnn (builtin () TestBitByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () ix + ] + outcome <- cekEval comp + case (expected, outcome) of + (Nothing, EvaluationFailure) -> success + (Just b, EvaluationSuccess res) -> res === mkConstant @Bool () b + _ -> failure + +testBitAppend :: PropertyT IO () +testBitAppend = do + testCase <- forAllWith ppShow genBitAppendCase + cover 30 "out of bounds" . appendOutOfBounds $ testCase + cover 30 "in-bounds, first argument" . appendInBoundsFirst $ testCase + cover 30 "in-bounds, second argument" . 
appendInBoundsSecond $ testCase + let (x, y, ix) = getBitAppendArgs testCase + let arg1 = mkConstant @ByteString () x + let arg2 = mkConstant @ByteString () y + let argIx = mkConstant @Integer () ix + let comp = mkIterAppNoAnn (builtin () TestBitByteString) [ + mkIterAppNoAnn (builtin () AppendByteString) [arg1, arg2], + argIx + ] + let comp' = go x y ix + outcome <- bitraverse cekEval cekEval (comp, comp') + case outcome of + (EvaluationFailure, EvaluationFailure) -> success + (EvaluationSuccess res, EvaluationSuccess res') -> do + annotateShow res + annotateShow res' + res === res' + _ -> failure + where + go :: + ByteString -> + ByteString -> + Integer -> + Term Untyped.TyName Name DefaultUni DefaultFun () + go bs bs' ix = let len' = fromIntegral $ 8 * BS.length bs' in + case compare ix len' of + LT -> mkIterAppNoAnn (builtin () TestBitByteString) [ + mkConstant @ByteString () bs', + mkConstant @Integer () ix + ] + _ -> mkIterAppNoAnn (builtin () TestBitByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () (ix - len') + ] + +writeBitRead :: PropertyT IO () +writeBitRead = do + testCase <- forAllWith ppShow genWriteBitCase + cover 45 "out of bounds" . hasn't _WriteBitResult $ testCase + cover 45 "in-bounds" . has _WriteBitResult $ testCase + let (bs, ix, b) = getWriteBitArgs testCase + let expected = preview _WriteBitResult testCase + let bs' = mkConstant @ByteString () bs + let ix' = mkConstant @Integer () ix + let b' = mkConstant @Bool () b + let comp = mkIterAppNoAnn (builtin () TestBitByteString) [ + mkIterAppNoAnn (builtin () WriteBitByteString) [bs', ix', b'], + ix' + ] + outcome <- cekEval comp + case (expected, outcome) of + (Nothing, EvaluationFailure) -> success + (Just res, EvaluationSuccess res') -> mkConstant @Bool () res === res' + _ -> failure + +writeBitDouble :: PropertyT IO () +writeBitDouble = do + testCase <- forAllWith ppShow genWriteBitCase + cover 45 "out of bounds" . 
hasn't _WriteBitResult $ testCase + cover 45 "in-bounds" . has _WriteBitResult $ testCase + let (bs, ix, b) = getWriteBitArgs testCase + b' <- forAllWith ppShow Gen.enumBounded + let bs' = mkConstant @ByteString () bs + let ix' = mkConstant @Integer () ix + let writeTwice = mkIterAppNoAnn (builtin () WriteBitByteString) [ + mkIterAppNoAnn (builtin () WriteBitByteString) [bs', ix', mkConstant @Bool () b], + ix', + mkConstant @Bool () b' + ] + let writeOnce = mkIterAppNoAnn (builtin () WriteBitByteString) [ + bs', + ix', + mkConstant @Bool () b' + ] + outcome <- bitraverse cekEval cekEval (writeTwice, writeOnce) + case outcome of + (EvaluationFailure, EvaluationFailure) -> success + (EvaluationSuccess res, EvaluationSuccess res') -> res === res' + _ -> failure + +writeBitAgreement :: PropertyT IO () +writeBitAgreement = do + testCase <- forAllWith ppShow genWriteBitAgreementCase + let (bs, writeIx, readIx) = getWriteBitAgreementArgs testCase + cover 45 "read known zero" $ writeIx /= readIx + cover 45 "read known one" $ writeIx == readIx + let comp = mkIterAppNoAnn (builtin () TestBitByteString) [ + mkIterAppNoAnn (builtin () WriteBitByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () writeIx, + mkConstant @Bool () True + ], + mkConstant @Integer () readIx + ] + outcome <- cekEval comp + case outcome of + EvaluationSuccess res -> + if writeIx == readIx + then res === mkConstant @Bool () True + else res === mkConstant @Bool () False + _ -> failure + +ffsSingleByte :: PropertyT IO () +ffsSingleByte = do + w8 <- forAllWith ppShow Gen.enumBounded + let bs = BS.singleton w8 + let expected = case w8 of + 0 -> (-1) + _ -> fromIntegral . 
countTrailingZeros $ w8 + let comp = mkIterAppNoAnn (builtin () FindFirstSetByteString) [ + mkConstant @ByteString () bs + ] + outcome <- cekEval comp + case outcome of + EvaluationSuccess res -> res === mkConstant @Integer () expected + _ -> failure + +ffsAppend :: PropertyT IO () +ffsAppend = do + testCase <- forAllWith ppShow genFFSAppendCase + let which = ffsAppendType testCase + cover 30 "both arguments zero" $ which == ZeroBoth + cover 30 "second argument zero" $ which == ZeroSecond + cover 30 "second argument nonzero" $ which == NotZeroSecond + let (bs, bs') = getFFSAppendArgs testCase + let comp = mkIterAppNoAnn (builtin () FindFirstSetByteString) [ + mkIterAppNoAnn (builtin () AppendByteString) [ + mkConstant @ByteString () bs, + mkConstant @ByteString () bs' + ] + ] + let comp' = case which of + ZeroBoth -> mkConstant @Integer () (-1) + ZeroSecond -> let bitLen' = fromIntegral $ 8 * BS.length bs' in + mkIterAppNoAnn (builtin () AddInteger) [ + mkIterAppNoAnn (builtin () FindFirstSetByteString) [ + mkConstant @ByteString () bs + ], + mkConstant @Integer () bitLen' + ] + NotZeroSecond -> mkIterAppNoAnn (builtin () FindFirstSetByteString) [ + mkConstant @ByteString () bs' + ] + outcome <- bitraverse cekEval cekEval (comp, comp') + case outcome of + (EvaluationSuccess res, EvaluationSuccess res') -> res === res' + _ -> failure + +rotateIdentity :: PropertyT IO () +rotateIdentity = do + bs <- forAllWith ppShow . Gen.bytes $ byteBoundRange + let comp = mkIterAppNoAnn (builtin () RotateByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () 0 + ] + outcome <- cekEval comp + case outcome of + EvaluationSuccess res -> res === mkConstant () bs + _ -> failure + +shiftIdentity :: PropertyT IO () +shiftIdentity = do + bs <- forAllWith ppShow . 
Gen.bytes $ byteBoundRange + let comp = mkIterAppNoAnn (builtin () ShiftByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () 0 + ] + outcome <- cekEval comp + case outcome of + EvaluationSuccess res -> res === mkConstant () bs + _ -> failure + + +rotateIndexMotion :: PropertyT IO () +rotateIndexMotion = do + bs <- forAllWith ppShow . Gen.bytes $ byteBoundRange + w8 <- forAllWith ppShow Gen.enumBounded + let bs' = BS.cons w8 bs + let bitLen = fromIntegral $ BS.length bs' * 8 + i <- forAllWith ppShow . Gen.integral . indexRangeOf $ bitLen + readIx <- forAllWith ppShow . Gen.integral . indexRangeFor $ bitLen + let expectedReadIx = case signum i of + 1 -> let raw = readIx - i in + case signum raw of + (-1) -> bitLen + raw + _ -> raw + 0 -> readIx + _ -> (readIx - i) `rem` bitLen + let comp = mkIterAppNoAnn (builtin () TestBitByteString) [ + mkIterAppNoAnn (builtin () RotateByteString) [ + mkConstant @ByteString () bs', + mkConstant @Integer () i + ], + mkConstant @Integer () readIx + ] + let expected = mkIterAppNoAnn (builtin () TestBitByteString) [ + mkConstant @ByteString () bs', + mkConstant @Integer () expectedReadIx + ] + outcome <- bitraverse cekEval cekEval (expected, comp) + case outcome of + (EvaluationSuccess res, EvaluationSuccess actual) -> res === actual + _ -> failure + +shiftIndexMotion :: PropertyT IO () +shiftIndexMotion = do + bs <- forAllWith ppShow . Gen.bytes $ byteBoundRange + w8 <- forAllWith ppShow Gen.enumBounded + let bs' = BS.cons w8 bs + let bitLen = fromIntegral $ BS.length bs' * 8 + i <- forAllWith ppShow . Gen.integral . indexRangeOf $ bitLen + readIx <- forAllWith ppShow . Gen.integral . 
indexRangeFor $ bitLen + let comp = mkIterAppNoAnn (builtin () TestBitByteString) [ + mkIterAppNoAnn (builtin () ShiftByteString) [ + mkConstant @ByteString () bs', + mkConstant @Integer () i + ], + mkConstant @Integer () readIx + ] + let comp' = let expectedIx = readIx - i in + if | expectedIx < 0 -> mkConstant @Bool () False + | expectedIx >= bitLen -> mkConstant @Bool () False + | otherwise -> mkIterAppNoAnn (builtin () TestBitByteString) [ + mkConstant @ByteString () bs', + mkConstant @Integer () expectedIx + ] + outcome <- bitraverse cekEval cekEval (comp, comp') + case outcome of + (EvaluationSuccess res, EvaluationSuccess res') -> res === res' + _ -> failure + +rotateHomogenous :: PropertyT IO () +rotateHomogenous = do + w8 <- forAllWith ppShow . Gen.element $ [zeroBits, complement zeroBits] + cover 45 "all ones" $ w8 == complement zeroBits + cover 45 "all zeroes" $ w8 == zeroBits + len <- forAllWith ppShow . Gen.integral $ byteBoundRange + let bs = BS.replicate len w8 + rotation <- forAllWith ppShow . Gen.integral $ indexRange + let comp = mkIterAppNoAnn (builtin () RotateByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () rotation + ] + outcome <- cekEval comp + case outcome of + EvaluationSuccess res -> res === mkConstant @ByteString () bs + _ -> failure + +shiftHomogenous :: PropertyT IO () +shiftHomogenous = do + len <- forAllWith ppShow . Gen.integral $ byteBoundRange + i <- forAllWith ppShow . Gen.integral $ indexRange + let bs = BS.replicate len zeroBits + let comp = mkIterAppNoAnn (builtin () ShiftByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () i + ] + outcome <- cekEval comp + case outcome of + EvaluationSuccess res -> res === mkConstant @ByteString () bs + _ -> failure + +rotateSum :: PropertyT IO () +rotateSum = do + bs <- forAllWith ppShow . Gen.bytes $ byteBoundRange + i <- forAllWith ppShow . Gen.integral $ indexRange + j <- forAllWith ppShow . 
Gen.integral $ indexRange + let comp1 = mkIterAppNoAnn (builtin () RotateByteString) [ + mkIterAppNoAnn (builtin () RotateByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () i + ], + mkConstant @Integer () j + ] + let comp2 = mkIterAppNoAnn (builtin () RotateByteString) [ + mkConstant @ByteString () bs, + mkIterAppNoAnn (builtin () AddInteger) [ + mkConstant @Integer () i, + mkConstant @Integer () j + ] + ] + outcome <- bitraverse cekEval cekEval (comp1, comp2) + case outcome of + (EvaluationSuccess res, EvaluationSuccess res') -> res === res' + _ -> failure + +shiftSum :: PropertyT IO () +shiftSum = do + bs <- forAllWith ppShow . Gen.bytes $ byteBoundRange + ij <- forAllWith ppShow . Gen.integral $ indexRange + (i, j) <- forAllWith ppShow . genSplit $ ij + let comp1 = mkIterAppNoAnn (builtin () ShiftByteString) [ + mkIterAppNoAnn (builtin () ShiftByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () i + ], + mkConstant @Integer () j + ] + let comp2 = mkIterAppNoAnn (builtin () ShiftByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () ij + ] + outcome <- bitraverse cekEval cekEval (comp1, comp2) + case outcome of + (EvaluationSuccess res, EvaluationSuccess res') -> res === res' + _ -> failure + +iToBsRoundtrip :: PropertyT IO () +iToBsRoundtrip = do + i <- forAllWith ppShow . 
Gen.integral $ integerRange + let comp = mkIterAppNoAnn (builtin () ByteStringToInteger) [ + mkIterAppNoAnn (builtin () IntegerToByteString) [ + mkConstant @Integer () i + ] + ] + outcome <- cekEval comp + case outcome of + EvaluationSuccess res -> res === mkConstant @Integer () i + _ -> failure + +bsToITrailing :: PropertyT IO () +bsToITrailing = do + BsToITrailingCase extension bs <- forAllWith ppShow genBsToITrailingCase + let comp = mkIterAppNoAnn (builtin () ByteStringToInteger) [ + mkIterAppNoAnn (builtin () AppendByteString) [ + mkConstant @ByteString () bs, + mkConstant @ByteString () extension + ] + ] + let comp' = mkIterAppNoAnn (builtin () ByteStringToInteger) [ + mkConstant @ByteString () bs + ] + outcome <- bitraverse cekEval cekEval (comp, comp') + case outcome of + (EvaluationSuccess res, EvaluationSuccess res') -> res === res' + _ -> failure + +-- Helpers + +data BsToITrailingCase = BsToITrailingCase ByteString ByteString + deriving stock (Eq, Show) + +data WriteBitAgreementCase = + WriteBitReadSame Int Integer | + WriteBitReadDifferent Int Integer Integer + deriving stock (Eq, Show) + +getWriteBitAgreementArgs :: WriteBitAgreementCase -> (ByteString, Integer, Integer) +getWriteBitAgreementArgs = \case + WriteBitReadSame len ix -> (BS.replicate len zeroBits, ix, ix) + WriteBitReadDifferent len ix ix' -> (BS.replicate len zeroBits, ix, ix') + +data FFSAppendType = ZeroBoth | ZeroSecond | NotZeroSecond + deriving stock (Eq) + +data FFSAppendCase = + FFSAppendBothZero Int Int | + FFSAppendSecondZero ByteString Int | + FFSAppendSecondNonZero ByteString ByteString + deriving stock (Eq, Show) + +getFFSAppendArgs :: FFSAppendCase -> (ByteString, ByteString) +getFFSAppendArgs = \case + FFSAppendBothZero len len' -> (BS.replicate len zeroBits, BS.replicate len' zeroBits) + FFSAppendSecondZero bs len -> (bs, BS.replicate len zeroBits) + FFSAppendSecondNonZero bs bs' -> (bs, bs') + +ffsAppendType :: FFSAppendCase -> FFSAppendType +ffsAppendType = \case + 
FFSAppendBothZero{} -> ZeroBoth + FFSAppendSecondZero{} -> ZeroSecond + FFSAppendSecondNonZero{} -> NotZeroSecond + +data WriteBitCase = + WriteBitOutOfBounds ByteString Integer Bool | + WriteBitInBounds ByteString Integer Bool + deriving stock (Eq, Show) + +_WriteBitResult :: Fold WriteBitCase Bool +_WriteBitResult = folding $ \case + WriteBitInBounds _ _ b -> pure b + _ -> Nothing + +getWriteBitArgs :: WriteBitCase -> (ByteString, Integer, Bool) +getWriteBitArgs = \case + WriteBitOutOfBounds bs ix b -> (bs, ix, b) + WriteBitInBounds bs ix b -> (bs, ix, b) + +data BitAppendCase = + AppendOutOfBounds ByteString ByteString Integer | + AppendInBoundsFirst ByteString ByteString Integer | + AppendInBoundsSecond ByteString ByteString Integer + deriving stock (Eq, Show) + +appendOutOfBounds :: BitAppendCase -> Bool +appendOutOfBounds = \case + AppendOutOfBounds{} -> True + _ -> False + +appendInBoundsFirst :: BitAppendCase -> Bool +appendInBoundsFirst = \case + AppendInBoundsFirst{} -> True + _ -> False + +appendInBoundsSecond :: BitAppendCase -> Bool +appendInBoundsSecond = \case + AppendInBoundsSecond{} -> True + _ -> False + +getBitAppendArgs :: BitAppendCase -> (ByteString, ByteString, Integer) +getBitAppendArgs = \case + AppendOutOfBounds bs bs' ix -> (bs, bs', ix) + AppendInBoundsFirst bs bs' ix -> (bs, bs', ix) + AppendInBoundsSecond bs bs' ix -> (bs, bs', ix) + +bitAt :: Word8 -> Integer -> Maybe Bool +bitAt w8 ix = do + guard (ix >= 0) + guard (ix < 8) + let mask = bit 0 `shiftL` fromIntegral ix + pure $ case mask .&. w8 of + 0 -> False + _ -> True + +demorgan :: + DefaultFun -> + DefaultFun -> + PropertyT IO () +demorgan b b' = do + bs <- forAllWith ppShow . Gen.bytes $ byteBoundRange + let len = BS.length bs + bs' <- forAllWith ppShow . Gen.bytes . 
Range.singleton $ len + outcome <- demorganing b b' bs bs' + case outcome of + (EvaluationSuccess res1, EvaluationSuccess res2) -> res1 === res2 + _ -> failure + +demorganing :: + DefaultFun -> + DefaultFun -> + ByteString -> + ByteString -> + PropertyT IO (EvaluationResult (Untyped.Term Name DefaultUni DefaultFun ()), + EvaluationResult (Untyped.Term Name DefaultUni DefaultFun ())) +demorganing fun fun' x y = do + let x' = mkConstant @ByteString () x + let y' = mkConstant @ByteString () y + let comp = mkIterAppNoAnn (builtin () ComplementByteString) [ + mkIterAppNoAnn (builtin () fun) [x', y'] + ] + let comp' = mkIterAppNoAnn (builtin () fun') [ + mkIterAppNoAnn (builtin () ComplementByteString) [x'], + mkIterAppNoAnn (builtin () ComplementByteString) [y'] + ] + bitraverse cekEval cekEval (comp, comp') + +data AssociativeCase = + AssociativeMismatched ByteString ByteString ByteString | + AssociativeMatched ByteString ByteString ByteString ByteString + deriving stock (Eq, Show) + +getAssociativeArgs :: AssociativeCase -> (ByteString, ByteString, ByteString) +getAssociativeArgs = \case + AssociativeMismatched x y z -> (x, y, z) + AssociativeMatched x y z _ -> (x, y, z) + +_AssociativeResult :: Fold AssociativeCase ByteString +_AssociativeResult = folding $ \case + AssociativeMatched _ _ _ res -> pure res + _ -> Nothing + +associative :: + (Word8 -> Word8 -> Word8) -> + DefaultFun -> + PropertyT IO () +associative f b = do + testCase <- forAllWith ppShow . genAssociativeCase $ f + cover 45 "mismatched lengths" . hasn't _AssociativeResult $ testCase + cover 45 "matched lengths" . 
has _AssociativeResult $ testCase + let expectedMay = preview _AssociativeResult testCase + let (x, y, z) = getAssociativeArgs testCase + outcome <- associatively b x y z + case (outcome, expectedMay) of + ((EvaluationFailure, EvaluationFailure), Nothing) -> success + (_, Nothing) -> annotate "Unexpected failure" >> failure + ((EvaluationSuccess leftAssoc, EvaluationSuccess rightAssoc), Just expected) -> do + leftAssoc === rightAssoc + leftAssoc === mkConstant () expected + _ -> annotate "Unexpected failure" >> failure + +associatively :: + DefaultFun -> + ByteString -> + ByteString -> + ByteString -> + PropertyT IO (EvaluationResult (Untyped.Term Name DefaultUni DefaultFun ()), + EvaluationResult (Untyped.Term Name DefaultUni DefaultFun ())) +associatively fun x y z = do + let x' = mkConstant @ByteString () x + let y' = mkConstant @ByteString () y + let z' = mkConstant @ByteString () z + let leftAssoc = mkIterAppNoAnn (builtin () fun) [ + mkIterAppNoAnn (builtin () fun) [x', y'], + z' + ] + let rightAssoc = mkIterAppNoAnn (builtin () fun) [ + x', + mkIterAppNoAnn (builtin () fun) [y', z'] + ] + bitraverse cekEval cekEval (leftAssoc, rightAssoc) + +self :: DefaultFun -> PropertyT IO () +self b = do + bs <- forAllWith ppShow . Gen.bytes $ byteBoundRange + let bs' = mkConstant @ByteString () bs + let comp = mkIterAppNoAnn (builtin () b) [bs', bs'] + outcome <- cekEval comp + case outcome of + EvaluationSuccess res -> res === mkConstant @ByteString () bs + _ -> failure + +data AbsorbingCase = + AbsorbingMismatched ByteString Int Word8 | + AbsorbingMatched ByteString Word8 + deriving stock (Eq, Show) + +_AbsorbingResult :: Fold AbsorbingCase ByteString +_AbsorbingResult = folding $ \case + AbsorbingMatched bs w8 -> pure . 
BS.replicate (BS.length bs) $ w8 + _ -> Nothing + +getAbsorbingArgs :: AbsorbingCase -> (ByteString, ByteString) +getAbsorbingArgs = \case + AbsorbingMismatched bs len w8 -> (bs, BS.replicate len w8) + AbsorbingMatched bs w8 -> (bs, BS.replicate (BS.length bs) w8) + +absorbing :: + Word8 -> + DefaultFun -> + PropertyT IO () +absorbing w8 b = do + testCase <- forAllWith ppShow . genAbsorbingCase $ w8 + cover 45 "mismatched lengths" . hasn't _AbsorbingResult $ testCase + cover 45 "matched lengths" . has _AbsorbingResult $ testCase + let expectedMay = preview _AbsorbingResult testCase + let (leftArg, rightArg) = getAbsorbingArgs testCase + outcome <- commutatively b leftArg rightArg + case (outcome, expectedMay) of + ((EvaluationFailure, EvaluationFailure), Nothing) -> success + (_, Nothing) -> do + annotate "Unexpected success" + failure + ((EvaluationSuccess l2r, EvaluationSuccess r2l), Just expected) -> do + l2r === r2l + l2r === mkConstant () expected + _ -> do + annotate "Unexpected failure" + failure + +data IdentityCase = + IdentityMismatched ByteString Int Word8 | + IdentityMatched ByteString Word8 + deriving stock (Eq, Show) + +_IdentityResult :: Fold IdentityCase ByteString +_IdentityResult = folding $ \case + IdentityMatched res _ -> pure res + _ -> Nothing + +getIdentityArgs :: IdentityCase -> (ByteString, ByteString) +getIdentityArgs = \case + IdentityMismatched bs len w8 -> (bs, BS.replicate len w8) + IdentityMatched bs w8 -> (bs, BS.replicate (BS.length bs) w8) + +identity :: + Word8 -> + DefaultFun -> + PropertyT IO () +identity w8 b = do + testCase <- forAllWith ppShow . genIdentityCase $ w8 + cover 45 "mismatched lengths" . hasn't _IdentityResult $ testCase + cover 45 "matched lengths" . 
has _IdentityResult $ testCase + let expectedMay = preview _IdentityResult testCase + let (leftArg, rightArg) = getIdentityArgs testCase + outcome <- commutatively b leftArg rightArg + case (outcome, expectedMay) of + ((EvaluationFailure, EvaluationFailure), Nothing) -> success + (_, Nothing) -> do + annotate "Unexpected success" + failure + ((EvaluationSuccess l2r, EvaluationSuccess r2l), Just expected) -> do + l2r === r2l + l2r === mkConstant () expected + _ -> do + annotate "Unexpected failure" + failure + +data CommutativeCase = + MismatchedLengths ByteString ByteString | + MatchedLengths ByteString ByteString ByteString + deriving stock (Eq, Show) + +getArgs :: CommutativeCase -> (ByteString, ByteString) +getArgs = \case + MismatchedLengths bs bs' -> (bs, bs') + MatchedLengths bs bs' _ -> (bs, bs') + +_CommutativeResult :: Fold CommutativeCase ByteString +_CommutativeResult = folding $ \case + MatchedLengths _ _ res -> pure res + _ -> Nothing + +commutative :: + (Word8 -> Word8 -> Word8) -> + DefaultFun -> + PropertyT IO () +commutative f b = do + testCase <- forAllWith ppShow . genCommutativeCase $ f + cover 45 "mismatched lengths" . hasn't _CommutativeResult $ testCase + cover 45 "matched lengths" . 
has _CommutativeResult $ testCase + let expectedMay = preview _CommutativeResult testCase + let (leftArg, rightArg) = getArgs testCase + outcome <- commutatively b leftArg rightArg + case (outcome, expectedMay) of + ((EvaluationFailure, EvaluationFailure), Nothing) -> success + (_, Nothing) -> do + annotate "Unexpected success" + failure + ((EvaluationSuccess l2r, EvaluationSuccess r2l), Just expected) -> do + l2r === r2l + l2r === mkConstant () expected + _ -> do + annotate "Unexpected failure" + failure + +commutatively :: + DefaultFun -> + ByteString -> + ByteString -> + PropertyT IO (EvaluationResult (Untyped.Term Name DefaultUni DefaultFun ()), + EvaluationResult (Untyped.Term Name DefaultUni DefaultFun ())) +commutatively fun leftArg rightArg = do + let leftArg' = mkConstant @ByteString () leftArg + let rightArg' = mkConstant @ByteString () rightArg + let oneDirection = go leftArg' rightArg' + let otherDirection = go rightArg' leftArg' + bitraverse cekEval cekEval (oneDirection, otherDirection) + where + go :: Term Untyped.TyName Name DefaultUni DefaultFun () -> + Term Untyped.TyName Name DefaultUni DefaultFun () -> + Term Untyped.TyName Name DefaultUni DefaultFun () + go arg1 arg2 = mkIterAppNoAnn (builtin () fun) [arg1, arg2] + +cekEval :: + Term Untyped.TyName Name DefaultUni DefaultFun () -> + PropertyT IO (EvaluationResult (Untyped.Term Name DefaultUni DefaultFun ())) +cekEval = fmap fst . evalEither . 
cekEval' + +cekEval' :: + Term Untyped.TyName Name DefaultUni DefaultFun () -> + Either (Error DefaultUni DefaultFun ()) + (EvaluationResult (Untyped.Term Name DefaultUni DefaultFun ()), [Text]) +cekEval' = typecheckEvaluateCek def defaultBuiltinCostModel + +-- Generators + +genBsToITrailingCase :: Gen BsToITrailingCase +genBsToITrailingCase = go + where + go :: Gen BsToITrailingCase + go = do + len <- Gen.integral byteBoundRange + extLen <- Gen.integral byteBoundRange + BsToITrailingCase (BS.replicate extLen zeroBits) <$> + case len of + 0 -> pure BS.empty + _ -> Gen.choice [pure . powerOf2 $ len, notPowerOf2 len] + powerOf2 :: Int -> ByteString + powerOf2 len = BS.cons 128 . BS.replicate (len - 1) $ zeroBits + notPowerOf2 :: Int -> Gen ByteString + notPowerOf2 len = + BS.cons <$> Gen.element [0 :: Word8 .. 127] <*> + (Gen.bytes . Range.singleton $ len - 1) + +genWriteBitAgreementCase :: Gen WriteBitAgreementCase +genWriteBitAgreementCase = do + len <- Gen.integral . Range.linear 1 $ 64 + Gen.choice [same len, different len] + where + same :: Int -> Gen WriteBitAgreementCase + same len = do + let bitLen = fromIntegral $ len * 8 + ix <- Gen.integral . indexRangeFor $ bitLen + pure . WriteBitReadSame len $ ix + different :: Int -> Gen WriteBitAgreementCase + different len = do + let bitLen = fromIntegral $ len * 8 + readIx <- Gen.integral . indexRangeFor $ bitLen + writeIx <- Gen.filter (readIx /=) . Gen.integral . indexRangeFor $ bitLen + pure . WriteBitReadDifferent len writeIx $ readIx + +genCommutativeCase :: (Word8 -> Word8 -> Word8) -> Gen CommutativeCase +genCommutativeCase f = Gen.choice [mismatched, matched] + where + mismatched :: Gen CommutativeCase + mismatched = do + leftArg <- Gen.bytes byteBoundRange + rightArg <- Gen.bytes byteBoundRange + if BS.length leftArg /= BS.length rightArg + then pure . MismatchedLengths leftArg $ rightArg + else do + let diff = BS.length leftArg - BS.length rightArg + extension <- Gen.bytes . 
diffRange $ diff + let leftArg' = leftArg <> extension + Gen.element [MismatchedLengths leftArg' rightArg, + MismatchedLengths rightArg leftArg'] + matched :: Gen CommutativeCase + matched = do + leftArg <- Gen.bytes byteBoundRange + let len = BS.length leftArg + rightArg <- Gen.bytes . Range.singleton $ len + let result = fromListN len . BS.zipWith f leftArg $ rightArg + pure . MatchedLengths leftArg rightArg $ result + +genIdentityCase :: Word8 -> Gen IdentityCase +genIdentityCase w8 = Gen.choice [mismatched, matched] + where + mismatched :: Gen IdentityCase + mismatched = do + bs <- Gen.bytes byteBoundRange + let len = BS.length bs + genLen <- Gen.filter (/= len) . Gen.int $ byteBoundRange + pure . IdentityMismatched bs genLen $ w8 + matched :: Gen IdentityCase + matched = do + bs <- Gen.bytes byteBoundRange + pure . IdentityMatched bs $ w8 + +genAbsorbingCase :: Word8 -> Gen AbsorbingCase +genAbsorbingCase w8 = Gen.choice [mismatched, matched] + where + mismatched :: Gen AbsorbingCase + mismatched = do + bs <- Gen.bytes byteBoundRange + let len = BS.length bs + genLen <- Gen.filter (/= len) . Gen.int $ byteBoundRange + pure . AbsorbingMismatched bs genLen $ w8 + matched :: Gen AbsorbingCase + matched = do + bs <- Gen.bytes byteBoundRange + pure . AbsorbingMatched bs $ w8 + +genAssociativeCase :: (Word8 -> Word8 -> Word8) -> Gen AssociativeCase +genAssociativeCase f = Gen.choice [mismatched, matched] + where + mismatched :: Gen AssociativeCase + mismatched = do + x <- Gen.bytes byteBoundRange + y <- Gen.bytes byteBoundRange + z <- Gen.bytes byteBoundRange + if BS.length x == BS.length y && BS.length y == BS.length z + then do + extension <- Gen.bytes . diffRange $ 5 + let x' = x <> extension + Gen.element [AssociativeMismatched x' y z, + AssociativeMismatched y x' z, + AssociativeMismatched y z x'] + else pure . AssociativeMismatched x y $ z + matched :: Gen AssociativeCase + matched = do + x <- Gen.bytes byteBoundRange + let len = BS.length x + y <- Gen.bytes . 
Range.singleton $ len + z <- Gen.bytes . Range.singleton $ len + let result = fromListN len . zipWith f (toList x) . BS.zipWith f y $ z + pure . AssociativeMatched x y z $ result + +genBitAppendCase :: Gen BitAppendCase +genBitAppendCase = Gen.choice [oob, inBounds1, inBounds2] + where + oob :: Gen BitAppendCase + oob = do + bs <- Gen.bytes byteBoundRange + bs' <- Gen.bytes byteBoundRange + let len = fromIntegral $ 8 * (BS.length bs + BS.length bs') + ix <- Gen.choice [tooLowIx len, tooHighIx len] + pure . AppendOutOfBounds bs bs' $ ix + inBounds1 :: Gen BitAppendCase + inBounds1 = do + bs <- Gen.bytes byteBoundRange + w8 <- Gen.enumBounded + let firstArg = BS.cons w8 bs + bs' <- Gen.bytes byteBoundRange + let len = fromIntegral $ 8 * BS.length firstArg + let len' = fromIntegral $ 8 * BS.length bs' + ix <- (len' +) <$> (Gen.integral . indexRangeFor $ len) + pure . AppendInBoundsFirst firstArg bs' $ ix + inBounds2 :: Gen BitAppendCase + inBounds2 = do + bs <- Gen.bytes byteBoundRange + bs' <- Gen.bytes byteBoundRange + w8 <- Gen.enumBounded + let secondArg = BS.cons w8 bs' + let len' = fromIntegral $ 8 * BS.length secondArg + ix <- Gen.integral . indexRangeFor $ len' + pure . AppendInBoundsSecond bs secondArg $ ix + +genWriteBitCase :: Gen WriteBitCase +genWriteBitCase = Gen.choice [oob, inBounds] + where + oob :: Gen WriteBitCase + oob = do + bs <- Gen.bytes byteBoundRange + let len = fromIntegral $ 8 * BS.length bs + b <- Gen.enumBounded + ix <- Gen.choice [tooLowIx len, tooHighIx len] + pure . WriteBitOutOfBounds bs ix $ b + inBounds :: Gen WriteBitCase + inBounds = do + bs <- Gen.bytes byteBoundRange + w8 <- Gen.enumBounded + let bs' = BS.cons w8 bs + let len = fromIntegral $ 8 * BS.length bs' + b <- Gen.enumBounded + ix <- Gen.integral . indexRangeFor $ len + pure . 
WriteBitInBounds bs' ix $ b + +genFFSAppendCase :: Gen FFSAppendCase +genFFSAppendCase = Gen.choice [allZero, secondZero, secondNonZero] + where + allZero :: Gen FFSAppendCase + allZero = do + len <- Gen.integral . Range.linear 0 $ 63 + len' <- Gen.integral . Range.linear 0 $ 63 + pure . FFSAppendBothZero len $ len' + secondZero :: Gen FFSAppendCase + secondZero = do + bs <- Gen.bytes byteBoundRange + w8 <- Gen.filter (/= zeroBits) Gen.enumBounded + let firstArg = BS.cons w8 bs + len' <- Gen.integral . Range.linear 0 $ 63 + pure . FFSAppendSecondZero firstArg $ len' + secondNonZero :: Gen FFSAppendCase + secondNonZero = do + bs <- Gen.bytes byteBoundRange + w8 <- Gen.filter (/= zeroBits) Gen.enumBounded + bs' <- Gen.bytes byteBoundRange + w8' <- Gen.filter (/= zeroBits) Gen.enumBounded + pure . FFSAppendSecondNonZero (BS.cons w8 bs) . BS.cons w8' $ bs' + +tooLowIx :: Integer -> Gen Integer +tooLowIx = Gen.integral . Range.linear (-1) . negate + +tooHighIx :: Integer -> Gen Integer +tooHighIx i = Gen.integral . Range.linear i $ i * 2 + +genSplit :: Integer -> Gen (Integer, Integer) +genSplit ij = Gen.element $ case signum ij of + 1 -> [(i, j) | i <- [0 .. ij], j <- [0 .. ij], i + j == ij] + 0 -> [(0, 0)] + _ -> [(i, j) | i <- [0, (-1) .. ij], j <- [0, (-1) .. 
ij], i + j == ij] + +-- Ranges + +byteBoundRange :: Range Int +byteBoundRange = Range.linear 0 64 + +diffRange :: Int -> Range Int +diffRange diff = let param = abs diff + 1 in + Range.linear param (param * 2) + +indexRange :: Range Integer +indexRange = Range.linearFrom 0 (-100) 100 + +indexRangeOf :: Integer -> Range Integer +indexRangeOf lim = Range.constantFrom 0 (negate lim) (lim - 1) + +indexRangeFor :: Integer -> Range Integer +indexRangeFor i = Range.constant 0 (i - 1) + +integerRange :: Range Integer +integerRange = Range.linear 0 200 diff --git a/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Definition.hs b/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Definition.hs index 91ab2302e31..8dfa4d90151 100644 --- a/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Definition.hs +++ b/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Definition.hs @@ -39,12 +39,12 @@ import PlutusCore.StdLib.Data.ScottList qualified as Scott import PlutusCore.StdLib.Data.ScottUnit qualified as Scott import PlutusCore.StdLib.Data.Unit +import Evaluation.Builtins.Bitwise import Evaluation.Builtins.BLS12_381 (test_BLS12_381) import Evaluation.Builtins.Common import Evaluation.Builtins.SignatureVerification (ecdsaSecp256k1Prop, ed25519_V1Prop, ed25519_V2Prop, schnorrSecp256k1Prop) - import Control.Exception import Data.ByteString (ByteString) import Data.DList qualified as DList @@ -747,9 +747,144 @@ test_SignatureVerification = "Schnorr verification behaves correctly on all inputs" "schnorr_correct" . property $ schnorrSecp256k1Prop + ], + testGroup "Ed25519 signatures (V1)" [ + testPropertyNamed "Ed25519_V1 verification behaves correctly on all inputs" "ed25519_V1_correct" . property $ ed25519_V1Prop + ], + testGroup "Ed25519 signatures (V2)" [ + testPropertyNamed "Ed25519_V2 verification behaves correctly on all inputs" "ed25519_V2_correct" . 
property $ ed25519_V2Prop + ], + testGroup "Signatures on the SECP256k1 curve" [ + testPropertyNamed "ECDSA verification behaves correctly on all inputs" "ecdsa_correct" . property $ ecdsaSecp256k1Prop, + testPropertyNamed "Schnorr verification behaves correctly on all inputs" "schnorr_correct" . property $ schnorrSecp256k1Prop ] ] +-- Test the bitwise builtins are behaving correctly +test_Bitwise :: TestTree +test_Bitwise = + adjustOption (\x -> max x . HedgehogTestLimit . Just $ 8000) . + testGroup "Bitwise operations" $ [ + testAndByteString, + testIorByteString, + testXorByteString, + testComplementByteString, + testPopCountByteString, + testTestBitByteString, + testWriteBitByteString, + testFindFirstSetByteString, + testRotateByteString, + testShiftByteString, + testIntegerToByteString, + testByteStringToInteger + ] + +-- Tests for bitwise AND on ByteStrings +testAndByteString :: TestTree +testAndByteString = testGroup "AndByteString" [ + testPropertyNamed "Commutativity" "and_commutes" . property $ bitwiseAndCommutes, + testPropertyNamed "Associativity" "and_associates" . property $ bitwiseAndAssociates, + testPropertyNamed "All-1s is an identity" "and_all_1_identity" . property $ bitwiseAndIdentity, + testPropertyNamed "All-0s is absorbing" "and_all_0_absorb" . property $ bitwiseAndAbsorbing, + testPropertyNamed "AND with yourself does nothing" "and_self_nothing" . property $ bitwiseAndSelf, + testPropertyNamed "De Morgan's law" "and_demorgan" . property $ bitwiseAndDeMorgan + ] + +-- Tests for bitwise IOR on ByteStrings +testIorByteString :: TestTree +testIorByteString = testGroup "IorByteString" [ + testPropertyNamed "Commutativity" "ior_commutes" . property $ bitwiseIorCommutes, + testPropertyNamed "Associativity" "ior_associates" . property $ bitwiseIorAssociates, + testPropertyNamed "All-0s is an identity" "ior_all_0_identity" . property $ bitwiseIorIdentity, + testPropertyNamed "All-1s is absorbing" "ior_all_1_absorb" . 
property $ bitwiseIorAbsorbing, + testPropertyNamed "IOR with yourself does nothing" "ior_self_nothing" . property $ bitwiseIorSelf, + testPropertyNamed "De Morgan's law" "ior_demorgan" . property $ bitwiseIorDeMorgan + ] + +-- Tests for bitwise XOR on ByteStrings +testXorByteString :: TestTree +testXorByteString = testGroup "XorByteString" [ + testPropertyNamed "Commutativity" "xor_commutes" . property $ bitwiseXorCommutes, + testPropertyNamed "Associativity" "xor_associates" . property $ bitwiseXorAssociates, + testPropertyNamed "All-0s is an identity" "xor_all_0_identity" . property $ bitwiseXorIdentity, + testPropertyNamed "XOR with all 1s is complement" "xor_all_1_complement" . property $ bitwiseXorComplement, + testPropertyNamed "XOR with yourself gives all-0" "xor_self_all_0" . property $ bitwiseXorSelf + ] + +-- Tests for bitwise complement on ByteStrings +testComplementByteString :: TestTree +testComplementByteString = testGroup "ComplementByteString" [ + testPropertyNamed "Self-inversion" "complement_self_inversion" . property $ bitwiseComplementSelfInverts + ] + +-- Tests for population count on ByteStrings +testPopCountByteString :: TestTree +testPopCountByteString = testGroup "PopCountByteString" [ + testCase "popcount of empty ByteString is 0" $ do + let arg = mkConstant @ByteString () "" + let comp = mkIterAppNoAnn (builtin () PopCountByteString) [ arg ] + typecheckEvaluateCekNoEmit def defaultBuiltinCostModel comp @?= Right (EvaluationSuccess . mkConstant @Integer () $ 0), + testPropertyNamed "popcount of singleton ByteString is correct" "popcount_singleton" . property $ popCountSingleByte, + testPropertyNamed "popcount of append is sum of popcounts" "popcount_append_sum" . property $ popCountAppend + ] + +-- Tests for bit indexing into a ByteString +testTestBitByteString :: TestTree +testTestBitByteString = testGroup "TestBitByteString" [ + testPropertyNamed "any index on an empty ByteString fails" "test_bit_empty" . 
property $ testBitEmpty, + testPropertyNamed "indexing on singletons works correctly" "test_bit_singleton" . property $ testBitSingleByte, + testPropertyNamed "indexing appends agrees with components" "test_bit_agreement" . property $ testBitAppend + ] + +-- Tests for bit setting or clearing of a ByteString +testWriteBitByteString :: TestTree +testWriteBitByteString = testGroup "WriteBitByteString" [ + testPropertyNamed "writing then reading gives back what you wrote" "write_bit_read" . property $ writeBitRead, + testPropertyNamed "second write wins" "write_bit_write" . property $ writeBitDouble, + testPropertyNamed "single write to zeroes gives right reads" "write_bit_agreement" . property $ writeBitAgreement + ] + +-- Tests for finding first set bit of a ByteString +testFindFirstSetByteString :: TestTree +testFindFirstSetByteString = testGroup "FindFirstSetByteString" [ + testCase "find first set of empty Bytestring is -1" $ do + let arg = mkConstant @ByteString () "" + let comp = mkIterAppNoAnn (builtin () FindFirstSetByteString) [ arg ] + typecheckEvaluateCekNoEmit def defaultBuiltinCostModel comp @?= Right (EvaluationSuccess . mkConstant @Integer () $ (-1)), + testPropertyNamed "find first set on singletons works correctly" "ffs_singleton" . property $ ffsSingleByte, + testPropertyNamed "find first set on appended ByteStrings works correctly" "ffs_append" . property $ ffsAppend + ] + +-- Tests for ByteString rotations +testRotateByteString :: TestTree +testRotateByteString = testGroup "RotateByteString" [ + testPropertyNamed "rotating by 0 does nothing" "rotate_0_nothing" . property $ rotateIdentity, + testPropertyNamed "rotation adjusts indices correctly" "rotate_adjust" . property $ rotateIndexMotion, + testPropertyNamed "rotating all-zero or all-one changes nothing" "rotate_homogenous" . property $ rotateHomogenous, + testPropertyNamed "rotating by i, then by j is the same as rotating by i + j" "rotate_sum" . 
property $ rotateSum + ] + +-- Tests for ByteString shifts +testShiftByteString :: TestTree +testShiftByteString = testGroup "ShiftByteString" [ + testPropertyNamed "shifting by 0 does nothing" "shift_0_nothing" . property $ shiftIdentity, + testPropertyNamed "shifting adjusts indices correctly" "shift_adjust" . property $ shiftIndexMotion, + testPropertyNamed "shifting all-zeroes does nothing" "shift_homogenous" . property $ shiftHomogenous, + testPropertyNamed "shifting in two steps is the same as shifting in one" "shift_sum" . property $ shiftSum + ] + +-- Tests for conversion into ByteString from Integer +testIntegerToByteString :: TestTree +testIntegerToByteString = testGroup "IntegerToByteString" [ + testPropertyNamed "Round trip" "i_to_bs_roundtrip" . property $ iToBsRoundtrip + ] + +-- Tests for conversion into Integer from ByteString +testByteStringToInteger :: TestTree +testByteStringToInteger = testGroup "ByteStringToInteger" [ + testPropertyNamed "trailing zeros ignored" "bs_to_i_trailing" . 
property $ bsToITrailing + ] + test_definition :: TestTree test_definition = testGroup "definition" @@ -778,6 +913,7 @@ test_definition = , test_Data , test_Crypto , test_HashSizes + , test_Bitwise , test_SignatureVerification , test_BLS12_381 , test_Other diff --git a/plutus-ledger-api/src/PlutusLedgerApi/Common/Versions.hs b/plutus-ledger-api/src/PlutusLedgerApi/Common/Versions.hs index f2552fe4cfc..9337574390f 100644 --- a/plutus-ledger-api/src/PlutusLedgerApi/Common/Versions.hs +++ b/plutus-ledger-api/src/PlutusLedgerApi/Common/Versions.hs @@ -109,7 +109,12 @@ builtinsIntroducedIn = Map.fromList [ Bls12_381_G2_equal, Bls12_381_G2_hashToGroup, Bls12_381_G2_compress, Bls12_381_G2_uncompress, Bls12_381_millerLoop, Bls12_381_mulMlResult, Bls12_381_finalVerify, - Keccak_256, Blake2b_224 + Keccak_256, Blake2b_224, + IntegerToByteString, ByteStringToInteger, + AndByteString, IorByteString, XorByteString, ComplementByteString, + ShiftByteString, RotateByteString, + TestBitByteString, WriteBitByteString, + PopCountByteString, FindFirstSetByteString ]) ] diff --git a/plutus-tx/src/PlutusTx/Builtins.hs b/plutus-tx/src/PlutusTx/Builtins.hs index bd708e4afe8..0c9be5ef95d 100644 --- a/plutus-tx/src/PlutusTx/Builtins.hs +++ b/plutus-tx/src/PlutusTx/Builtins.hs @@ -4,8 +4,8 @@ -- | Primitive names and functions for working with Plutus Core builtins. 
module PlutusTx.Builtins ( - -- * Bytestring builtins - BuiltinByteString + -- * Bytestring builtins + BuiltinByteString , appendByteString , consByteString , sliceByteString @@ -96,6 +96,19 @@ module PlutusTx.Builtins ( , bls12_381_millerLoop , bls12_381_mulMlResult , bls12_381_finalVerify + -- * Bitwise builtins + , integerToByteString + , byteStringToInteger + , andByteString + , iorByteString + , xorByteString + , complementByteString + , shiftByteString + , rotateByteString + , popCountByteString + , testBitByteString + , writeBitByteString + , findFirstSetByteString -- * Conversions , fromBuiltin , toBuiltin @@ -283,6 +296,90 @@ verifySchnorrSecp256k1Signature verifySchnorrSecp256k1Signature vk msg sig = fromBuiltin (BI.verifySchnorrSecp256k1Signature vk msg sig) +-- | Converts a non-negative 'Integer' into its base-256 'BuiltinByteString' representation. +-- The format is little-endian, i.e. the first byte is the least significant. +-- The inverse of this is 'byteStringToInteger'. +-- The output does not contain any trailing zero-bytes, hence zeros are empty bytestrings. +-- If the input is negative, this function errs. +{-# INLINEABLE integerToByteString #-} +integerToByteString :: Integer -> BuiltinByteString +integerToByteString i = BI.integerToByteString (toBuiltin i) + +-- | Converts a base-256 'BuiltinByteString' into its 'Integer' representation. +-- The format is little-endian, i.e. the first byte is the least significant. +-- The inverse of this is 'integerToByteString'. +-- The input can contain trailing zero-bytes. +{-# INLINEABLE byteStringToInteger #-} +byteStringToInteger :: BuiltinByteString -> Integer +byteStringToInteger bs = fromBuiltin (BI.byteStringToInteger bs) + +-- | If given bytestrings of equal length, constructs their bitwise logical +-- AND, erring otherwise. 
+{-# INLINEABLE andByteString #-}
+andByteString :: BuiltinByteString -> BuiltinByteString -> BuiltinByteString
+andByteString = BI.andByteString
+
+-- | If given bytestrings of equal length, constructs their bitwise logical
+-- OR, erring otherwise.
+{-# INLINEABLE iorByteString #-}
+iorByteString :: BuiltinByteString -> BuiltinByteString -> BuiltinByteString
+iorByteString = BI.iorByteString
+
+-- | If given bytestrings of equal length, constructs their bitwise logical
+-- XOR, erring otherwise.
+{-# INLINEABLE xorByteString #-}
+xorByteString :: BuiltinByteString -> BuiltinByteString -> BuiltinByteString
+xorByteString = BI.xorByteString
+
+-- | Constructs the complement of the argument bytestring,
+-- i.e. each bit is flipped.
+{-# INLINEABLE complementByteString #-}
+complementByteString :: BuiltinByteString -> BuiltinByteString
+complementByteString = BI.complementByteString
+
+-- | Shifts the input bytestring left by the specified (possibly negative) amount.
+-- If positive, shifts left/to higher significance.
+-- If negative, shifts right/to lower significance.
+-- The shift is **not** arithmetic. You can emulate an arithmetic
+-- shift by OR-ing with what is morally -1 left-shifted the appropriate amount.
+-- The output is not trimmed, hence trailing zero-bytes may remain.
+{-# INLINEABLE shiftByteString #-}
+shiftByteString :: BuiltinByteString -> Integer -> BuiltinByteString
+shiftByteString bs i = BI.shiftByteString bs (toBuiltin i)
+
+-- | Rotates the input bytestring left by the specified (possibly negative) amount.
+-- If positive, rotates left/to higher significance.
+-- If negative, rotates right/to lower significance.
+{-# INLINEABLE rotateByteString #-}
+rotateByteString :: BuiltinByteString -> Integer -> BuiltinByteString
+rotateByteString bs i = BI.rotateByteString bs (toBuiltin i)
+
+-- | Counts the number of 1 bits in the argument.
+{-# INLINEABLE popCountByteString #-} +popCountByteString :: BuiltinByteString -> Integer +popCountByteString bs = fromBuiltin (BI.popCountByteString bs) + +-- | Bitwise indexing operation. Errs when given an index that's not +-- in-bounds: specifically, indices that are either negative or greater than or +-- equal to the number of bits in the 'BuiltinByteString' argument. +{-# INLINEABLE testBitByteString #-} +testBitByteString :: BuiltinByteString -> Integer -> Bool +testBitByteString bs i = fromBuiltin (BI.testBitByteString bs (toBuiltin i)) + +-- | Bitwise modification at an index. Errs when given an index that's not +-- in-bounds: specifically, indices that are either negative or greater than +-- or equal to the number of bits in the 'BuiltinByteString' argument. +{-# INLINEABLE writeBitByteString #-} +writeBitByteString :: BuiltinByteString -> Integer -> Bool -> BuiltinByteString +writeBitByteString bs i b = BI.writeBitByteString bs (toBuiltin i) (toBuiltin b) + +-- | Finds the lowest bit index such that 'testBitByteString' at that index is +-- 'True'. Returns @-1@ if no such index exists: that is, the +-- 'BuiltinByteString' argument has only zero bytes in it, or is empty. +{-# INLINEABLE findFirstSetByteString #-} +findFirstSetByteString :: BuiltinByteString -> Integer +findFirstSetByteString bs = fromBuiltin (BI.findFirstSetByteString bs) + {-# INLINABLE addInteger #-} -- | Add two 'Integer's. addInteger :: Integer -> Integer -> Integer diff --git a/plutus-tx/src/PlutusTx/Builtins/Internal.hs b/plutus-tx/src/PlutusTx/Builtins/Internal.hs index 9a5cf7f17a8..fa1bdebbc6d 100644 --- a/plutus-tx/src/PlutusTx/Builtins/Internal.hs +++ b/plutus-tx/src/PlutusTx/Builtins/Internal.hs @@ -14,6 +14,7 @@ -- Most users should not use this module directly, but rather use 'PlutusTx.Builtins'. 
module PlutusTx.Builtins.Internal where +import Bitwise qualified import Codec.Serialise import Control.DeepSeq (NFData (..)) import Control.Monad.Trans.Writer.Strict (runWriter) @@ -25,6 +26,7 @@ import Data.Data import Data.Foldable qualified as Foldable import Data.Hashable (Hashable (..)) import Data.Kind (Type) +import Data.Maybe (fromJust) import Data.Text as Text (Text, empty) import Data.Text.Encoding as Text (decodeUtf8, encodeUtf8) import PlutusCore.Builtin.Emitter (Emitter (Emitter)) @@ -306,6 +308,86 @@ lessThanEqualsByteString (BuiltinByteString b1) (BuiltinByteString b2) = Builtin decodeUtf8 :: BuiltinByteString -> BuiltinString decodeUtf8 (BuiltinByteString b) = BuiltinString $ Text.decodeUtf8 b +{-# NOINLINE integerToByteString #-} +integerToByteString :: BuiltinInteger -> BuiltinByteString +integerToByteString = BuiltinByteString . fromJust . Bitwise.integerToByteString + +{-# NOINLINE byteStringToInteger #-} +byteStringToInteger :: BuiltinByteString -> BuiltinInteger +byteStringToInteger (BuiltinByteString bs) = Bitwise.byteStringToInteger bs + +{-# NOINLINE andByteString #-} +andByteString :: BuiltinByteString -> BuiltinByteString -> BuiltinByteString +andByteString (BuiltinByteString bs) (BuiltinByteString bs') = + case Bitwise.andByteString bs bs' of + Emitter f -> case runWriter f of + (res, logs) -> traceAll logs $ case res of + EvaluationFailure -> mustBeReplaced "Bitwise AND errored." + EvaluationSuccess bs'' -> BuiltinByteString bs'' + +{-# NOINLINE iorByteString #-} +iorByteString :: BuiltinByteString -> BuiltinByteString -> BuiltinByteString +iorByteString (BuiltinByteString bs) (BuiltinByteString bs') = + case Bitwise.iorByteString bs bs' of + Emitter f -> case runWriter f of + (res, logs) -> traceAll logs $ case res of + EvaluationFailure -> mustBeReplaced "Bitwise IOR errored." 
+ EvaluationSuccess bs'' -> BuiltinByteString bs'' + +{-# NOINLINE xorByteString #-} +xorByteString :: BuiltinByteString -> BuiltinByteString -> BuiltinByteString +xorByteString (BuiltinByteString bs) (BuiltinByteString bs') = + case Bitwise.xorByteString bs bs' of + Emitter f -> case runWriter f of + (res, logs) -> traceAll logs $ case res of + EvaluationFailure -> mustBeReplaced "Bitwise XOR errored." + EvaluationSuccess bs'' -> BuiltinByteString bs'' + +{-# NOINLINE complementByteString #-} +complementByteString :: BuiltinByteString -> BuiltinByteString +complementByteString (BuiltinByteString bs) = + BuiltinByteString . Bitwise.complementByteString $ bs + +{-# NOINLINE shiftByteString #-} +shiftByteString :: BuiltinByteString -> BuiltinInteger -> BuiltinByteString +shiftByteString (BuiltinByteString bs) = + BuiltinByteString . Bitwise.shiftByteString bs + +{-# NOINLINE rotateByteString #-} +rotateByteString :: BuiltinByteString -> BuiltinInteger -> BuiltinByteString +rotateByteString (BuiltinByteString bs) = + BuiltinByteString . Bitwise.rotateByteString bs + +{-# NOINLINE popCountByteString #-} +popCountByteString :: BuiltinByteString -> BuiltinInteger +popCountByteString (BuiltinByteString bs) = Bitwise.popCountByteString bs + +{-# NOINLINE testBitByteString #-} +testBitByteString :: BuiltinByteString -> BuiltinInteger -> BuiltinBool +testBitByteString (BuiltinByteString bs) i = + case Bitwise.testBitByteString bs i of + Emitter f -> case runWriter f of + (res, logs) -> traceAll logs $ case res of + EvaluationFailure -> mustBeReplaced "Bitwise indexing errored." 
+        EvaluationSuccess b -> BuiltinBool b
+
+{-# NOINLINE writeBitByteString #-}
+writeBitByteString ::
+  BuiltinByteString ->
+  BuiltinInteger ->
+  BuiltinBool ->
+  BuiltinByteString
+writeBitByteString (BuiltinByteString bs) i (BuiltinBool b) =
+  case Bitwise.writeBitByteString bs i b of
+    Emitter f -> case runWriter f of
+      (res, logs) -> traceAll logs $ case res of
+        EvaluationFailure -> mustBeReplaced "Bitwise indexed write errored."
+        EvaluationSuccess bs' -> BuiltinByteString bs'
+
+{-# NOINLINE findFirstSetByteString #-}
+findFirstSetByteString :: BuiltinByteString -> BuiltinInteger
+findFirstSetByteString (BuiltinByteString bs) = Bitwise.findFirstSetByteString bs
+
 {- STRING -}