From db859fc26b8eaa5e83f2771c4ba257ea1662e501 Mon Sep 17 00:00:00 2001
From: Xinding Wei <weixinding@gmail.com>
Date: Wed, 22 Jan 2025 22:49:10 +0800
Subject: [PATCH 1/6] Refactor FriReducedOpeningChip

---
 Cargo.lock                                    |   1 +
 crates/sdk/tests/integration_test.rs          |   4 +-
 crates/vm/src/arch/config.rs                  |   7 +-
 crates/vm/src/arch/testing/mod.rs             |  12 +
 crates/vm/src/utils/stark_utils.rs            |   8 +-
 crates/vm/tests/integration_test.rs           |  22 +-
 extensions/native/circuit/Cargo.toml          |   1 +
 extensions/native/circuit/src/extension.rs    |   4 +-
 extensions/native/circuit/src/fri/mod.rs      | 898 ++++++++++--------
 extensions/native/circuit/src/fri/tests.rs    |  50 +-
 extensions/native/circuit/src/utils.rs        |   3 +-
 .../native/compiler/src/asm/compiler.rs       |   3 +-
 .../native/compiler/src/asm/instruction.rs    |  10 +-
 .../native/compiler/src/conversion/mod.rs     |   4 +-
 extensions/native/compiler/src/ir/fri.rs      |   2 -
 .../native/compiler/src/ir/instructions.rs    |   3 +-
 .../native/compiler/tests/fri_ro_eval.rs      |   9 +-
 .../native/recursion/src/fri/two_adic_pcs.rs  |   9 +-
 .../native/recursion/src/testing_utils.rs     |  11 +-
 19 files changed, 614 insertions(+), 447 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index e5ce8a1f1e..9706315d33 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3952,6 +3952,7 @@ dependencies = [
  "rand",
  "serde",
  "serde-big-array",
+ "static_assertions",
  "strum",
  "test-case",
  "test-log",
diff --git a/crates/sdk/tests/integration_test.rs b/crates/sdk/tests/integration_test.rs
index 7fa7cd7686..ef4489a633 100644
--- a/crates/sdk/tests/integration_test.rs
+++ b/crates/sdk/tests/integration_test.rs
@@ -307,7 +307,7 @@ fn test_static_verifier_custom_pv_handler() {
 
     // Test setup
     println!("test setup");
-    let app_log_blowup = 1;
+    let app_log_blowup = 2;
     let app_config = small_test_app_config(app_log_blowup);
     let app_pk = Sdk.app_keygen(app_config.clone()).unwrap();
     let app_committed_exe = app_committed_exe_for_test(app_log_blowup);
@@ -358,7 +358,7 @@ fn test_static_verifier_custom_pv_handler() {
 
 #[test]
 fn test_e2e_proof_generation_and_verification() {
-    let app_log_blowup = 1;
+    let app_log_blowup = 2;
     let app_config = small_test_app_config(app_log_blowup);
     let app_pk = Sdk.app_keygen(app_config).unwrap();
     let params_reader = CacheHalo2ParamsReader::new_with_default_params_dir();
diff --git a/crates/vm/src/arch/config.rs b/crates/vm/src/arch/config.rs
index 2aa24a89c4..bc690b3fb5 100644
--- a/crates/vm/src/arch/config.rs
+++ b/crates/vm/src/arch/config.rs
@@ -19,9 +19,8 @@ use super::{
 use crate::system::memory::BOUNDARY_AIR_OFFSET;
 
 const DEFAULT_MAX_SEGMENT_LEN: usize = (1 << 22) - 100;
-// sbox is decomposed to have this max degree for Poseidon2. We set to 3 so quotient_degree = 2
-// allows log_blowup = 1
-const DEFAULT_POSEIDON2_MAX_CONSTRAINT_DEGREE: usize = 3;
+// FRI Reduced Opening Chip has degree = 4.
+const DEFAULT_MAX_CONSTRAINT_DEGREE: usize = 4;
 /// Width of Poseidon2 VM uses.
 pub const POSEIDON2_WIDTH: usize = 16;
 /// Returns a Poseidon2 config for the VM.
@@ -168,7 +167,7 @@ impl SystemConfig {
 impl Default for SystemConfig {
     fn default() -> Self {
         Self::new(
-            DEFAULT_POSEIDON2_MAX_CONSTRAINT_DEGREE,
+            DEFAULT_MAX_CONSTRAINT_DEGREE,
             Default::default(),
             DEFAULT_MAX_NUM_PUBLIC_VALUES,
         )
diff --git a/crates/vm/src/arch/testing/mod.rs b/crates/vm/src/arch/testing/mod.rs
index a6177a0a09..47dfa3b579 100644
--- a/crates/vm/src/arch/testing/mod.rs
+++ b/crates/vm/src/arch/testing/mod.rs
@@ -256,6 +256,18 @@ impl VmChipTestBuilder<BabyBear> {
         let tester = tester.load(self.execution);
         tester.load(self.program)
     }
+    pub fn build_babybear_poseidon2(self) -> VmChipTester<BabyBearPoseidon2Config> {
+        self.memory
+            .controller
+            .borrow_mut()
+            .finalize(None::<&mut Poseidon2PeripheryChip<BabyBear>>);
+        let tester = VmChipTester {
+            memory: Some(self.memory),
+            ..Default::default()
+        };
+        let tester = tester.load(self.execution);
+        tester.load(self.program)
+    }
 }
 
 impl<F: PrimeField32> Default for VmChipTestBuilder<F> {
diff --git a/crates/vm/src/utils/stark_utils.rs b/crates/vm/src/utils/stark_utils.rs
index fbbdc35297..2a0bf1ab98 100644
--- a/crates/vm/src/utils/stark_utils.rs
+++ b/crates/vm/src/utils/stark_utils.rs
@@ -41,7 +41,13 @@ where
     VC::Periphery: Chip<BabyBearPoseidon2Config>,
 {
     setup_tracing();
-    let engine = BabyBearPoseidon2Engine::new(FriParameters::standard_fast());
+    let mut log_blowup = 1;
+    while config.system().max_constraint_degree > (1 << log_blowup) + 1 {
+        log_blowup += 1;
+    }
+    let engine = BabyBearPoseidon2Engine::new(
+        FriParameters::standard_with_100_bits_conjectured_security(log_blowup),
+    );
     let vm = VirtualMachine::new(engine, config);
     let pk = vm.keygen();
     let mut result = vm.execute_and_generate(exe, input).unwrap();
diff --git a/crates/vm/tests/integration_test.rs b/crates/vm/tests/integration_test.rs
index 0468806ba2..ca3f40c034 100644
--- a/crates/vm/tests/integration_test.rs
+++ b/crates/vm/tests/integration_test.rs
@@ -124,7 +124,7 @@ fn test_vm_override_executor_height() {
     ));
 
     // Test getting heights.
-    let vm_config = NativeConfig::aggregation(8, 3);
+    let vm_config = NativeConfig::aggregation(8, 5);
 
     let executor = SingleSegmentVmExecutor::new(vm_config.clone());
     let res = executor
@@ -211,7 +211,7 @@ fn test_vm_override_executor_height() {
 fn test_vm_1_optional_air() {
     // Aggregation VmConfig has Core/Poseidon2/FieldArithmetic/FieldExtension chips. The program only
     // uses Core and FieldArithmetic. All other chips should not have AIR proof inputs.
-    let config = NativeConfig::aggregation(4, 3);
+    let config = NativeConfig::aggregation(4, 5);
     let engine =
         BabyBearPoseidon2Engine::new(standard_fri_params_with_100_bits_conjectured_security(3));
     let vm = VirtualMachine::new(engine, config);
@@ -319,7 +319,7 @@ fn test_vm_initial_memory() {
         .into_iter()
         .collect();
 
-    let config = NativeConfig::aggregation(0, 3).with_continuations();
+    let config = NativeConfig::aggregation(0, 5).with_continuations();
     let exe = VmExe {
         program,
         pc_start: 0,
@@ -331,9 +331,11 @@ fn test_vm_initial_memory() {
 
 #[test]
 fn test_vm_1_persistent() {
-    let engine = BabyBearPoseidon2Engine::new(FriParameters::standard_fast());
+    let engine = BabyBearPoseidon2Engine::new(
+        FriParameters::standard_with_100_bits_conjectured_security(2),
+    );
     let config = NativeConfig {
-        system: SystemConfig::new(3, MemoryConfig::new(1, 1, 16, 10, 6, 64, 1024), 0),
+        system: SystemConfig::new(5, MemoryConfig::new(1, 1, 16, 10, 6, 64, 1024), 0),
         native: Default::default(),
     }
     .with_continuations();
@@ -443,7 +445,7 @@ fn test_vm_continuations() {
     let n = 200000;
     let program = gen_continuation_test_program(n);
     let config = NativeConfig {
-        system: SystemConfig::new(3, MemoryConfig::default(), 0).with_max_segment_len(200000),
+        system: SystemConfig::new(5, MemoryConfig::default(), 0).with_max_segment_len(200000),
         native: Default::default(),
     }
     .with_continuations();
@@ -473,11 +475,13 @@ fn test_vm_continuations_recover_state() {
     let n = 2000;
     let program = gen_continuation_test_program(n);
     let config = NativeConfig {
-        system: SystemConfig::new(3, MemoryConfig::default(), 0).with_max_segment_len(500),
+        system: SystemConfig::new(5, MemoryConfig::default(), 0).with_max_segment_len(500),
         native: Default::default(),
     }
     .with_continuations();
-    let engine = BabyBearPoseidon2Engine::new(FriParameters::standard_fast());
+    let engine = BabyBearPoseidon2Engine::new(
+        FriParameters::standard_with_100_bits_conjectured_security(2),
+    );
     let vm = VirtualMachine::new(engine, config.clone());
     let pk = vm.keygen();
     let segments = vm
@@ -729,7 +733,7 @@ fn test_vm_field_extension_arithmetic_persistent() {
 
     let program = Program::from_instructions(&instructions);
     let config = NativeConfig {
-        system: SystemConfig::new(3, MemoryConfig::new(1, 1, 16, 10, 6, 64, 1024), 0)
+        system: SystemConfig::new(5, MemoryConfig::new(1, 1, 16, 10, 6, 64, 1024), 0)
             .with_continuations(),
         native: Default::default(),
     };
diff --git a/extensions/native/circuit/Cargo.toml b/extensions/native/circuit/Cargo.toml
index 1bb5a04676..8dc8fcd0c8 100644
--- a/extensions/native/circuit/Cargo.toml
+++ b/extensions/native/circuit/Cargo.toml
@@ -30,6 +30,7 @@ eyre.workspace = true
 serde.workspace = true
 serde-big-array.workspace = true
 bitcode.workspace = true
+static_assertions.workspace = true
 
 [dev-dependencies]
 openvm-stark-sdk = { workspace = true }
diff --git a/extensions/native/circuit/src/extension.rs b/extensions/native/circuit/src/extension.rs
index 91a91cd8a3..fee30c886c 100644
--- a/extensions/native/circuit/src/extension.rs
+++ b/extensions/native/circuit/src/extension.rs
@@ -58,10 +58,10 @@ impl NativeConfig {
         self
     }
 
-    pub fn aggregation(num_public_values: usize, poseidon2_max_constraint_degree: usize) -> Self {
+    pub fn aggregation(num_public_values: usize, max_constraint_degree: usize) -> Self {
         Self {
             system: SystemConfig::new(
-                poseidon2_max_constraint_degree,
+                max_constraint_degree,
                 MemoryConfig {
                     max_access_adapter_n: 8,
                     ..Default::default()
diff --git a/extensions/native/circuit/src/fri/mod.rs b/extensions/native/circuit/src/fri/mod.rs
index 7b757d06fd..fb3792996e 100644
--- a/extensions/native/circuit/src/fri/mod.rs
+++ b/extensions/native/circuit/src/fri/mod.rs
@@ -1,26 +1,23 @@
+use core::ops::Deref;
 use std::{
-    array::{self, from_fn},
+    array,
     borrow::{Borrow, BorrowMut},
+    mem::offset_of,
     sync::{Arc, Mutex},
 };
 
+use itertools::{zip_eq, Itertools};
 use openvm_circuit::{
     arch::{ExecutionBridge, ExecutionBus, ExecutionError, ExecutionState, InstructionExecutor},
     system::{
         memory::{
-            offline_checker::{
-                MemoryBaseAuxCols, MemoryBridge, MemoryReadAuxCols, MemoryWriteAuxCols,
-            },
+            offline_checker::{MemoryBridge, MemoryReadAuxCols, MemoryWriteAuxCols},
             MemoryAddress, MemoryAuxColsFactory, MemoryController, OfflineMemory, RecordId,
         },
         program::ProgramBus,
     },
 };
-use openvm_circuit_primitives::{
-    is_zero::{IsZeroIo, IsZeroSubAir},
-    utils::{assert_array_eq, next_power_of_two_or_zero, not},
-    SubAir, TraceSubRowGenerator,
-};
+use openvm_circuit_primitives::utils::next_power_of_two_or_zero;
 use openvm_circuit_primitives_derive::AlignedBorrow;
 use openvm_instructions::{instruction::Instruction, program::DEFAULT_PC_STEP, LocalOpcode};
 use openvm_native_compiler::FriOpcode::FRI_REDUCED_OPENING;
@@ -36,264 +33,399 @@ use openvm_stark_backend::{
     Chip, ChipUsageGetter, Stateful,
 };
 use serde::{Deserialize, Serialize};
+use static_assertions::const_assert_eq;
 
-use super::field_extension::{FieldExtension, EXT_DEG};
+use crate::field_extension::{FieldExtension, EXT_DEG};
 
 #[cfg(test)]
 mod tests;
 
 #[repr(C)]
-#[derive(AlignedBorrow)]
-pub struct FriReducedOpeningCols<T> {
-    pub enabled: T,
-
-    pub pc: T,
-    pub start_timestamp: T,
-
-    pub a_ptr_ptr: T,
-    pub b_ptr_ptr: T,
-    pub result_ptr: T,
-    pub addr_space: T,
-    pub length_ptr: T,
-    pub alpha_ptr: T,
-    pub alpha_pow_ptr: T,
-
-    pub a_ptr_aux: MemoryReadAuxCols<T>,
-    pub b_ptr_aux: MemoryReadAuxCols<T>,
-    pub a_aux: MemoryReadAuxCols<T>,
-    pub b_aux: MemoryReadAuxCols<T>,
-    pub result_aux: MemoryWriteAuxCols<T, EXT_DEG>,
-    pub length_aux: MemoryReadAuxCols<T>,
-    pub alpha_aux: MemoryReadAuxCols<T>,
-    pub alpha_pow_aux: MemoryBaseAuxCols<T>,
-
-    pub a_ptr: T,
-    pub b_ptr: T,
-    pub a: T,
-    pub b: [T; EXT_DEG],
-    pub alpha: [T; EXT_DEG],
-    pub alpha_pow_original: [T; EXT_DEG],
-    pub alpha_pow_current: [T; EXT_DEG],
-
-    pub idx: T,
-    pub idx_is_zero: T,
-    pub is_zero_aux: T,
-    pub current: [T; EXT_DEG],
+#[derive(Debug, AlignedBorrow)]
+struct WorkloadCols<T> {
+    prefix: PrefixCols<T>,
+
+    a_aux: MemoryReadAuxCols<T>,
+    b: [T; EXT_DEG],
+    b_aux: MemoryReadAuxCols<T>,
+}
+const WL_WIDTH: usize = WorkloadCols::<u8>::width();
+const_assert_eq!(WL_WIDTH, 26);
+
+#[repr(C)]
+#[derive(Debug, AlignedBorrow)]
+struct Instruction1Cols<T> {
+    prefix: PrefixCols<T>,
+
+    pc: T,
+
+    a_ptr_ptr: T,
+    a_ptr_aux: MemoryReadAuxCols<T>,
+
+    b_ptr_ptr: T,
+    b_ptr_aux: MemoryReadAuxCols<T>,
+}
+const INS_1_WIDTH: usize = Instruction1Cols::<u8>::width();
+const_assert_eq!(INS_1_WIDTH, 25);
+const_assert_eq!(
+    offset_of!(WorkloadCols<u8>, prefix),
+    offset_of!(Instruction1Cols<u8>, prefix)
+);
+
+#[repr(C)]
+#[derive(Debug, AlignedBorrow)]
+struct Instruction2Cols<T> {
+    general: GeneralCols<T>,
+    // is_first = 0 means the second instruction row.
+    is_first: T,
+
+    result_ptr: T,
+    result_aux: MemoryWriteAuxCols<T, EXT_DEG>,
+
+    length_ptr: T,
+    length_aux: MemoryReadAuxCols<T>,
+
+    alpha_ptr: T,
+    alpha_aux: MemoryReadAuxCols<T>,
+}
+const INS_2_WIDTH: usize = Instruction2Cols::<u8>::width();
+const_assert_eq!(INS_2_WIDTH, 20);
+const_assert_eq!(
+    offset_of!(WorkloadCols<u8>, prefix) + offset_of!(PrefixCols<u8>, general),
+    offset_of!(Instruction2Cols<u8>, general)
+);
+const_assert_eq!(
+    offset_of!(Instruction1Cols<u8>, prefix) + offset_of!(PrefixCols<u8>, a_or_is_first),
+    offset_of!(Instruction2Cols<u8>, is_first)
+);
+
+const fn const_max(a: usize, b: usize) -> usize {
+    [a, b][(a < b) as usize]
+}
+pub const OVERALL_WIDTH: usize = const_max(const_max(WL_WIDTH, INS_1_WIDTH), INS_2_WIDTH);
+const_assert_eq!(OVERALL_WIDTH, 26);
+
+#[repr(C)]
+#[derive(Debug, AlignedBorrow)]
+struct GeneralCols<T> {
+    enabled: T,
+    is_ins_row: T,
+    timestamp: T,
+}
+const GENERAL_WIDTH: usize = GeneralCols::<u8>::width();
+const_assert_eq!(GENERAL_WIDTH, 3);
+
+#[repr(C)]
+#[derive(Debug, AlignedBorrow)]
+struct DataCols<T> {
+    addr_space: T,
+    a_ptr: T,
+    b_ptr: T,
+    idx: T,
+    result: [T; EXT_DEG],
+    alpha: [T; EXT_DEG],
 }
+#[allow(dead_code)]
+const DATA_WIDTH: usize = DataCols::<u8>::width();
+const_assert_eq!(DATA_WIDTH, 12);
+
+/// Prefix of `WorkloadCols` and `Instruction1Cols`
+#[repr(C)]
+#[derive(Debug, AlignedBorrow)]
+struct PrefixCols<T> {
+    general: GeneralCols<T>,
+    /// WorkloadCols uses this column as `a`. Instruction1Cols uses this column as `is_first` which
+    /// indicates whether this is the first row of an instruction row. This is to save a column.
+    a_or_is_first: T,
+    data: DataCols<T>,
+}
+const PREFIX_WIDTH: usize = PrefixCols::<u8>::width();
+const_assert_eq!(PREFIX_WIDTH, 16);
 
 #[derive(Copy, Clone, Debug)]
-pub struct FriReducedOpeningAir {
-    pub execution_bridge: ExecutionBridge,
-    pub memory_bridge: MemoryBridge,
+struct FriReducedOpeningAir {
+    execution_bridge: ExecutionBridge,
+    memory_bridge: MemoryBridge,
 }
 
 impl<F: Field> BaseAir<F> for FriReducedOpeningAir {
     fn width(&self) -> usize {
-        FriReducedOpeningCols::<F>::width()
+        OVERALL_WIDTH
     }
 }
 
 impl<F: Field> BaseAirWithPublicValues<F> for FriReducedOpeningAir {}
 impl<F: Field> PartitionedBaseAir<F> for FriReducedOpeningAir {}
-
 impl<AB: InteractionBuilder> Air<AB> for FriReducedOpeningAir {
     fn eval(&self, builder: &mut AB) {
         let main = builder.main();
         let local = main.row_slice(0);
-        let local: &FriReducedOpeningCols<AB::Var> = (*local).borrow();
         let next = main.row_slice(1);
-        let next: &FriReducedOpeningCols<AB::Var> = (*next).borrow();
+        let local_slice = local.deref();
+        let next_slice = next.deref();
+        self.eval_general(builder, local_slice, next_slice);
+        self.eval_workload_row(builder, local_slice, next_slice);
+        self.eval_instruction1_row(builder, local_slice, next_slice);
+        self.eval_instruction2_row(builder, local_slice, next_slice);
+    }
+}
 
-        let &FriReducedOpeningCols {
-            enabled,
-            pc,
-            start_timestamp,
-            a_ptr_ptr,
-            b_ptr_ptr,
-            result_ptr,
-            addr_space,
-            length_ptr,
-            alpha_ptr,
-            alpha_pow_ptr,
-            a_ptr,
-            b_ptr,
-            a,
-            b,
-            alpha,
-            alpha_pow_original,
-            alpha_pow_current,
-            idx,
-            idx_is_zero,
-            is_zero_aux,
-            current,
-            a_ptr_aux,
-            b_ptr_aux,
-            a_aux,
-            b_aux,
-            result_aux,
-            length_aux,
-            alpha_aux,
-            alpha_pow_aux,
-        } = local;
-
-        let is_first = idx_is_zero;
-        let is_last = next.idx_is_zero;
-
-        builder.assert_bool(enabled);
-        // transition constraints
-        let mut when_is_not_last = builder.when(not(is_last));
-
-        let next_alpha_pow_times_b = FieldExtension::multiply(next.alpha_pow_current, next.b);
-        for i in 0..EXT_DEG {
-            when_is_not_last.assert_eq(
-                next.current[i],
-                next_alpha_pow_times_b[i].clone() - (next.alpha_pow_current[i] * next.a)
-                    + current[i],
-            );
+impl FriReducedOpeningAir {
+    fn eval_general<AB: InteractionBuilder>(
+        &self,
+        builder: &mut AB,
+        local_slice: &[AB::Var],
+        next_slice: &[AB::Var],
+    ) {
+        let local: &GeneralCols<AB::Var> = local_slice[..GENERAL_WIDTH].borrow();
+        let next: &GeneralCols<AB::Var> = next_slice[..GENERAL_WIDTH].borrow();
+        builder.assert_bool(local.enabled);
+        builder.assert_bool(local.is_ins_row);
+        {
+            // All enabled rows must be before disabled rows.
+            let mut when_transition = builder.when_transition();
+            let mut when_disabled = when_transition.when_ne(local.enabled, AB::Expr::ONE);
+            when_disabled.assert_zero(next.enabled);
         }
+    }
 
-        assert_array_eq(&mut when_is_not_last, next.alpha, alpha);
-        assert_array_eq(
-            &mut when_is_not_last,
-            next.alpha_pow_original,
-            alpha_pow_original,
-        );
-        assert_array_eq(
-            &mut when_is_not_last,
-            next.alpha_pow_current,
-            FieldExtension::multiply(alpha, alpha_pow_current),
-        );
-        when_is_not_last.assert_eq(next.idx, idx + AB::Expr::ONE);
-        when_is_not_last.assert_eq(next.enabled, enabled);
-        when_is_not_last.assert_eq(next.start_timestamp, start_timestamp);
-
-        // first row constraint
-        assert_array_eq(
-            &mut builder.when(is_first),
-            alpha_pow_current,
-            alpha_pow_original,
-        );
-
-        let alpha_pow_times_b = FieldExtension::multiply(alpha_pow_current, b);
-        for i in 0..EXT_DEG {
-            builder.when(is_first).assert_eq(
-                current[i],
-                alpha_pow_times_b[i].clone() - (alpha_pow_current[i] * a),
+    fn eval_workload_row<AB: InteractionBuilder>(
+        &self,
+        builder: &mut AB,
+        local_slice: &[AB::Var],
+        next_slice: &[AB::Var],
+    ) {
+        let local: &WorkloadCols<AB::Var> = local_slice[..WL_WIDTH].borrow();
+        let next: &PrefixCols<AB::Var> = next_slice[..PREFIX_WIDTH].borrow();
+        let local_data = &local.prefix.data;
+        let start_timestamp = next.general.timestamp;
+        let multiplicity =
+            local.prefix.general.enabled * (AB::Expr::ONE - local.prefix.general.is_ins_row);
+        // a_ptr/b_ptr/length/result
+        let ptr_reads = AB::F::from_canonical_usize(4);
+        // read a
+        self.memory_bridge
+            .read(
+                MemoryAddress::new(local_data.addr_space, next.data.a_ptr),
+                [local.prefix.a_or_is_first],
+                start_timestamp + ptr_reads,
+                &local.a_aux,
+            )
+            .eval(builder, multiplicity.clone());
+        // read b
+        self.memory_bridge
+            .read(
+                MemoryAddress::new(local_data.addr_space, next.data.b_ptr),
+                local.b,
+                start_timestamp + ptr_reads + AB::Expr::ONE,
+                &local.b_aux,
+            )
+            .eval(builder, multiplicity);
+        {
+            let mut when_transition = builder.when_transition();
+            let mut not_ins_row =
+                when_transition.when_ne(local.prefix.general.is_ins_row, AB::F::ONE);
+            let mut builder = not_ins_row.when(next.general.enabled);
+            // ATTENTION: degree of builder is 2
+            // local.timestamp = next.timestamp + 2
+            builder.assert_eq(
+                local.prefix.general.timestamp,
+                start_timestamp + AB::Expr::TWO,
             );
+            // local.idx = next.idx + 1
+            builder.assert_eq(local_data.idx + AB::Expr::ONE, next.data.idx);
+            // local.alpha = next.alpha
+            assert_array_eq(&mut builder, local_data.alpha, next.data.alpha);
+            // local.addr_space = next.addr_space
+            builder.assert_eq(local_data.addr_space, next.data.addr_space);
+            // local.a_ptr = next.a_ptr + 1
+            builder.assert_eq(local_data.a_ptr, next.data.a_ptr + AB::F::ONE);
+            // local.b_ptr = next.b_ptr + EXT_DEG
+            builder.assert_eq(
+                local_data.b_ptr,
+                next.data.b_ptr + AB::F::from_canonical_usize(EXT_DEG),
+            );
+            // local.timestamp = next.timestamp + 2
+            builder.assert_eq(
+                local.prefix.general.timestamp,
+                next.general.timestamp + AB::Expr::TWO,
+            );
+            // local.result * local.alpha + local.b - local.a = next.result
+            let mut expected_result = FieldExtension::multiply(local_data.result, local_data.alpha);
+            expected_result
+                .iter_mut()
+                .zip(local.b.iter())
+                .for_each(|(e, b)| {
+                    *e += (*b).into();
+                });
+            expected_result[0] -= local.prefix.a_or_is_first.into();
+            assert_array_eq(&mut builder, expected_result, next.data.result);
         }
+        {
+            let mut next_ins = builder.when(next.general.is_ins_row);
+            let mut local_non_ins =
+                next_ins.when_ne(local.prefix.general.is_ins_row, AB::Expr::ONE);
+            // The row after a workload row can only be the first instruction row.
+            local_non_ins.assert_one(next.a_or_is_first);
+        }
+        {
+            let mut when_first_row = builder.when_first_row();
+            let mut when_enabled = when_first_row.when(local.prefix.general.enabled);
+            // First row must be a workload row.
+            when_enabled.assert_zero(local.prefix.general.is_ins_row);
+            // Workload rows must start with the first element.
+            when_enabled.assert_zero(local.prefix.data.idx);
+            // local.result is all 0s.
+            assert_array_eq(
+                &mut when_enabled,
+                local.prefix.data.result,
+                [AB::Expr::ZERO; EXT_DEG],
+            );
+        }
+    }
 
-        // is zero constraint
-        let is_zero_io = IsZeroIo::new(idx.into(), idx_is_zero.into(), AB::Expr::ONE);
-        IsZeroSubAir.eval(builder, (is_zero_io, is_zero_aux));
-
-        // length will only be used on the last row, so it equals 1 + idx
-        let length = AB::Expr::ONE + idx;
-        let num_initial_accesses = AB::F::from_canonical_usize(4);
-        let num_loop_accesses = AB::Expr::TWO * length.clone();
-        let num_final_accesses = AB::F::TWO;
-
-        // execution interaction
-        let total_accesses = num_loop_accesses.clone() + num_initial_accesses + num_final_accesses;
+    fn eval_instruction1_row<AB: InteractionBuilder>(
+        &self,
+        builder: &mut AB,
+        local_slice: &[AB::Var],
+        next_slice: &[AB::Var],
+    ) {
+        let local: &Instruction1Cols<AB::Var> = local_slice[..INS_1_WIDTH].borrow();
+        let next: &Instruction2Cols<AB::Var> = next_slice[..INS_2_WIDTH].borrow();
+        let mut is_ins_row = builder.when(local.prefix.general.is_ins_row);
+        let mut is_first_ins = is_ins_row.when(local.prefix.a_or_is_first);
+        let mut next_enabled = is_first_ins.when(next.general.enabled);
+        // ATTENTION: degree of next_enabled is 3
+        next_enabled.assert_zero(next.is_first);
+        next_enabled.assert_one(next.general.is_ins_row);
+
+        let local_data = &local.prefix.data;
+        let length = local.prefix.data.idx;
+        let multiplicity = local.prefix.general.enabled
+            * local.prefix.general.is_ins_row
+            * local.prefix.a_or_is_first;
+        let start_timestamp = local.prefix.general.timestamp;
+        // 4 reads
+        let write_timestamp =
+            start_timestamp + AB::Expr::TWO * length + AB::Expr::from_canonical_usize(4);
+        let end_timestamp = write_timestamp.clone() + AB::Expr::ONE;
         self.execution_bridge
             .execute(
                 AB::F::from_canonical_usize(FRI_REDUCED_OPENING.global_opcode().as_usize()),
                 [
-                    a_ptr_ptr,
-                    b_ptr_ptr,
-                    result_ptr,
-                    addr_space,
-                    length_ptr,
-                    alpha_ptr,
-                    alpha_pow_ptr,
+                    local.a_ptr_ptr,
+                    local.b_ptr_ptr,
+                    next.result_ptr,
+                    local_data.addr_space,
+                    next.length_ptr,
+                    next.alpha_ptr,
                 ],
-                ExecutionState::new(pc, start_timestamp),
+                ExecutionState::new(local.pc, local.prefix.general.timestamp),
                 ExecutionState::<AB::Expr>::new(
-                    AB::Expr::from_canonical_u32(DEFAULT_PC_STEP) + pc,
-                    start_timestamp + total_accesses,
+                    AB::Expr::from_canonical_u32(DEFAULT_PC_STEP) + local.pc,
+                    end_timestamp.clone(),
                 ),
             )
-            .eval(builder, enabled * is_last);
-
-        // initial reads
+            .eval(builder, multiplicity.clone());
+        // Read alpha
         self.memory_bridge
             .read(
-                MemoryAddress::new(addr_space, alpha_ptr),
-                alpha,
+                MemoryAddress::new(local_data.addr_space, next.alpha_ptr),
+                local_data.alpha,
                 start_timestamp,
-                &alpha_aux,
+                &next.alpha_aux,
             )
-            .eval(builder, enabled * is_last);
+            .eval(builder, multiplicity.clone());
+        // Read length
         self.memory_bridge
             .read(
-                MemoryAddress::new(addr_space, length_ptr),
+                MemoryAddress::new(local_data.addr_space, next.length_ptr),
                 [length],
-                start_timestamp + AB::F::ONE,
-                &length_aux,
-            )
-            .eval(builder, enabled * is_last);
-        self.memory_bridge
-            .read(
-                MemoryAddress::new(addr_space, a_ptr_ptr),
-                [a_ptr],
-                start_timestamp + AB::F::TWO,
-                &a_ptr_aux,
+                start_timestamp + AB::Expr::ONE,
+                &next.length_aux,
             )
-            .eval(builder, enabled * is_last);
+            .eval(builder, multiplicity.clone());
+        // Read a_ptr
         self.memory_bridge
             .read(
-                MemoryAddress::new(addr_space, b_ptr_ptr),
-                [b_ptr],
-                start_timestamp + AB::F::from_canonical_usize(3),
-                &b_ptr_aux,
+                MemoryAddress::new(local_data.addr_space, local.a_ptr_ptr),
+                [local_data.a_ptr],
+                start_timestamp + AB::Expr::TWO,
+                &local.a_ptr_aux,
             )
-            .eval(builder, enabled * is_last);
-
-        // general reads
-        let timestamp = start_timestamp + num_initial_accesses + (idx * AB::F::TWO);
+            .eval(builder, multiplicity.clone());
+        // Read b_ptr
         self.memory_bridge
             .read(
-                MemoryAddress::new(addr_space, a_ptr + idx),
-                [a],
-                timestamp.clone(),
-                &a_aux,
+                MemoryAddress::new(local_data.addr_space, local.b_ptr_ptr),
+                [local_data.b_ptr],
+                start_timestamp + AB::Expr::from_canonical_u32(3),
+                &local.b_ptr_aux,
             )
-            .eval(builder, enabled);
-        self.memory_bridge
-            .read(
-                MemoryAddress::new(
-                    addr_space,
-                    b_ptr + (idx * AB::F::from_canonical_usize(EXT_DEG)),
-                ),
-                b,
-                timestamp + AB::F::ONE,
-                &b_aux,
-            )
-            .eval(builder, enabled);
-
-        // final writes
-        let timestamp = start_timestamp + num_initial_accesses + num_loop_accesses.clone();
-        self.memory_bridge
-            .write(
-                MemoryAddress::new(addr_space, alpha_pow_ptr),
-                FieldExtension::multiply(alpha, alpha_pow_current),
-                timestamp.clone(),
-                &MemoryWriteAuxCols {
-                    base: alpha_pow_aux,
-                    prev_data: alpha_pow_original,
-                },
-            )
-            .eval(builder, enabled * is_last);
+            .eval(builder, multiplicity.clone());
         self.memory_bridge
             .write(
-                MemoryAddress::new(addr_space, result_ptr),
-                current,
-                timestamp + AB::F::ONE,
-                &result_aux,
+                MemoryAddress::new(local_data.addr_space, next.result_ptr),
+                local_data.result,
+                write_timestamp,
+                &next.result_aux,
             )
-            .eval(builder, enabled * is_last);
+            .eval(builder, multiplicity.clone());
+    }
+
+    fn eval_instruction2_row<AB: InteractionBuilder>(
+        &self,
+        builder: &mut AB,
+        local_slice: &[AB::Var],
+        next_slice: &[AB::Var],
+    ) {
+        let local: &Instruction2Cols<AB::Var> = local_slice[..INS_2_WIDTH].borrow();
+        let next: &WorkloadCols<AB::Var> = next_slice[..WL_WIDTH].borrow();
+
+        {
+            let mut last_row = builder.when_last_row();
+            let mut enabled = last_row.when(local.general.enabled);
+            // If the last row is enabled, it must be the second row of an instruction row. This
+            // is a safeguard for edge cases.
+            enabled.assert_one(local.general.is_ins_row);
+            enabled.assert_one(local.is_first);
+        }
+        {
+            let mut when_transition = builder.when_transition();
+            let mut is_ins_row = when_transition.when(local.general.is_ins_row);
+            let mut not_first_row = is_ins_row.when_ne(local.is_first, AB::Expr::ONE);
+            // when_transition is necessary to check the next row is enabled.
+            let mut enabled = not_first_row.when(next.prefix.general.enabled);
+            // The next row must be a workload row.
+            enabled.assert_zero(next.prefix.general.is_ins_row);
+            // The next row must have idx = 0.
+            enabled.assert_zero(next.prefix.data.idx);
+            // next.result is all 0s
+            assert_array_eq(
+                &mut enabled,
+                next.prefix.data.result,
+                [AB::Expr::ZERO; EXT_DEG],
+            );
+        }
+    }
+}
+
+fn assert_array_eq<AB: AirBuilder, I1: Into<AB::Expr>, I2: Into<AB::Expr>, const N: usize>(
+    builder: &mut AB,
+    x: [I1; N],
+    y: [I2; N],
+) {
+    for (x, y) in zip_eq(x, y) {
+        builder.assert_eq(x, y);
     }
 }
 
+fn elem_to_ext<F: Field>(elem: F) -> [F; EXT_DEG] {
+    let mut ret = [F::ZERO; EXT_DEG];
+    ret[0] = elem;
+    ret
+}
+
 #[derive(Serialize, Deserialize)]
 #[serde(bound = "F: Field")]
 pub struct FriReducedOpeningRecord<F: Field> {
@@ -306,18 +438,22 @@ pub struct FriReducedOpeningRecord<F: Field> {
     pub b_ptr_read: RecordId,
     pub a_reads: Vec<RecordId>,
     pub b_reads: Vec<RecordId>,
-    pub alpha_pow_original: [F; EXT_DEG],
-    pub alpha_pow_write: RecordId,
     pub result_write: RecordId,
 }
 
+impl<F: Field> FriReducedOpeningRecord<F> {
+    fn get_height(&self) -> usize {
+        // 2 for instruction rows
+        self.a_reads.len() + 2
+    }
+}
+
 pub struct FriReducedOpeningChip<F: Field> {
     air: FriReducedOpeningAir,
     records: Vec<FriReducedOpeningRecord<F>>,
     height: usize,
     offline_memory: Arc<Mutex<OfflineMemory<F>>>,
 }
-
 impl<F: PrimeField32> FriReducedOpeningChip<F> {
     pub fn new(
         execution_bus: ExecutionBus,
@@ -337,13 +473,6 @@ impl<F: PrimeField32> FriReducedOpeningChip<F> {
         }
     }
 }
-
-fn elem_to_ext<F: Field>(elem: F) -> [F; EXT_DEG] {
-    let mut ret = [F::ZERO; EXT_DEG];
-    ret[0] = elem;
-    ret
-}
-
 impl<F: PrimeField32> InstructionExecutor<F> for FriReducedOpeningChip<F> {
     fn execute(
         &mut self,
@@ -358,7 +487,6 @@ impl<F: PrimeField32> InstructionExecutor<F> for FriReducedOpeningChip<F> {
             d: addr_space,
             e: length_ptr,
             f: alpha_ptr,
-            g: alpha_pow_ptr,
             ..
         } = instruction;
 
@@ -368,10 +496,6 @@ impl<F: PrimeField32> InstructionExecutor<F> for FriReducedOpeningChip<F> {
         let b_ptr_read = memory.read_cell(addr_space, b_ptr_ptr);
 
         let alpha = alpha_read.1;
-        let alpha_pow_original = from_fn(|i| {
-            memory.unsafe_read_cell(addr_space, alpha_pow_ptr + F::from_canonical_usize(i))
-        });
-        let mut alpha_pow = alpha_pow_original;
         let length = length_read.1.as_canonical_u32() as usize;
         let a_ptr = a_ptr_read.1;
         let b_ptr = b_ptr_read.1;
@@ -382,23 +506,25 @@ impl<F: PrimeField32> InstructionExecutor<F> for FriReducedOpeningChip<F> {
 
         for i in 0..length {
             let a_read = memory.read_cell(addr_space, a_ptr + F::from_canonical_usize(i));
-            let b_read = memory.read(addr_space, b_ptr + F::from_canonical_usize(4 * i));
+            let b_read =
+                memory.read::<EXT_DEG>(addr_space, b_ptr + F::from_canonical_usize(EXT_DEG * i));
             a_reads.push(a_read);
             b_reads.push(b_read);
+        }
+
+        for (a_read, b_read) in a_reads.iter().rev().zip_eq(b_reads.iter().rev()) {
             let a = a_read.1;
             let b = b_read.1;
+            // result = result * alpha + (b - a)
             result = FieldExtension::add(
-                result,
-                FieldExtension::multiply(FieldExtension::subtract(b, elem_to_ext(a)), alpha_pow),
+                FieldExtension::multiply(result, alpha),
+                FieldExtension::subtract(b, elem_to_ext(a)),
             );
-            alpha_pow = FieldExtension::multiply(alpha, alpha_pow);
         }
 
-        let (alpha_pow_write, prev_data) = memory.write(addr_space, alpha_pow_ptr, alpha_pow);
-        debug_assert_eq!(prev_data, alpha_pow_original);
         let (result_write, _) = memory.write(addr_space, result_ptr, result);
 
-        self.records.push(FriReducedOpeningRecord {
+        let record = FriReducedOpeningRecord {
             pc: F::from_canonical_u32(from_state.pc),
             start_timestamp: F::from_canonical_u32(from_state.timestamp),
             instruction: instruction.clone(),
@@ -408,12 +534,10 @@ impl<F: PrimeField32> InstructionExecutor<F> for FriReducedOpeningChip<F> {
             b_ptr_read: b_ptr_read.0,
             a_reads: a_reads.into_iter().map(|r| r.0).collect(),
             b_reads: b_reads.into_iter().map(|r| r.0).collect(),
-            alpha_pow_original,
-            alpha_pow_write,
             result_write,
-        });
-
-        self.height += length;
+        };
+        self.height += record.get_height();
+        self.records.push(record);
 
         Ok(ExecutionState {
             pc: from_state.pc + DEFAULT_PC_STEP,
@@ -427,6 +551,132 @@ impl<F: PrimeField32> InstructionExecutor<F> for FriReducedOpeningChip<F> {
     }
 }
 
+fn record_to_rows<F: PrimeField32>(
+    record: FriReducedOpeningRecord<F>,
+    aux_cols_factory: &MemoryAuxColsFactory<F>,
+    slice: &mut [F],
+    memory: &OfflineMemory<F>,
+) {
+    let Instruction {
+        a: a_ptr_ptr,
+        b: b_ptr_ptr,
+        c: result_ptr,
+        d: addr_space,
+        e: length_ptr,
+        f: alpha_ptr,
+        ..
+    } = record.instruction;
+
+    let length_read = memory.record_by_id(record.length_read);
+    let alpha_read = memory.record_by_id(record.alpha_read);
+    let a_ptr_read = memory.record_by_id(record.a_ptr_read);
+    let b_ptr_read = memory.record_by_id(record.b_ptr_read);
+
+    let length = length_read.data[0].as_canonical_u32() as usize;
+    let alpha: [F; EXT_DEG] = array::from_fn(|i| alpha_read.data[i]);
+    let a_ptr = a_ptr_read.data[0];
+    let b_ptr = b_ptr_read.data[0];
+
+    let mut result = [F::ZERO; EXT_DEG];
+
+    let alpha_aux = aux_cols_factory.make_read_aux_cols(alpha_read);
+    let length_aux = aux_cols_factory.make_read_aux_cols(length_read);
+    let a_ptr_aux = aux_cols_factory.make_read_aux_cols(a_ptr_read);
+    let b_ptr_aux = aux_cols_factory.make_read_aux_cols(b_ptr_read);
+
+    let result_aux = aux_cols_factory.make_write_aux_cols(memory.record_by_id(record.result_write));
+
+    // WorkloadCols
+    for (i, (&a_record_id, &b_record_id)) in record
+        .a_reads
+        .iter()
+        .rev()
+        .zip_eq(record.b_reads.iter().rev())
+        .enumerate()
+    {
+        let a_read = memory.record_by_id(a_record_id);
+        let b_read = memory.record_by_id(b_record_id);
+        let a = a_read.data[0];
+        let b: [F; EXT_DEG] = array::from_fn(|i| b_read.data[i]);
+
+        let start = i * OVERALL_WIDTH;
+        let cols: &mut WorkloadCols<F> = slice[start..start + WL_WIDTH].borrow_mut();
+        *cols = WorkloadCols {
+            prefix: PrefixCols {
+                general: GeneralCols {
+                    enabled: F::ONE,
+                    is_ins_row: F::ZERO,
+                    timestamp: record.start_timestamp + F::from_canonical_usize((length - i) * 2),
+                },
+                a_or_is_first: a,
+                data: DataCols {
+                    addr_space,
+                    a_ptr: a_ptr + F::from_canonical_usize(length - i),
+                    b_ptr: b_ptr + F::from_canonical_usize((length - i) * EXT_DEG),
+                    idx: F::from_canonical_usize(i),
+                    result,
+                    alpha,
+                },
+            },
+            a_aux: aux_cols_factory.make_read_aux_cols(a_read),
+            b,
+            b_aux: aux_cols_factory.make_read_aux_cols(b_read),
+        };
+        // result = result * alpha + (b - a)
+        result = FieldExtension::add(
+            FieldExtension::multiply(result, alpha),
+            FieldExtension::subtract(b, elem_to_ext(a)),
+        );
+    }
+    // Instruction1Cols
+    {
+        let start = length * OVERALL_WIDTH;
+        let cols: &mut Instruction1Cols<F> = slice[start..start + INS_1_WIDTH].borrow_mut();
+        *cols = Instruction1Cols {
+            prefix: PrefixCols {
+                general: GeneralCols {
+                    enabled: F::ONE,
+                    is_ins_row: F::ONE,
+                    timestamp: record.start_timestamp,
+                },
+                a_or_is_first: F::ONE,
+                data: DataCols {
+                    addr_space,
+                    a_ptr,
+                    b_ptr,
+                    idx: F::from_canonical_usize(length),
+                    result,
+                    alpha,
+                },
+            },
+            pc: record.pc,
+            a_ptr_ptr,
+            a_ptr_aux,
+            b_ptr_ptr,
+            b_ptr_aux,
+        };
+    }
+    // Instruction2Cols
+    {
+        let start = (length + 1) * OVERALL_WIDTH;
+        let cols: &mut Instruction2Cols<F> = slice[start..start + INS_2_WIDTH].borrow_mut();
+        *cols = Instruction2Cols {
+            general: GeneralCols {
+                enabled: F::ONE,
+                is_ins_row: F::ONE,
+                timestamp: record.start_timestamp,
+            },
+            is_first: F::ZERO,
+            result_ptr,
+            result_aux,
+            length_ptr,
+            length_aux,
+            alpha_ptr,
+            alpha_aux,
+        };
+    }
+}
+
 impl<F: Field> ChipUsageGetter for FriReducedOpeningChip<F> {
     fn air_name(&self) -> String {
         "FriReducedOpeningAir".to_string()
@@ -437,141 +687,7 @@ impl<F: Field> ChipUsageGetter for FriReducedOpeningChip<F> {
     }
 
     fn trace_width(&self) -> usize {
-        FriReducedOpeningCols::<F>::width()
-    }
-}
-
-impl<F: PrimeField32> FriReducedOpeningChip<F> {
-    fn record_to_rows(
-        record: FriReducedOpeningRecord<F>,
-        aux_cols_factory: &MemoryAuxColsFactory<F>,
-        slice: &mut [F],
-        memory: &OfflineMemory<F>,
-    ) {
-        let width = FriReducedOpeningCols::<F>::width();
-
-        let Instruction {
-            a: a_ptr_ptr,
-            b: b_ptr_ptr,
-            c: result_ptr,
-            d: addr_space,
-            e: length_ptr,
-            f: alpha_ptr,
-            g: alpha_pow_ptr,
-            ..
-        } = record.instruction;
-
-        let length_read = memory.record_by_id(record.length_read);
-        let alpha_read = memory.record_by_id(record.alpha_read);
-        let a_ptr_read = memory.record_by_id(record.a_ptr_read);
-        let b_ptr_read = memory.record_by_id(record.b_ptr_read);
-
-        let length = length_read.data[0].as_canonical_u32() as usize;
-        let alpha: [F; EXT_DEG] = array::from_fn(|i| alpha_read.data[i]);
-        let a_ptr = a_ptr_read.data[0];
-        let b_ptr = b_ptr_read.data[0];
-
-        let mut alpha_pow_current = record.alpha_pow_original;
-        let mut current = [F::ZERO; EXT_DEG];
-
-        let alpha_aux = aux_cols_factory.make_read_aux_cols(alpha_read);
-        let length_aux = aux_cols_factory.make_read_aux_cols(length_read);
-        let a_ptr_aux = aux_cols_factory.make_read_aux_cols(a_ptr_read);
-        let b_ptr_aux = aux_cols_factory.make_read_aux_cols(b_ptr_read);
-
-        let alpha_pow_aux = aux_cols_factory
-            .make_write_aux_cols::<EXT_DEG>(memory.record_by_id(record.alpha_pow_write))
-            .get_base();
-        let result_aux =
-            aux_cols_factory.make_write_aux_cols(memory.record_by_id(record.result_write));
-
-        for i in 0..length {
-            let a_read = memory.record_by_id(record.a_reads[i]);
-            let b_read = memory.record_by_id(record.b_reads[i]);
-            let a = a_read.data[0];
-            let b: [F; EXT_DEG] = array::from_fn(|i| b_read.data[i]);
-            current = FieldExtension::add(
-                current,
-                FieldExtension::multiply(
-                    FieldExtension::subtract(b, elem_to_ext(a)),
-                    alpha_pow_current,
-                ),
-            );
-
-            let mut idx_is_zero = F::ZERO;
-            let mut is_zero_aux = F::ZERO;
-
-            let idx = F::from_canonical_usize(i);
-            IsZeroSubAir.generate_subrow(idx, (&mut is_zero_aux, &mut idx_is_zero));
-
-            let cols: &mut FriReducedOpeningCols<F> =
-                slice[i * width..(i + 1) * width].borrow_mut();
-            *cols = FriReducedOpeningCols {
-                enabled: F::ONE,
-                pc: record.pc,
-                a_ptr_ptr,
-                b_ptr_ptr,
-                result_ptr,
-                addr_space,
-                length_ptr,
-                alpha_ptr,
-                alpha_pow_ptr,
-                start_timestamp: record.start_timestamp,
-                a_ptr_aux,
-                b_ptr_aux,
-                a_aux: aux_cols_factory.make_read_aux_cols(a_read),
-                b_aux: aux_cols_factory.make_read_aux_cols(b_read),
-                alpha_aux,
-                length_aux,
-                alpha_pow_aux,
-                result_aux,
-                a_ptr,
-                b_ptr,
-                a,
-                b,
-                alpha,
-                alpha_pow_original: record.alpha_pow_original,
-                alpha_pow_current,
-                idx,
-                idx_is_zero,
-                is_zero_aux,
-                current,
-            };
-
-            alpha_pow_current = FieldExtension::multiply(alpha, alpha_pow_current);
-        }
-    }
-
-    fn generate_trace(self) -> RowMajorMatrix<F> {
-        let width = self.trace_width();
-        let height = next_power_of_two_or_zero(self.height);
-        let mut flat_trace = F::zero_vec(width * height);
-
-        let memory = self.offline_memory.lock().unwrap();
-
-        let aux_cols_factory = memory.aux_cols_factory();
-
-        let mut idx = 0;
-        for record in self.records {
-            let length = record.a_reads.len();
-            Self::record_to_rows(
-                record,
-                &aux_cols_factory,
-                &mut flat_trace[idx..idx + (length * width)],
-                &memory,
-            );
-            idx += length * width;
-        }
-        // In padding rows, need idx_is_zero = 1 so IsZero constraints pass, and also because next.idx_is_zero is used
-        // to determine the last row per instruction, so the last non-padding row needs next.idx_is_zero = 1
-        flat_trace[self.height * width..]
-            .par_chunks_mut(width)
-            .for_each(|row| {
-                let row: &mut FriReducedOpeningCols<F> = row.borrow_mut();
-                row.idx_is_zero = F::ONE;
-            });
-
-        RowMajorMatrix::new(flat_trace, width)
+        OVERALL_WIDTH
     }
 }
 
@@ -583,17 +699,51 @@ where
         Arc::new(self.air)
     }
     fn generate_air_proof_input(self) -> AirProofInput<SC> {
-        AirProofInput::simple_no_pis(self.air(), self.generate_trace())
+        let air = self.air();
+        let height = next_power_of_two_or_zero(self.height);
+        let mut flat_trace = Val::<SC>::zero_vec(OVERALL_WIDTH * height);
+        let chunked_trace = {
+            let sizes: Vec<_> = self
+                .records
+                .par_iter()
+                .map(|record| OVERALL_WIDTH * record.get_height())
+                .collect();
+            variable_chunks_mut(&mut flat_trace, &sizes)
+        };
+
+        let memory = self.offline_memory.lock().unwrap();
+        let aux_cols_factory = memory.aux_cols_factory();
+
+        self.records
+            .into_par_iter()
+            .zip_eq(chunked_trace.into_par_iter())
+            .for_each(|(record, slice)| {
+                record_to_rows(record, &aux_cols_factory, slice, &memory);
+            });
+
+        let matrix = RowMajorMatrix::new(flat_trace, OVERALL_WIDTH);
+        AirProofInput::simple_no_pis(air, matrix)
     }
 }
 
 impl<F: PrimeField32> Stateful<Vec<u8>> for FriReducedOpeningChip<F> {
     fn load_state(&mut self, state: Vec<u8>) {
         self.records = bitcode::deserialize(&state).unwrap();
-        self.height = self.records.iter().map(|record| record.a_reads.len()).sum();
+        self.height = self.records.iter().map(|record| record.get_height()).sum();
     }
 
     fn store_state(&self) -> Vec<u8> {
         bitcode::serialize(&self.records).unwrap()
     }
 }
+
+fn variable_chunks_mut<'a, T>(mut slice: &'a mut [T], sizes: &[usize]) -> Vec<&'a mut [T]> {
+    let mut result = Vec::with_capacity(sizes.len());
+    for &size in sizes {
+        // split_at_mut guarantees disjoint slices
+        let (left, right) = slice.split_at_mut(size);
+        result.push(left);
+        slice = right; // move forward for the next chunk
+    }
+    result
+}
diff --git a/extensions/native/circuit/src/fri/tests.rs b/extensions/native/circuit/src/fri/tests.rs
index 1fd138ddde..ae757d3e8c 100644
--- a/extensions/native/circuit/src/fri/tests.rs
+++ b/extensions/native/circuit/src/fri/tests.rs
@@ -7,20 +7,23 @@ use openvm_stark_backend::{
     utils::disable_debug_builder,
     verifier::VerificationError,
 };
-use openvm_stark_sdk::{p3_baby_bear::BabyBear, utils::create_seeded_rng};
+use openvm_stark_sdk::{
+    config::{baby_bear_poseidon2::BabyBearPoseidon2Engine, FriParameters},
+    engine::StarkFriEngine,
+    p3_baby_bear::BabyBear,
+    utils::create_seeded_rng,
+};
 use rand::Rng;
 
-use super::{
-    super::field_extension::FieldExtension, elem_to_ext, FriReducedOpeningChip,
-    FriReducedOpeningCols, EXT_DEG,
-};
+use super::{super::field_extension::FieldExtension, elem_to_ext, FriReducedOpeningChip, EXT_DEG};
+use crate::OVERALL_WIDTH;
 
 fn compute_fri_mat_opening<F: Field>(
     alpha: [F; EXT_DEG],
-    mut alpha_pow: [F; EXT_DEG],
     a: &[F],
     b: &[[F; EXT_DEG]],
-) -> ([F; EXT_DEG], [F; EXT_DEG]) {
+) -> [F; EXT_DEG] {
+    let mut alpha_pow: [F; EXT_DEG] = elem_to_ext(F::ONE);
     let mut result = [F::ZERO; EXT_DEG];
     for (&a, &b) in a.iter().zip_eq(b) {
         result = FieldExtension::add(
@@ -29,7 +32,7 @@ fn compute_fri_mat_opening<F: Field>(
         );
         alpha_pow = FieldExtension::multiply(alpha, alpha_pow);
     }
-    (alpha_pow, result)
+    result
 }
 
 #[test]
@@ -60,19 +63,17 @@ fn fri_mat_opening_air_test() {
     for _ in 0..num_ops {
         let alpha = gen_ext!();
         let length = rng.gen_range(length_range());
-        let alpha_pow_initial = gen_ext!();
         let a = (0..length)
             .map(|_| BabyBear::from_canonical_u32(rng.gen_range(elem_range())))
             .collect_vec();
         let b = (0..length).map(|_| gen_ext!()).collect_vec();
 
-        let (alpha_pow_final, result) = compute_fri_mat_opening(alpha, alpha_pow_initial, &a, &b);
+        let result = compute_fri_mat_opening(alpha, &a, &b);
 
         let alpha_pointer = gen_pointer(&mut rng, 4);
         let length_pointer = gen_pointer(&mut rng, 1);
         let a_pointer_pointer = gen_pointer(&mut rng, 1);
         let b_pointer_pointer = gen_pointer(&mut rng, 1);
-        let alpha_pow_pointer = gen_pointer(&mut rng, 4);
         let result_pointer = gen_pointer(&mut rng, 4);
         let a_pointer = gen_pointer(&mut rng, 1);
         let b_pointer = gen_pointer(&mut rng, 4);
@@ -100,7 +101,6 @@ fn fri_mat_opening_air_test() {
             b_pointer_pointer,
             BabyBear::from_canonical_usize(b_pointer),
         );
-        tester.write(address_space, alpha_pow_pointer, alpha_pow_initial);
         for i in 0..length {
             tester.write_cell(address_space, a_pointer + i, a[i]);
             tester.write(address_space, b_pointer + (4 * i), b[i]);
@@ -117,19 +117,21 @@ fn fri_mat_opening_air_test() {
                     address_space,
                     length_pointer,
                     alpha_pointer,
-                    alpha_pow_pointer,
                 ],
             ),
         );
-        assert_eq!(
-            alpha_pow_final,
-            tester.read(address_space, alpha_pow_pointer)
-        );
         assert_eq!(result, tester.read(address_space, result_pointer));
     }
 
-    let mut tester = tester.build().load(chip).finalize();
-    tester.simple_test().expect("Verification failed");
+    let mut tester = tester.build_babybear_poseidon2().load(chip).finalize();
+    // Degree needs >= 4
+    tester
+        .test::<BabyBearPoseidon2Engine, _>(|| {
+            BabyBearPoseidon2Engine::new(
+                FriParameters::standard_with_100_bits_conjectured_security(2),
+            )
+        })
+        .expect("Verification failed");
 
     disable_debug_builder();
     // negative test pranking each value
@@ -137,7 +139,7 @@ fn fri_mat_opening_air_test() {
         // TODO: better way to modify existing traces in tester
         let trace = tester.air_proof_inputs[2].raw.common_main.as_mut().unwrap();
         let old_trace = trace.clone();
-        for width in 0..FriReducedOpeningCols::<BabyBear>::width()
+        for width in 0..OVERALL_WIDTH
         /* num operands */
         {
             let prank_value = BabyBear::from_canonical_u32(rng.gen_range(1..=100));
@@ -146,7 +148,13 @@ fn fri_mat_opening_air_test() {
 
         // Run a test after pranking each row
         assert_eq!(
-            tester.simple_test().err(),
+            tester
+                .test::<BabyBearPoseidon2Engine, _>(|| {
+                    BabyBearPoseidon2Engine::new(
+                        FriParameters::standard_with_100_bits_conjectured_security(2),
+                    )
+                })
+                .err(),
             Some(VerificationError::OodEvaluationMismatch),
             "Expected constraint to fail"
         );
diff --git a/extensions/native/circuit/src/utils.rs b/extensions/native/circuit/src/utils.rs
index 0c188998c7..09635e1dc3 100644
--- a/extensions/native/circuit/src/utils.rs
+++ b/extensions/native/circuit/src/utils.rs
@@ -7,7 +7,8 @@ use crate::{Native, NativeConfig};
 pub fn execute_program(program: Program<BabyBear>, input_stream: impl Into<Streams<BabyBear>>) {
     let system_config = SystemConfig::default()
         .with_public_values(4)
-        .with_max_segment_len((1 << 25) - 100);
+        .with_max_segment_len((1 << 25) - 100)
+        .with_profiling();
     let config = NativeConfig::new(system_config, Native);
     let executor = VmExecutor::<BabyBear, NativeConfig>::new(config);
 
diff --git a/extensions/native/compiler/src/asm/compiler.rs b/extensions/native/compiler/src/asm/compiler.rs
index 3d2911165e..086136fb0d 100644
--- a/extensions/native/compiler/src/asm/compiler.rs
+++ b/extensions/native/compiler/src/asm/compiler.rs
@@ -547,7 +547,7 @@ impl<F: PrimeField32 + TwoAdicField, EF: ExtensionField<F> + TwoAdicField> AsmCo
                 DslIr::Halt => {
                     self.push(AsmInstruction::Halt, debug_info);
                 }
-                DslIr::FriReducedOpening(alpha, curr_alpha_pow, at_x_array, at_z_array, result) => {
+                DslIr::FriReducedOpening(alpha, at_x_array, at_z_array, result) => {
                     self.push(
                         AsmInstruction::FriReducedOpening(
                             at_x_array.ptr().fp(),
@@ -560,7 +560,6 @@ impl<F: PrimeField32 + TwoAdicField, EF: ExtensionField<F> + TwoAdicField> AsmCo
                                 Usize::Var(len) => len.fp(),
                             },
                             alpha.fp(),
-                            curr_alpha_pow.fp(),
                         ),
                         debug_info,
                     );
diff --git a/extensions/native/compiler/src/asm/instruction.rs b/extensions/native/compiler/src/asm/instruction.rs
index 665ba70eb8..bc60bd3115 100644
--- a/extensions/native/compiler/src/asm/instruction.rs
+++ b/extensions/native/compiler/src/asm/instruction.rs
@@ -116,8 +116,8 @@ pub enum AsmInstruction<F, EF> {
     /// (a, b, c) are memory pointers to (dst, lhs, rhs)
     Poseidon2Compress(i32, i32, i32),
 
-    /// (a, b, res, len, alpha, alpha_pow)
-    FriReducedOpening(i32, i32, i32, i32, i32, i32),
+    /// (a, b, res, len, alpha)
+    FriReducedOpening(i32, i32, i32, i32, i32),
 
     /// (dim, opened, opened_length, sibling, index, commit)
     /// opened values are field elements
@@ -358,11 +358,11 @@ impl<F: PrimeField32, EF: ExtensionField<F>> AsmInstruction<F, EF> {
             AsmInstruction::CycleTrackerEnd() => {
                 write!(f, "cycle_tracker_end")
             }
-            AsmInstruction::FriReducedOpening(a, b, res, len, alpha, alpha_pow) => {
+            AsmInstruction::FriReducedOpening(a, b, res, len, alpha) => {
                 write!(
                     f,
-                    "fri_mat_opening ({})fp, ({})fp, ({})fp, ({})fp, ({})fp, ({})fp",
-                    a, b, res, len, alpha, alpha_pow
+                    "fri_mat_opening ({})fp, ({})fp, ({})fp, ({})fp, ({})fp",
+                    a, b, res, len, alpha
                 )
             }
             AsmInstruction::VerifyBatchFelt(dim, opened, opened_length, sibling, index, commit) => {
diff --git a/extensions/native/compiler/src/conversion/mod.rs b/extensions/native/compiler/src/conversion/mod.rs
index c3f7c5365c..80029c7c38 100644
--- a/extensions/native/compiler/src/conversion/mod.rs
+++ b/extensions/native/compiler/src/conversion/mod.rs
@@ -464,7 +464,7 @@ fn convert_instruction<F: PrimeField32, EF: ExtensionField<F>>(
             AS::Native,
             AS::Native,
         )],
-        AsmInstruction::FriReducedOpening(a, b, res, len, alpha, alpha_pow) => vec![Instruction {
+        AsmInstruction::FriReducedOpening(a, b, res, len, alpha) => vec![Instruction {
             opcode: options.opcode_with_offset(FriOpcode::FRI_REDUCED_OPENING),
             a: i32_f(a),
             b: i32_f(b),
@@ -472,7 +472,7 @@ fn convert_instruction<F: PrimeField32, EF: ExtensionField<F>>(
             d: AS::Native.to_field(),
             e: i32_f(len),
             f: i32_f(alpha),
-            g: i32_f(alpha_pow),
+            g: F::ZERO,
         }],
         AsmInstruction::VerifyBatchFelt(dim, opened, opened_length, sibling, index, commit) => vec![Instruction {
             opcode: options.opcode_with_offset(VerifyBatchOpcode::VERIFY_BATCH),
diff --git a/extensions/native/compiler/src/ir/fri.rs b/extensions/native/compiler/src/ir/fri.rs
index 91de10762d..f96bb14de0 100644
--- a/extensions/native/compiler/src/ir/fri.rs
+++ b/extensions/native/compiler/src/ir/fri.rs
@@ -4,14 +4,12 @@ impl<C: Config> Builder<C> {
     pub fn fri_single_reduced_opening_eval(
         &mut self,
         alpha: Ext<C::F, C::EF>,
-        curr_alpha_pow: Ext<C::F, C::EF>,
         at_x_array: &Array<C, Felt<C::F>>,
         at_z_array: &Array<C, Ext<C::F, C::EF>>,
     ) -> Ext<C::F, C::EF> {
         let result = self.uninit();
         self.operations.push(crate::ir::DslIr::FriReducedOpening(
             alpha,
-            curr_alpha_pow,
             at_x_array.clone(),
             at_z_array.clone(),
             result,
diff --git a/extensions/native/compiler/src/ir/instructions.rs b/extensions/native/compiler/src/ir/instructions.rs
index a5d420457d..ac770fa9cd 100644
--- a/extensions/native/compiler/src/ir/instructions.rs
+++ b/extensions/native/compiler/src/ir/instructions.rs
@@ -252,9 +252,8 @@ pub enum DslIr<C: Config> {
     CircuitFeltReduce(Felt<C::F>),
     /// Halo2 only. Reduce an Ext so later computation becomes cheaper.
     CircuitExtReduce(Ext<C::F, C::EF>),
-    /// FriReducedOpening(alpha, curr_alpha_pow, at_x_array, at_z_array, result)
+    /// FriReducedOpening(alpha, at_x_array, at_z_array, result)
     FriReducedOpening(
-        Ext<C::F, C::EF>,
         Ext<C::F, C::EF>,
         Array<C, Felt<C::F>>,
         Array<C, Ext<C::F, C::EF>>,
diff --git a/extensions/native/compiler/tests/fri_ro_eval.rs b/extensions/native/compiler/tests/fri_ro_eval.rs
index 138653e087..30c4c2f726 100644
--- a/extensions/native/compiler/tests/fri_ro_eval.rs
+++ b/extensions/native/compiler/tests/fri_ro_eval.rs
@@ -53,7 +53,6 @@ fn test_single_reduced_opening_eval() {
         builder.assign(&cur_alpha_pow, cur_alpha_pow * alpha);
     });
     let expected_result = cur_ro;
-    let expected_final_alpha_pow = cur_alpha_pow;
 
     // prints don't work?
     /*builder.print_e(expected_result);
@@ -64,19 +63,15 @@ fn test_single_reduced_opening_eval() {
     let ext_1210 = builder.constant(EF::from_base_slice(&[F::ONE, F::TWO, F::ONE, F::ZERO]));
     builder.print_e(ext_1210);*/
 
-    let cur_alpha_pow: Ext<_, _> = builder.uninit();
     builder.assign(&cur_alpha_pow, initial_alpha_pow);
-    let single_ro_eval_res =
-        builder.fri_single_reduced_opening_eval(alpha, cur_alpha_pow, &mat_opening, &ps_at_z);
-    let actual_final_alpha_pow = cur_alpha_pow;
+    let single_ro_eval_res = builder.fri_single_reduced_opening_eval(alpha, &mat_opening, &ps_at_z);
     let actual_result: Ext<_, _> = builder.uninit();
-    builder.assign(&actual_result, single_ro_eval_res / (z - x));
+    builder.assign(&actual_result, single_ro_eval_res * cur_alpha_pow / (z - x));
 
     //builder.print_e(actual_result);
     //builder.print_e(actual_final_alpha_pow);
 
     builder.assert_ext_eq(expected_result, actual_result);
-    builder.assert_ext_eq(expected_final_alpha_pow, actual_final_alpha_pow);
 
     builder.halt();
 
diff --git a/extensions/native/recursion/src/fri/two_adic_pcs.rs b/extensions/native/recursion/src/fri/two_adic_pcs.rs
index c6eae873c1..28fd5393e1 100644
--- a/extensions/native/recursion/src/fri/two_adic_pcs.rs
+++ b/extensions/native/recursion/src/fri/two_adic_pcs.rs
@@ -238,17 +238,14 @@ pub fn verify_two_adic_pcs<C: Config>(
                                 builder.assign(&cur_ro, cur_ro + n / (z - x) * cur_alpha_pow);
                                 builder.assign(&cur_alpha_pow, cur_alpha_pow * mat_alpha_pow);
                             } else {
-                                // TODO: this is just for testing the correctness. Will remove later.
-                                let expected_alpha_pow: Ext<_, _> =
-                                    builder.eval(cur_alpha_pow * mat_alpha_pow);
                                 let mat_ro = builder.fri_single_reduced_opening_eval(
                                     alpha,
-                                    cur_alpha_pow,
                                     &mat_opening,
                                     &ps_at_z,
                                 );
-                                builder.assert_ext_eq(expected_alpha_pow, cur_alpha_pow);
-                                builder.assign(&cur_ro, cur_ro + (mat_ro / (z - x)));
+                                builder
+                                    .assign(&cur_ro, cur_ro + (mat_ro * cur_alpha_pow / (z - x)));
+                                builder.assign(&cur_alpha_pow, cur_alpha_pow * mat_alpha_pow);
                             }
 
                             builder.cycle_tracker_end("single-reduced-opening-eval");
diff --git a/extensions/native/recursion/src/testing_utils.rs b/extensions/native/recursion/src/testing_utils.rs
index 3dfd51c423..4d6e484304 100644
--- a/extensions/native/recursion/src/testing_utils.rs
+++ b/extensions/native/recursion/src/testing_utils.rs
@@ -65,13 +65,10 @@ pub mod inner {
         test_proof_input: ProofInputForTest<BabyBearPoseidon2Config>,
         fri_params: FriParameters,
     ) {
-        let ProofInputForTest {
-            per_air: air_proof_inputs,
-        } = test_proof_input;
-        let vparams =
-            <BabyBearPoseidon2Engine as StarkFriEngine<BabyBearPoseidon2Config>>::run_test_fast(
-                air_proof_inputs,
-            )
+        let vparams = test_proof_input
+            .run_test(&BabyBearPoseidon2Engine::new(
+                FriParameters::standard_with_100_bits_conjectured_security(2),
+            ))
             .unwrap();
 
         recursive_stark_test(

From 51b1611eec0d7bd0ca61031b0da65290ece7e561 Mon Sep 17 00:00:00 2001
From: Xinding Wei <weixinding@gmail.com>
Date: Thu, 23 Jan 2025 13:08:20 +0800
Subject: [PATCH 2/6] Remove redundant codes & update ISA docs

---
 docs/specs/ISA.md                             | 255 ++++++++++++------
 extensions/native/circuit/src/utils.rs        |   3 +-
 .../native/compiler/tests/fri_ro_eval.rs      |  13 -
 3 files changed, 171 insertions(+), 100 deletions(-)

diff --git a/docs/specs/ISA.md b/docs/specs/ISA.md
index 881bc22bf0..67128173a1 100644
--- a/docs/specs/ISA.md
+++ b/docs/specs/ISA.md
@@ -4,17 +4,23 @@
 
 ## Instruction format
 
-Instructions are encoded as a global opcode (field element) followed by `NUM_OPERANDS = 7` operands (field elements): `opcode, a, b, c, d, e, f, g`. An instruction does not need to use all operands, and trailing unused operands should be set to zero.
+Instructions are encoded as a global opcode (field element) followed by `NUM_OPERANDS = 7` operands (field elements):
+`opcode, a, b, c, d, e, f, g`. An instruction does not need to use all operands, and trailing unused operands should be
+set to zero.
 
 ## Program ROM
 
 Our VM operates under the Harvard architecture, where program code is stored separately from main
 memory. Code is addressed by any field element in range `[0, 2^PC_BITS)` where `PC_BITS = 30`.
 
-There is a single special purpose register `pc` for the program counter of type `F` which stores the location of the instruction being executed.
+There is a single special purpose register `pc` for the program counter of type `F` which stores the location of the
+instruction being executed.
 (We may extend `pc` to multiple field elements to increase the program address space size in the future.)
 
-The program code is committed as a cached trace. The validity of the program code and its cached trace must be checked outside of ZK. A valid program code must have all instructions stored at locations in range `[0, 2^PC_BITS)`. While the instructions can be stored at any locations, we will by default follow RISC-V in storing instructions at multiples of `DEFAULT_PC_STEP = 4`.
+The program code is committed as a cached trace. The validity of the program code and its cached trace must be checked
+outside of ZK. A valid program code must have all instructions stored at locations in range `[0, 2^PC_BITS)`. While the
+instructions can be stored at any locations, we will by default follow RISC-V in storing instructions at multiples of
+`DEFAULT_PC_STEP = 4`.
 
 ## Memory
 
@@ -22,8 +28,10 @@ Memory is comprised of addressable cells, each cell containing a single field el
 Instructions of the VM may access (read or write) memory
 as single cells or as a contiguous list of cells. Such a contiguous list is called a _block_, and
 a memory access (read/write) to a block is a _block access_.
-The architecture distinguishes between block accesses of different sizes as this has significant performance implications.
-The number of cells in a block access is restricted to powers of two, of which the following are supported: 1, 2, 4, 8, 16, 32, 64. Block accesses do not need to be
+The architecture distinguishes between block accesses of different sizes as this has significant performance
+implications.
+The number of cells in a block access is restricted to powers of two, of which the following are supported: 1, 2, 4, 8,
+16, 32, 64. Block accesses do not need to be
 aligned, i.e., a block access of size $N$ can start from a pointer with value not dividing $N$ (as an integer).
 
 We also leave open the possibility in the future that different address spaces (see below) can be dedicated to handling
@@ -51,12 +59,18 @@ addressable cells. Registers are represented using the [LIMB] format with `LIMB_
 ## Hints
 
 The `input_stream` is a private non-interactive queue of vectors of field elements which is provided at the start of
-runtime execution. The `hint_stream` is a queue of values that can be written to memory by calling the `HINT_STOREW_RV32` and `HINT_STORE_RV32` instructions. The `hint_stream` is populated via [phantom sub-instructions](#phantom-sub-instructions) such
+runtime execution. The `hint_stream` is a queue of values that can be written to memory by calling the
+`HINT_STOREW_RV32` and `HINT_STORE_RV32` instructions. The `hint_stream` is populated
+via [phantom sub-instructions](#phantom-sub-instructions) such
 as `HINT_INPUT` and `HINT_BITS`.
 
 ## Public Outputs
 
-By default, all inputs to the program are private (see [Hints](#hints)). At the end of program execution, a public list of user-specified field elements is output. The length of the list is a VM configuration parameter, and the list is initialized with zero elements. The VM has two configuration modes: continuations enabled and continuations disabled. When continuations are enabled, users can store values into the public output list via the `REVEAL_RV32` instruction. When continuations are disabled, users can store values into the public output list via the `PUBLISH` instruction.
+By default, all inputs to the program are private (see [Hints](#hints)). At the end of program execution, a public list
+of user-specified field elements is output. The length of the list is a VM configuration parameter, and the list is
+initialized with zero elements. The VM has two configuration modes: continuations enabled and continuations disabled.
+When continuations are enabled, users can store values into the public output list via the `REVEAL_RV32` instruction.
+When continuations are disabled, users can store values into the public output list via the `PUBLISH` instruction.
 
 ## Notation
 
@@ -81,7 +95,7 @@ We support different address spaces of memory.
 We will always have the following fixed address spaces:
 
 | Address Space | Name          |
-| ------------- | ------------- |
+|---------------|---------------|
 | `0`           | Immediates    |
 | `1`           | Registers     |
 | `2`           | User Memory   |
@@ -90,10 +104,12 @@ We will always have the following fixed address spaces:
 
 Address space `0` is not a real address space: it is reserved for denoting immediates: We define `[a]_0 = a`.
 
-The number of address spaces supported is a configurable constant of the VM. The address spaces in `[as_offset, as_offset + 2^as_height)` are supported. By default `as_offset = 1` to preclude address space `0`.
+The number of address spaces supported is a configurable constant of the VM. The address spaces in
+`[as_offset, as_offset + 2^as_height)` are supported. By default `as_offset = 1` to preclude address space `0`.
 
 The size (= number of pointers) of each address space is also a configurable constant of the VM.
-The pointers can have values in `[0, 2^pointer_max_bits)`. We require `as_height, pointer_max_bits <= F::bits() - 2` due to a sorting argument.
+The pointers can have values in `[0, 2^pointer_max_bits)`. We require `as_height, pointer_max_bits <= F::bits() - 2` due
+to a sorting argument.
 
 > A memory cell in any address space is always a field element, but the VM _may_ later impose additional bit size
 > constraints on certain address spaces (e.g., everything in address space `2` must be a byte).
@@ -103,7 +119,7 @@ The pointers can have values in `[0, 2^pointer_max_bits)`. We require `as_height
 OpenVM depends on the following parameters, some of which are fixed and some of which are configurable:
 
 | Name               | Description                                                        | Constraints                                                           |
-| ------------------ | ------------------------------------------------------------------ | --------------------------------------------------------------------- |
+|--------------------|--------------------------------------------------------------------|-----------------------------------------------------------------------|
 | `F`                | The field over which the VM operates.                              | Currently fixed to Baby Bear, but may change to another 31-bit field. |
 | `PC_BITS`          | The number of bits in the program counter.                         | Fixed to 30.                                                          |
 | `DEFAULT_PC_STEP`  | The default program counter step size.                             | Fixed to 4.                                                           |
@@ -114,7 +130,8 @@ OpenVM depends on the following parameters, some of which are fixed and some of
 
 # OpenVM Instruction Set
 
-All instruction types are divided into classes, mostly based on purpose and nature of the operation (e.g., ALU instructions, U256 instructions, Modular arithmetic instructions, etc).
+All instruction types are divided into classes, mostly based on purpose and nature of the operation (e.g., ALU
+instructions, U256 instructions, Modular arithmetic instructions, etc).
 Instructions within each class are usually handled by the same chip, but this is not always the case (for example, if
 one of the operations requires much more trace columns than all others).
 Internally, certain non-intersecting ranges of opcodes (which are internally just a `usize`) are distributed among the
@@ -123,12 +140,15 @@ enabled operation classes, so that there is no collision between the classes.
 Operands marked with `_` are not used and should be set to zero. Trailing unused operands should also be set to zero.
 Unless otherwise specified, instructions will by default set `to_pc = from_pc + DEFAULT_PC_STEP`.
 
-The architecture is independent of RISC-V, but for transpilation purposes we specify additional information such as the RISC-V Opcode (7-bit), `funct3` (3-bit), and `funct7` (7-bit) or `imm` fields depending on the RISC-V instruction type.
+The architecture is independent of RISC-V, but for transpilation purposes we specify additional information such as the
+RISC-V Opcode (7-bit), `funct3` (3-bit), and `funct7` (7-bit) or `imm` fields depending on the RISC-V instruction type.
 
 We will use the following notation:
 
-- `u32(x)` where `x: [F; 4]` consists of 4 bytes will mean the casting from little-endian bytes in 2's complement to unsigned 32-bit integer.
-- `i32(x)` where `x: [F; 4]` consists of 4 bytes will mean the casting from little-endian bytes in 2's complement to signed 32-bit integer.
+- `u32(x)` where `x: [F; 4]` consists of 4 bytes will mean the casting from little-endian bytes in 2's complement to
+  unsigned 32-bit integer.
+- `i32(x)` where `x: [F; 4]` consists of 4 bytes will mean the casting from little-endian bytes in 2's complement to
+  signed 32-bit integer.
 - `sign_extend` means sign extension of bits in 2's complement.
 - `i32(c)` where `c` is a field element will mean `c.as_canonical_u32()` if `c.as_canonical_u32() < F::modulus() / 2` or
   `c.as_canonical_u32() - F::modulus() as i32` otherwise.
@@ -137,27 +157,35 @@ We will use the following notation:
 ## System
 
 | Name      | Operands  | Description                                                                                                        |
-| --------- | --------- | ------------------------------------------------------------------------------------------------------------------ |
+|-----------|-----------|--------------------------------------------------------------------------------------------------------------------|
 | TERMINATE | `_, _, c` | Terminates execution with exit code `c`. Sets `to_pc = from_pc`.                                                   |
 | PHANTOM   | `_, _, c` | Sets `to_pc = from_pc + DEFAULT_PC_STEP`. The operand `c` determines which phantom instruction (see below) is run. |
 
 ## RV32IM Support
 
-While the architecture allows creation of VMs without RISC-V support, we define a set of instructions that are meant to be transpiled from RISC-V instructions such that the resulting VM is able to run RISC-V ELF binaries. We use \_RV32 to specify that the operand parsing is specifically targeting 32-bit RISC-V registers.
+While the architecture allows creation of VMs without RISC-V support, we define a set of instructions that are meant to
+be transpiled from RISC-V instructions such that the resulting VM is able to run RISC-V ELF binaries. We use \_RV32 to
+specify that the operand parsing is specifically targeting 32-bit RISC-V registers.
 
-All instructions below assume that all memory cells in address spaces `1` and `2` are field elements that are in range `[0, 2^LIMB_BITS)` where `LIMB_BITS = 8`. The instructions must all ensure that any writes will uphold this constraint.
+All instructions below assume that all memory cells in address spaces `1` and `2` are field elements that are in range
+`[0, 2^LIMB_BITS)` where `LIMB_BITS = 8`. The instructions must all ensure that any writes will uphold this constraint.
 
-`x0` handling: Unlike in RISC-V, the instructions will **not** discard writes to `[0:4]_1` (corresponding to register `x0`). A valid transpilation of a RISC-V program can be inspected to have the properties:
+`x0` handling: Unlike in RISC-V, the instructions will **not** discard writes to `[0:4]_1` (corresponding to register
+`x0`). A valid transpilation of a RISC-V program can be inspected to have the properties:
 
 1. `[0:4]_1` has all zeroes in initial memory.
 2. No instruction in the program writes to `[0:4]_1`.
 
 ### ALU
 
-In all ALU instructions, the operand `d` is fixed to be `1`. The operand `e` must be either `0` or `1`. When `e = 0`, the `c` operand is expected to be of the form `F::from_canonical_u32(c_i16 as i24 as u24 as u32)` where `c_i16` is type `i16`. In other words we take signed 16-bits in two's complement, sign extend to 24-bits, consider the 24-bits as unsigned integer, and convert to field element. In the instructions below, `[c:4]_0` should be interpreted as `c_i16 as i32` sign extended to 32-bits.
+In all ALU instructions, the operand `d` is fixed to be `1`. The operand `e` must be either `0` or `1`. When `e = 0`,
+the `c` operand is expected to be of the form `F::from_canonical_u32(c_i16 as i24 as u24 as u32)` where `c_i16` is type
+`i16`. In other words we take signed 16-bits in two's complement, sign extend to 24-bits, consider the 24-bits as
+unsigned integer, and convert to field element. In the instructions below, `[c:4]_0` should be interpreted as
+`c_i16 as i32` sign extended to 32-bits.
 
 | Name      | Operands    | Description                                                                                              |
-| --------- | ----------- | -------------------------------------------------------------------------------------------------------- |
+|-----------|-------------|----------------------------------------------------------------------------------------------------------|
 | ADD_RV32  | `a,b,c,1,e` | `[a:4]_1 = [b:4]_1 + [c:4]_e`. Overflow is ignored and the lower 32-bits are written to the destination. |
 | SUB_RV32  | `a,b,c,1,e` | `[a:4]_1 = [b:4]_1 - [c:4]_e`. Overflow is ignored and the lower 32-bits are written to the destination. |
 | XOR_RV32  | `a,b,c,1,e` | `[a:4]_1 = [b:4]_1 ^ [c:4]_e`                                                                            |
@@ -173,13 +201,16 @@ In all ALU instructions, the operand `d` is fixed to be `1`. The operand `e` mus
 
 For all load/store instructions, we assume the operand `c` is in `[0, 2^16)`, and we fix address spaces `d = 1`.
 The address space `e` can be any [valid address space](#addressing).
-We will use shorthand `r32{c}(b) := i32([b:4]_1) + sign_extend(decompose(c)[0:2])` as `i32`. This means we interpret `c` as the 2's complement encoding of a 16-bit integer, sign extend it to 32-bits, and then perform signed 32-bit addition with the value of the register `[b:4]_1`.
-Memory access to `ptr: i32` is only valid if `0 <= ptr < 2^addr_max_bits`, in which case it is an access to `F::from_canonical_u32(ptr as u32)`.
+We will use shorthand `r32{c}(b) := i32([b:4]_1) + sign_extend(decompose(c)[0:2])` as `i32`. This means we interpret `c`
+as the 2's complement encoding of a 16-bit integer, sign extend it to 32-bits, and then perform signed 32-bit addition
+with the value of the register `[b:4]_1`.
+Memory access to `ptr: i32` is only valid if `0 <= ptr < 2^addr_max_bits`, in which case it is an access to
+`F::from_canonical_u32(ptr as u32)`.
 
 All load/store instructions always do block accesses of block size `4`, even for LOADB_RV32, STOREB_RV32.
 
 | Name        | Operands    | Description                                                                                                                    |
-| ----------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ |
+|-------------|-------------|--------------------------------------------------------------------------------------------------------------------------------|
 | LOADB_RV32  | `a,b,c,1,e` | `[a:4]_1 = sign_extend([r32{c}(b):1]_e)` Must sign-extend the byte read from memory, which is represented in 2’s complement.   |
 | LOADH_RV32  | `a,b,c,1,e` | `[a:4]_1 = sign_extend([r32{c}(b):2]_e)` Must sign-extend the number read from memory, which is represented in 2’s complement. |
 | LOADW_RV32  | `a,b,c,1,e` | `[a:4]_1 = [r32{c}(b):4]_e`                                                                                                    |
@@ -194,7 +225,7 @@ All load/store instructions always do block accesses of block size `4`, even for
 For branch instructions, we fix `d = e = 1`. For jump instructions, we fix `d = 1`.
 
 | Name       | Operands      | Description                                                                                                                                                                                                                                                                                                                       |
-| ---------- | ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|------------|---------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | BEQ_RV32   | `a,b,c,1,1`   | `if([a:4]_1 == [b:4]_1) pc += c`                                                                                                                                                                                                                                                                                                  |
 | BNE_RV32   | `a,b,c,1,1`   | `if([a:4]_1 != [b:4]_1) pc += c`                                                                                                                                                                                                                                                                                                  |
 | BLT_RV32   | `a,b,c,1,1`   | `if(i32([a:4]_1) < i32([b:4]_1)) pc += c`                                                                                                                                                                                                                                                                                         |
@@ -206,13 +237,21 @@ For branch instructions, we fix `d = e = 1`. For jump instructions, we fix `d =
 | LUI_RV32   | `a,_,c,1,_,1` | `[a:4]_1 = u32(c) << 12`. Here `i32(c)` must be in `[0, 2^20)`.                                                                                                                                                                                                                                                                   |
 | AUIPC_RV32 | `a,_,c,1,_,_` | `[a:4]_1 = decompose(pc) + (decompose(c) << 12)`. Here `i32(c)` must be in `[0, 2^20)`.                                                                                                                                                                                                                                           |
 
-For branch and JAL_RV32 instructions, the instructions assume that the operand `i32(c)` is in `[-2^24,2^24)`. The assignment `pc += c` is done as field elements.
-In valid programs, the `from_pc` is always in `[0, 2^PC_BITS)`. We will only use base field `F` satisfying `2^PC_BITS + 2*2^24 < F::modulus()` so `to_pc = from_pc + c` is only valid if `i32(from_pc) + i32(c)` is in `[0, 2^PC_BITS)`.
+For branch and JAL_RV32 instructions, the instructions assume that the operand `i32(c)` is in `[-2^24,2^24)`. The
+assignment `pc += c` is done as field elements.
+In valid programs, the `from_pc` is always in `[0, 2^PC_BITS)`. We will only use base field `F` satisfying
+`2^PC_BITS + 2*2^24 < F::modulus()` so `to_pc = from_pc + c` is only valid if `i32(from_pc) + i32(c)` is in
+`[0, 2^PC_BITS)`.
 
-For JALR_RV32, we treat `c` in `[0, 2^16)` as a raw encoding of 16-bits. Within the instruction, the 16-bits are interpreted in 2's complement and sign extended to 32-bits. Then it is added to the register value `i32([b:4]_1)`, where 32-bit overflow is ignored. The instruction is only valid if the resulting `i32` is in range `[0, 2^PC_BITS)`. The result is then cast to `u32` and then to `F` and assigned to `pc`.
+For JALR_RV32, we treat `c` in `[0, 2^16)` as a raw encoding of 16-bits. Within the instruction, the 16-bits are
+interpreted in 2's complement and sign extended to 32-bits. Then it is added to the register value `i32([b:4]_1)`, where
+32-bit overflow is ignored. The instruction is only valid if the resulting `i32` is in range `[0, 2^PC_BITS)`. The
+result is then cast to `u32` and then to `F` and assigned to `pc`.
 
-For LUI_RV32 and AUIPC_RV32, we are treating `c` in `[0, 2^20)` as a raw encoding of 20-bits. The instruction does not need to interpret whether the register is signed or unsigned.
-For AUIPC_RV32, the addition is treated as unchecked `u32` addition since that is the same as `i32` addition at the bit level.
+For LUI_RV32 and AUIPC_RV32, we are treating `c` in `[0, 2^20)` as a raw encoding of 20-bits. The instruction does not
+need to interpret whether the register is signed or unsigned.
+For AUIPC_RV32, the addition is treated as unchecked `u32` addition since that is the same as `i32` addition at the bit
+level.
 
 Note that AUIPC_RV32 does not have any condition for the register write.
 
@@ -225,12 +264,13 @@ the upper 32 bits of the full 2×32-bit product, for signed×signed, unsigned×u
 signed×unsigned multiplication respectively.
 
 DIV_RV32 and DIVU_RV32 perform signed and unsigned integer division of 32-bits by 32-bits. REM_RV32
-and REMU_RV32 provide the remainder of the corresponding division operation. Integer division is defined by `dividend = q * divisor + r` where `0 <= |r| < |divisor|` and either `sign(r) = sign(divisor)` or `r = 0`.
+and REMU_RV32 provide the remainder of the corresponding division operation. Integer division is defined by
+`dividend = q * divisor + r` where `0 <= |r| < |divisor|` and either `sign(r) = sign(divisor)` or `r = 0`.
 
 Below `x[n:m]` denotes the bits from `n` to `m` inclusive of `x`.
 
 | Name        | Operands  | Description                                                                                                                                                                                                |
-| ----------- | --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|-------------|-----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | MUL_RV32    | `a,b,c,1` | `[a:4]_1 = ([b:4]_1 * [c:4]_1)[0:3]`                                                                                                                                                                       |
 | MULH_RV32   | `a,b,c,1` | `[a:4]_1 = (sign_extend([b:4]_1) * sign_extend([c:4]_1))[4:7]`. We sign extend `b` and `c` into 8-limb (i.e. 64-bit) integers                                                                              |
 | MULHSU_RV32 | `a,b,c,1` | `[a:4]_1 = (sign_extend([b:4]_1) * zero_extend([c:4]_1))[4:7]`. We sign extend                                                                                                                             |
@@ -258,35 +298,41 @@ Currently we have no need for `ECALLBREAK`, but we include it for future use.
 RV32 intrinsics are custom OpenVM opcodes that are designed to be compatible with the RV32 architecture.
 We continue to use \_RV32 to specify that the operand parsing is specifically targeting 32-bit RISC-V registers.
 
-All instructions below assume that all memory cells in address spaces `1` and `2` are field elements that are in range `[0, 2^LIMB_BITS)` where `LIMB_BITS = 8`. The instructions must all ensure that any writes will uphold this constraint.
+All instructions below assume that all memory cells in address spaces `1` and `2` are field elements that are in range
+`[0, 2^LIMB_BITS)` where `LIMB_BITS = 8`. The instructions must all ensure that any writes will uphold this constraint.
 
-We use the same notation for `r32{c}(b) := i32([b:4]_1) + sign_extend(decompose(c)[0:2])` as in [`LOADW_RV32` and `STOREW_RV32`](#loadstore).
+We use the same notation for `r32{c}(b) := i32([b:4]_1) + sign_extend(decompose(c)[0:2])` as in [`LOADW_RV32` and
+`STOREW_RV32`](#loadstore).
 
 ### User IO
 
-| Name            | Operands    | Description                                                                                                                         |
-| --------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------------- |
+| Name             | Operands    | Description                                                                                                                         |
+|------------------|-------------|-------------------------------------------------------------------------------------------------------------------------------------|
 | HINT_STOREW_RV32 | `_,b,c,1,2` | `[r32{c}(b):4]_2 = next 4 bytes from hint stream`. Only valid if next 4 values in hint stream are bytes.                            |
-| REVEAL_RV32     | `a,b,c,1,3` | Pseudo-instruction for `STOREW_RV32 a,b,c,1,3` writing to the user IO address space `3`. Only valid when continuations are enabled. |
+| REVEAL_RV32      | `a,b,c,1,3` | Pseudo-instruction for `STOREW_RV32 a,b,c,1,3` writing to the user IO address space `3`. Only valid when continuations are enabled. |
 
 ### Hashes
 
 | Name           | Operands    | Description                                                                                                                                                              |
-| -------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+|----------------|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | KECCAK256_RV32 | `a,b,c,1,e` | `[r32{0}(a):32]_e = keccak256([r32{0}(b)..r32{0}(b)+r32{0}(c)]_e)`. Performs memory accesses with block size `4`.                                                        |
 | SHA256_RV32    | `a,b,c,1,2` | `[r32{0}(a):32]_2 = sha256([r32{0}(b)..r32{0}(b)+r32{0}(c)]_2)`. Does the necessary padding. Performs memory reads with block size `16` and writes with block size `32`. |
 
 ### 256-bit Integers
 
-The 256-bit ALU intrinsic instructions perform operations on 256-bit signed/unsigned integers where integer values are read/written from/to memory in address space `2`. The address space `2` pointer locations are obtained by reading register values in address space `1`. Note that these instructions are not the same as instructions on 256-bit registers.
+The 256-bit ALU intrinsic instructions perform operations on 256-bit signed/unsigned integers where integer values are
+read/written from/to memory in address space `2`. The address space `2` pointer locations are obtained by reading
+register values in address space `1`. Note that these instructions are not the same as instructions on 256-bit
+registers.
 
 For each instruction, the operand `d` is fixed to be `1` and `e` is fixed to be `2`.
-Each instruction performs block accesses with block size `4` in address space `1` and block size `32` in address space `2`.
+Each instruction performs block accesses with block size `4` in address space `1` and block size `32` in address space
+`2`.
 
 #### 256-bit ALU
 
 | Name         | Operands    | Description                                                                                                                          |
-| ------------ | ----------- | ------------------------------------------------------------------------------------------------------------------------------------ |
+|--------------|-------------|--------------------------------------------------------------------------------------------------------------------------------------|
 | ADD256_RV32  | `a,b,c,1,2` | `[r32{0}(a):32]_2 = [r32{0}(b):32]_2 + [r32{0}(c):32]_2`. Overflow is ignored and the lower 256-bits are written to the destination. |
 | SUB256_RV32  | `a,b,c,1,2` | `[r32{0}(a):32]_2 = [r32{0}(b):32]_2 - [r32{0}(c):32]_2`. Overflow is ignored and the lower 256-bits are written to the destination. |
 | XOR256_RV32  | `a,b,c,1,2` | `[r32{0}(a):32]_2 = [r32{0}(b):32]_2 ^ [r32{0}(c):32]_2`                                                                             |
@@ -301,7 +347,7 @@ Each instruction performs block accesses with block size `4` in address space `1
 #### 256-bit Branch
 
 | Name         | Operands    | Description                                                    |
-| ------------ | ----------- | -------------------------------------------------------------- |
+|--------------|-------------|----------------------------------------------------------------|
 | BEQ256_RV32  | `a,b,c,1,2` | `if([r32{0}(a):32]_2 == [r32{0}(b):32]_2) pc += c`             |
 | BNE256_RV32  | `a,b,c,1,2` | `if([r32{0}(a):32]_2 != [r32{0}(b):32]_2) pc += c`             |
 | BLT256_RV32  | `a,b,c,1,2` | `if(i256([r32{0}(a):32]_2) < i256([r32{0}(b):32]_2)) pc += c`  |
@@ -315,22 +361,33 @@ Multiplication performs 256-bit×256-bit multiplication and writes the lower 256
 Below `x[n:m]` denotes the bits from `n` to `m` inclusive of `x`.
 
 | Name        | Operands    | Description                                                       |
-| ----------- | ----------- | ----------------------------------------------------------------- |
+|-------------|-------------|-------------------------------------------------------------------|
 | MUL256_RV32 | `a,b,c,1,2` | `[r32{0}(a):32]_2 = ([r32{0}(b):32]_2 * [r32{0}(c):32]_2)[0:255]` |
 
 ### Modular Arithmetic
 
-The VM can be configured to support intrinsic instructions for modular arithmetic. The VM configuration will specify a list of supported moduli. For each positive integer modulus `N` there will be associated configuration parameters `N::NUM_LIMBS` and `N::BLOCK_SIZE` (defined below). For each modulus `N`, the instructions below are supported.
+The VM can be configured to support intrinsic instructions for modular arithmetic. The VM configuration will specify a
+list of supported moduli. For each positive integer modulus `N` there will be associated configuration parameters
+`N::NUM_LIMBS` and `N::BLOCK_SIZE` (defined below). For each modulus `N`, the instructions below are supported.
 
-The instructions perform operations on unsigned big integers representing elements in the modulus. The big integer values are read/written from/to memory in address space `2`. The address space `2` pointer locations are obtained by reading register values in address space `1`.
+The instructions perform operations on unsigned big integers representing elements in the modulus. The big integer
+values are read/written from/to memory in address space `2`. The address space `2` pointer locations are obtained by
+reading register values in address space `1`.
 
-An element in the ring of integers modulo `N`is represented as an unsigned big integer with `N::NUM_LIMBS` limbs with each limb having `LIMB_BITS = 8` bits. For each instruction, the input elements `[r32{0}(b): N::NUM_LIMBS]_2, [r32{0}(c):N::NUM_LIMBS]_2` are assumed to be unsigned big integers in little-endian format with each limb having `LIMB_BITS` bits. However, the big integers are **not** required to be less than `N`. Under these conditions, the output element `[r32{0}(a): N::NUM_LIMBS]_2` written to memory will be an unsigned big integer of the same format that is congruent modulo `N` to the respective operation applied to the two inputs.
+An element in the ring of integers modulo `N`is represented as an unsigned big integer with `N::NUM_LIMBS` limbs with
+each limb having `LIMB_BITS = 8` bits. For each instruction, the input elements
+`[r32{0}(b): N::NUM_LIMBS]_2, [r32{0}(c):N::NUM_LIMBS]_2` are assumed to be unsigned big integers in little-endian
+format with each limb having `LIMB_BITS` bits. However, the big integers are **not** required to be less than `N`. Under
+these conditions, the output element `[r32{0}(a): N::NUM_LIMBS]_2` written to memory will be an unsigned big integer of
+the same format that is congruent modulo `N` to the respective operation applied to the two inputs.
 
 For each instruction, the operand `d` is fixed to be `1` and `e` is fixed to be `2`.
-Each instruction performs block accesses with block size `4` in address space `1` and block size `N::BLOCK_SIZE` in address space `2`, where `N::NUM_LIMBS` is divisible by `N::BLOCK_SIZE`. Recall that `N::BLOCK_SIZE` must be a power of 2.
+Each instruction performs block accesses with block size `4` in address space `1` and block size `N::BLOCK_SIZE` in
+address space `2`, where `N::NUM_LIMBS` is divisible by `N::BLOCK_SIZE`. Recall that `N::BLOCK_SIZE` must be a power of
+2.
 
 | Name                      | Operands    | Description                                                                                                                                                                                                |
-| ------------------------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|---------------------------|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | ADDMOD_RV32\<N\>          | `a,b,c,1,2` | `[r32{0}(a): N::NUM_LIMBS]_2 = [r32{0}(b): N::NUM_LIMBS]_2 + [r32{0}(c): N::NUM_LIMBS]_2 (mod N)`                                                                                                          |
 | SUBMOD_RV32\<N\>          | `a,b,c,1,2` | `[r32{0}(a): N::NUM_LIMBS]_2 = [r32{0}(b): N::NUM_LIMBS]_2 - [r32{0}(c): N::NUM_LIMBS]_2 (mod N)`                                                                                                          |
 | SETUP_ADDSUBMOD_RV32\<N\> | `a,b,c,1,2` | `assert([r32{0}(b): N::NUM_LIMBS]_2 == N)` for the chip that handles add and sub. For the sake of implementation convenience it also writes something (can be anything) into `[r32{0}(a): N::NUM_LIMBS]_2` |
@@ -340,18 +397,27 @@ Each instruction performs block accesses with block size `4` in address space `1
 
 ### Modular Branching
 
-The configuration of `N` is the same as above. For each instruction, the input elements `[r32{0}(b): N::NUM_LIMBS]_2, [r32{0}(c): N::NUM_LIMBS]_2` are assumed to be unsigned big integers in little-endian format with each limb having `LIMB_BITS` bits.
+The configuration of `N` is the same as above. For each instruction, the input elements
+`[r32{0}(b): N::NUM_LIMBS]_2, [r32{0}(c): N::NUM_LIMBS]_2` are assumed to be unsigned big integers in little-endian
+format with each limb having `LIMB_BITS` bits.
 
 | Name                    | Operands    | Description                                                                                                                                                                                                                                                                                         |
-| ----------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|-------------------------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | ISEQMOD_RV32\<N\>       | `a,b,c,1,2` | `[a:4]_1 = [r32{0}(b): N::NUM_LIMBS]_2 == [r32{0}(c): N::NUM_LIMBS]_2 (mod N) ? 1 : 0`. Enforces that `[r32{0}(b): N::NUM_LIMBS]_2, [r32{0}(c): N::NUM_LIMBS]_2` are less than `N` and then sets the register value of `[a:4]_1` to `1` or `0` depending on whether the two big integers are equal. |
 | SETUP_ISEQMOD_RV32\<N\> | `a,b,c,1,2` | `assert([r32{0}(b): N::NUM_LIMBS]_2 == N)` in the chip that handles modular equality. For the sake of implementation convenience it also writes something (can be anything) into register value of `[a:4]_1`                                                                                        |
 
 ### Short Weierstrass Elliptic Curve Arithmetic
 
-The VM can be configured to support intrinsic instructions for elliptic curves `C` in short Weierstrass form given by equation `C: y^2 = x^3 + C::B` where `C::B` is a constant of the coordinate field. We note that the definitions of the curve arithmetic operations do not depend on `C::B`. The VM configuration will specify a list of supported curves. For each short Weierstrass curve `C` there will be associated configuration parameters `C::COORD_SIZE` and `C::BLOCK_SIZE` (defined below). For each curve `C`, the instructions below are supported.
+The VM can be configured to support intrinsic instructions for elliptic curves `C` in short Weierstrass form given by
+equation `C: y^2 = x^3 + C::B` where `C::B` is a constant of the coordinate field. We note that the definitions of the
+curve arithmetic operations do not depend on `C::B`. The VM configuration will specify a list of supported curves. For
+each short Weierstrass curve `C` there will be associated configuration parameters `C::COORD_SIZE` and `C::BLOCK_SIZE` (
+defined below). For each curve `C`, the instructions below are supported.
 
-An affine curve point `EcPoint(x, y)` is a pair of `x,y` where each element is an array of `C::COORD_SIZE` elements each with `LIMB_BITS = 8` bits. When the coordinate field `C::Fp` of `C` is prime, the format of `x,y` is guaranteed to be the same as the format used in the [modular arithmetic instructions](#modular-arithmetic). A curve point will be represented as `2 * C::COORD_SIZE` contiguous cells in memory.
+An affine curve point `EcPoint(x, y)` is a pair of `x,y` where each element is an array of `C::COORD_SIZE` elements each
+with `LIMB_BITS = 8` bits. When the coordinate field `C::Fp` of `C` is prime, the format of `x,y` is guaranteed to be
+the same as the format used in the [modular arithmetic instructions](#modular-arithmetic). A curve point will be
+represented as `2 * C::COORD_SIZE` contiguous cells in memory.
 
 We use the following notation below:
 
@@ -364,7 +430,7 @@ r32_ec_point(a) -> EcPoint {
 ```
 
 | Name                 | Operands    | Description                                                                                                                                                                                                                                                                                    |
-| -------------------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|----------------------|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | SW_ADD_NE\<C\>       | `a,b,c,1,2` | Set `r32_ec_point(a) = r32_ec_point(b) + r32_ec_point(c)` (curve addition). Assumes that `r32_ec_point(b), r32_ec_point(c)` both lie on the curve and are not the identity point. Further assumes that `r32_ec_point(b).x, r32_ec_point(c).x` are not equal in the coordinate field.           |
 | SETUP_SW_ADD_NE\<C\> | `a,b,c,1,2` | `assert(r32_ec_point(b).x == C::MODULUS)` in the chip for EC ADD. For the sake of implementation convenience it also writes something (can be anything) into `[r32{0}(a): 2*C::COORD_SIZE]_2`. It is required for proper functionality that `assert(r32_ec_point(b).x != r32_ec_point(c).x)`   |
 | SW_DOUBLE\<C\>       | `a,b,_,1,2` | Set `r32_ec_point(a) = 2 * r32_ec_point(b)`. This doubles the input point. Assumes that `r32_ec_point(b)` lies on the curve and is not the identity point.                                                                                                                                     |
@@ -372,10 +438,13 @@ r32_ec_point(a) -> EcPoint {
 
 ### Complex Extension Field
 
-The VM can be configured to support intrinsic instructions for complex extension fields of prime fields. A complex extension field `Fp2` is the quadratic extension of a prime field `Fp` with irreducible polynomial `X^2 + 1`. An element in `Fp2` is a pair `c0: Fp, c1: Fp` such that `c0 + c1 u`
+The VM can be configured to support intrinsic instructions for complex extension fields of prime fields. A complex
+extension field `Fp2` is the quadratic extension of a prime field `Fp` with irreducible polynomial `X^2 + 1`. An element
+in `Fp2` is a pair `c0: Fp, c1: Fp` such that `c0 + c1 u`
 represents a point in `Fp2` where `u^2 = -1`.
 
-The VM will only be configured for `Fp2` if the modular arithmetic instructions for `Fp::MODULUS` are also configured. The memory layout of `Fp2` is then that of two concatenated `Fp` elements,
+The VM will only be configured for `Fp2` if the modular arithmetic instructions for `Fp::MODULUS` are also configured.
+The memory layout of `Fp2` is then that of two concatenated `Fp` elements,
 and the block size for memory accesses is set to equal the block size of `Fp`.
 
 We use the following notation below:
@@ -389,7 +458,7 @@ r32_fp2(a) -> Fp2 {
 ```
 
 | Name                | Operands    | Description                                                                                                                                                                  |
-| ------------------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|---------------------|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | ADD\<Fp2\>          | `a,b,c,1,2` | Set `r32_fp2(a) = r32_fp2(b) + r32_fp2(c)`                                                                                                                                   |
 | SUB\<Fp2\>          | `a,b,c,1,2` | Set `r32_fp2(a) = r32_fp2(b) - r32_fp2(c)`                                                                                                                                   |
 | SETUP_ADDSUB\<Fp2\> | `a,b,c,1,2` | `assert([r32_fp2(b).c0 == N)` for the chip that handles add and sub. For the sake of implementation convenience it also writes something (can be anything) into `r32_fp2(a)` |
@@ -400,13 +469,17 @@ r32_fp2(a) -> Fp2 {
 ### Optimal Ate Pairing
 
 The VM can be configured to enable intrinsic instructions for accelerating the optimal Ate pairing.
-Currently the supported pairing friendly elliptic curves are BN254 and BLS12-381, which both have embedding degree 12. For more detailed descriptions of the instructions, refer to [this](https://hackmd.io/NjMhWt1HTDOB7TIKmTOMFw?view). For curve `C` to be supported, the VM must have
+Currently the supported pairing friendly elliptic curves are BN254 and BLS12-381, which both have embedding degree 12.
+For more detailed descriptions of the instructions, refer to [this](https://hackmd.io/NjMhWt1HTDOB7TIKmTOMFw?view). For
+curve `C` to be supported, the VM must have
 enabled instructions for `C::Fp` and `C::Fp2`. The memory block size is `C::Fp::BLOCK_SIZE` for both reads and writes.
 
-We lay out `Fp12` in memory as `c0, ..., c5` where `c_i: Fp2` and the `Fp12` element is `c0 + c1 w + ... + c5 w^5` where `w^6 = C::XI` in `Fp2`, where `C::Xi: Fp2` is an associated constant. Both `UnevaluatedLine<Fp2>` and `EvaluatedLine<Fp2>` are laid out in memory the same as `[Fp2; 2]`.
+We lay out `Fp12` in memory as `c0, ..., c5` where `c_i: Fp2` and the `Fp12` element is `c0 + c1 w + ... + c5 w^5` where
+`w^6 = C::XI` in `Fp2`, where `C::Xi: Fp2` is an associated constant. Both `UnevaluatedLine<Fp2>` and
+`EvaluatedLine<Fp2>` are laid out in memory the same as `[Fp2; 2]`.
 
 | Name                            | Operands    | Description                                                                                                                                                                                                                                                                                                                |
-| ------------------------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|---------------------------------|-------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | MILLER_DOUBLE_STEP\<C\>         | `a,b,_,1,2` | Let `S: EcPoint<Fp2>` be read starting from `[r32{0}(b)]_2`. The output `miller_double_step(S): (EcPoint<Fp2>, UnevaluatedLine<Fp2>)` is written contiguously to memory starting at `[r32{0}(a)]_2`.                                                                                                                       |
 | MILLER_DOUBLE_AND_ADD_STEP\<C\> | `a,b,c,1,2` | Let `S: EcPoint<Fp2>` be read starting from `[r32{0}(b)]_2` and `Q: EcPoint<Fp2>` be read starting from `[r32{0}(c)]_2`. The output `miller_double_and_add_step(S, Q): (EcPoint<Fp2>, UnevaluatedLine<Fp2>, UnevaluatedLine<Fp2>)` is written contiguously to memory starting at `[r32{0}(a)]_2`.                          |
 | FP12_MUL\<C\>                   | `a,b,c,1,2` | Set `r32_fp12(a) = r32_fp12(b) * r32_fp12(c)` where `r32_fp12(a)` is 6 `Fp2` elements laid out contiguously in memory starting at `[r32{0}(a)]_2`.                                                                                                                                                                         |
@@ -418,33 +491,38 @@ We lay out `Fp12` in memory as `c0, ..., c5` where `c_i: Fp2` and the `Fp12` ele
 
 ## Native Kernel
 
-The native kernel instructions were adapted from [Valida](https://github.com/valida-xyz/valida-compiler/issues/2) with changes to the
+The native kernel instructions were adapted from [Valida](https://github.com/valida-xyz/valida-compiler/issues/2) with
+changes to the
 instruction format suggested by Max Gillet to enable easier compatibility with other existing ISAs.
 
 ### Base
 
-In the instructions below, `d,e` may be any valid address space unless otherwise specified. In particular, the immediate address space `0` is allowed for non-vectorized reads but not allowed for writes. When using immediates, we interpret `[a]_0` as the immediate value `a`. Base kernel instructions enable memory movement between address spaces.
-
-| Name           | Operands        | Description                                                                                                                                                                                                                                                                                                               |
-| -------------- | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| LOADW      | `a,b,c,d,e`     | Set `[a]_d = [[c]_d + b]_e`. Both `d, e` must be non-zero.                                                                                                                                                                                                                                                            |
-| STOREW     | `a,b,c,d,e`     | Set `[[c]_d + b]_e = [a]_d`. Both `d, e` must be non-zero.                                                                                                                                                                                                                                                            |
-| LOADW4     | `a,b,c,d,e`     | Set `[a:4]_d = [[c]_d + b:4]_e`. Both `d, e` must be non-zero.                                                                                                                                                                                                                                                            |
-| STOREW4    | `a,b,c,d,e`     | Set `[[c]_d + b:4]_e = [a:4]_d`. Both `d, e` must be non-zero.                                                                                                                                                                                                                                                            |
-| JAL            | `a,b,c,d`       | Jump to address and link: set `[a]_d = (pc + DEFAULT_PC_STEP)` and `pc = pc + b`. Here `d` must be non-zero.                                                                                                                                                                                                           |
-| BEQ\<W\>       | `a,b,c,d,e`     | If `[a:W]_d == [b:W]_e`, then set `pc = pc + c`.                                                                                                                                                                                                                                            |
-| BNE\<W\>       | `a,b,c,d,e`     | If `[a:W]_d != [b:W]_e`, then set `pc = pc + c`.                                                                                                                                                                                                                                            |
-| HINT_STOREW | `_,b,c,d,e`     | Set `[[c]_d + b]_e = next element from hint stream`. Both `d, e` must be non-zero.                                                                                                                                                                                                                                   |
-| HINT_STOREW4 | `_,b,c,d,e`     | Set `[[c]_d + b:4]_e = next 4 elements from hint stream`. Both `d, e` must be non-zero.                                                                                                                                                                                                                                   |
-| PUBLISH        | `a,b,_,d,e`     | Set the user public output at index `[a]_d` to equal `[b]_e`. Invalid if `[a]_d` is greater than or equal to the configured length of user public outputs. Only valid when continuations are disabled.                                                                                                                    |
-| CASTF          | `a,b,_,d,e`     | Cast a field element represented as `u32` into four bytes in little-endian: Set `[a:4]_d` to the unique array such that `sum_{i=0}^3 [a + i]_d * 2^{8i} = [b]_e` where `[a + i]_d < 2^8` for `i = 0..2` and `[a + 3]_d < 2^6`. This opcode constrains that `[b]_e` must be at most 30-bits. Both `d, e` must be non-zero. |
+In the instructions below, `d,e` may be any valid address space unless otherwise specified. In particular, the immediate
+address space `0` is allowed for non-vectorized reads but not allowed for writes. When using immediates, we interpret
+`[a]_0` as the immediate value `a`. Base kernel instructions enable memory movement between address spaces.
+
+| Name         | Operands    | Description                                                                                                                                                                                                                                                                                                               |
+|--------------|-------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| LOADW        | `a,b,c,d,e` | Set `[a]_d = [[c]_d + b]_e`. Both `d, e` must be non-zero.                                                                                                                                                                                                                                                                |
+| STOREW       | `a,b,c,d,e` | Set `[[c]_d + b]_e = [a]_d`. Both `d, e` must be non-zero.                                                                                                                                                                                                                                                                |
+| LOADW4       | `a,b,c,d,e` | Set `[a:4]_d = [[c]_d + b:4]_e`. Both `d, e` must be non-zero.                                                                                                                                                                                                                                                            |
+| STOREW4      | `a,b,c,d,e` | Set `[[c]_d + b:4]_e = [a:4]_d`. Both `d, e` must be non-zero.                                                                                                                                                                                                                                                            |
+| JAL          | `a,b,c,d`   | Jump to address and link: set `[a]_d = (pc + DEFAULT_PC_STEP)` and `pc = pc + b`. Here `d` must be non-zero.                                                                                                                                                                                                              |
+| BEQ\<W\>     | `a,b,c,d,e` | If `[a:W]_d == [b:W]_e`, then set `pc = pc + c`.                                                                                                                                                                                                                                                                          |
+| BNE\<W\>     | `a,b,c,d,e` | If `[a:W]_d != [b:W]_e`, then set `pc = pc + c`.                                                                                                                                                                                                                                                                          |
+| HINT_STOREW  | `_,b,c,d,e` | Set `[[c]_d + b]_e = next element from hint stream`. Both `d, e` must be non-zero.                                                                                                                                                                                                                                        |
+| HINT_STOREW4 | `_,b,c,d,e` | Set `[[c]_d + b:4]_e = next 4 elements from hint stream`. Both `d, e` must be non-zero.                                                                                                                                                                                                                                   |
+| PUBLISH      | `a,b,_,d,e` | Set the user public output at index `[a]_d` to equal `[b]_e`. Invalid if `[a]_d` is greater than or equal to the configured length of user public outputs. Only valid when continuations are disabled.                                                                                                                    |
+| CASTF        | `a,b,_,d,e` | Cast a field element represented as `u32` into four bytes in little-endian: Set `[a:4]_d` to the unique array such that `sum_{i=0}^3 [a + i]_d * 2^{8i} = [b]_e` where `[a + i]_d < 2^8` for `i = 0..2` and `[a + 3]_d < 2^6`. This opcode constrains that `[b]_e` must be at most 30-bits. Both `d, e` must be non-zero. |
 
 ### Native Field Arithmetic
 
-This instruction set does native field operations. Below, `e,f` may be any valid address space, `d` may be any valid non-zero address space. When either `e` or `f` is zero, `[b]_0` and `[c]_0` should be interpreted as the immediates `b` and `c`, respectively.
+This instruction set does native field operations. Below, `e,f` may be any valid address space, `d` may be any valid
+non-zero address space. When either `e` or `f` is zero, `[b]_0` and `[c]_0` should be interpreted as the immediates `b`
+and `c`, respectively.
 
 | Name | Operands      | Description                                               |
-| ---- | ------------- | --------------------------------------------------------- |
+|------|---------------|-----------------------------------------------------------|
 | ADDF | `a,b,c,d,e,f` | Set `[a]_d = [b]_e + [c]_f`.                              |
 | SUBF | `a,b,c,d,e,f` | Set `[a]_d = [b]_e - [c]_f`.                              |
 | MULF | `a,b,c,d,e,f` | Set `[a]_d = [b]_e * [c]_f`.                              |
@@ -454,14 +532,15 @@ This instruction set does native field operations. Below, `e,f` may be any valid
 
 #### BabyBear Quartic Extension Field
 
-This is only enabled when the native field is `BabyBear`. The quartic extension field is defined by the irreducible polynomial $x^4 - 11$ (this choice matches Plonky3, but we note that Risc0 uses the polynomial $x^4 + 11$ instead).
+This is only enabled when the native field is `BabyBear`. The quartic extension field is defined by the irreducible
+polynomial $x^4 - 11$ (this choice matches Plonky3, but we note that Risc0 uses the polynomial $x^4 + 11$ instead).
 All elements in the field extension can be represented as a vector `[a_0,a_1,a_2,a_3]` which represents the
 polynomial $a_0 + a_1x + a_2x^2 + a_3x^3$ over `BabyBear`.
 
 Below, `d,e` may be any valid non-zero address space. The instructions do block access with block size `4`.
 
 | Name    | Operands  | Description                                                                                   |
-| ------- | --------- | --------------------------------------------------------------------------------------------- |
+|---------|-----------|-----------------------------------------------------------------------------------------------|
 | FE4ADD  | `a, b, c` | Set `[a:4]_d = [b:4]_d + [c:4]_e` with vector addition.                                       |
 | FE4SUB  | `a, b, c` | Set `[a:4]_d = [b:4]_d - [c:4]_e` with vector subtraction.                                    |
 | BBE4MUL | `a, b, c` | Set `[a:4]_d = [b:4]_d * [c:4]_e` with extension field multiplication.                        |
@@ -472,10 +551,11 @@ Below, `d,e` may be any valid non-zero address space. The instructions do block
 We have special opcodes to enable different precompiled hash functions.
 Only subsets of these opcodes will be turned on depending on the VM use case.
 
-Below, `d,e` may be any valid address space, and `d,e` are both not allowed to be zero. The instructions do block access with block size `1` in address space `d` and block size `CHUNK` in address space `e`.
+Below, `d,e` may be any valid address space, and `d,e` are both not allowed to be zero. The instructions do block access
+with block size `1` in address space `d` and block size `CHUNK` in address space `e`.
 
 | Name                                                                                                                                                                                                                               | Operands    | Description                                                                                                                                                                                                                                                                                                                                                      |
-| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | **COMPRESS_POSEIDON2** `[CHUNK, PID]` <br/><br/> Here `CHUNK` and `PID` are **constants** that determine different opcodes. `PID` is an internal identifier for particular Poseidon2 constants dependent on the field (see below). | `a,b,c,d,e` | Applies the Poseidon2 compression function to the inputs `[[b]_d:CHUNK]_e` and `[[c]_d:CHUNK]_e`, writing the result to `[[a]_d:CHUNK]_e`.                                                                                                                                                                                                                       |
 | **PERM_POSEIDON2** `[WIDTH, PID]`                                                                                                                                                                                                  | `a,b,_,d,e` | Applies the Poseidon2 permutation function to `[[b]_d:WIDTH]_e` and writes the result to `[[a]_d:WIDTH]_e`. <br/><br/> Each array of `WIDTH` elements is read/written in two batches of size `CHUNK`. This is nearly the same as `COMPRESS_POSEIDON2` except that the whole input state is contiguous in memory, and the full output state is written to memory. |
 
@@ -483,7 +563,7 @@ For Poseidon2, the `PID` is just some identifier to provide domain separation be
 now we can set:
 
 | `PID` | Description                                                                                                                                                                                                                                                         |
-| ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | 0     | [`POSEIDON2_BABYBEAR_16_PARAMS`](https://github.com/HorizenLabs/poseidon2/blob/bb476b9ca38198cf5092487283c8b8c5d4317c4e/plain_implementations/src/poseidon2/poseidon2_instance_babybear.rs#L2023C20-L2023C48) but the Mat4 used is Plonky3's with a Monty reduction |
 
 and only support `CHUNK = 8` and `WIDTH = 16` in BabyBear Poseidon2 above. For this setting, the input (of size `WIDTH`)
@@ -492,20 +572,25 @@ size `CHUNK`, depending on the output size of the corresponding opcode.
 
 ### Verification
 
-We have the following special opcode tailored to optimize verification. Due to already having a large number of operands,
+We have the following special opcode tailored to optimize verification. Due to already having a large number of
+operands,
 the address space is fixed to be `AS::Native = 5`.
 
 | Name                                                                                                                                                                                                                         | Operands        | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
 |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | **VERIFY_BATCH** `[CHUNK, PID]` <br/><br/> Here `CHUNK` and `PID` are **constants** that determine different opcodes. `PID` is an internal identifier for particular Poseidon2 constants dependent on the field (see below). | `a,b,c,d,e,f,g` | Computes `mmcs::verify_batch`. In the native address space, `[a], [b], [d], [e], [f]` should be the array start pointers for the dimensions array, the opened values array (which contains more arrays), the proof (which contains arrays of length `CHUNK`) and the commitment (which is an array of length `CHUNK`). `[c]` should be the length of the opened values array (and so should be equal to the length of the dimensions array as well). `g` should be the reciprocal of the size (in field elements) of the values contained in the opened values array: if the opened values array contains field elements, `g` should be 1; if the opened values array contains extension field elements, `g` should be 1/4. |
+| **FRI_REDUCED_OPENING**                                                                                                                                                                                                      | `a,b,c,d,e,f`   | Let `length = [e]_d`, `a_ptr = [a]_d`, `b_ptr = [b]_d`, `alpha = [f:EXT_DEG]_d`. `a_ptr` is the address of Felt array `a_arr` and `b_ptr` is the address of Ext array `b_arr`. Compute `sum((b_arr[i] - a_arr[i]) * alpha ^ i)` for `i=0..length` and write the value into `[c:EXT_DEG]_d`.                                                                                                                                                                                                                                                                                                                                                                                                                                 |
 
 ## Phantom Sub-Instructions
 
 As mentioned in [System](#system), the **PHANTOM** instruction has different behavior based on the operand `c`.
-More specifically, the low 16-bits `c.as_canonical_u32() & 0xffff` are used as the discriminant to determine a phantom sub-instruction. We list the phantom sub-instructions below. Phantom sub-instructions are only allowed to use operands `a,b` and `c_upper = c.as_canonical_u32() >> 16`. Besides the description below, recall that the phantom instruction always advances the program counter by `DEFAULT_PC_STEP`.
+More specifically, the low 16-bits `c.as_canonical_u32() & 0xffff` are used as the discriminant to determine a phantom
+sub-instruction. We list the phantom sub-instructions below. Phantom sub-instructions are only allowed to use operands
+`a,b` and `c_upper = c.as_canonical_u32() >> 16`. Besides the description below, recall that the phantom instruction
+always advances the program counter by `DEFAULT_PC_STEP`.
 
 | Name                      | Discriminant | Operands      | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
-| ------------------------- | ------------ | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+|---------------------------|--------------|---------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | Nop                       | 0x00         | `_`           | Does nothing.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
 | DebugPanic                | 0x01         | `_`           | Causes the runtime to panic on the host machine and prints a backtrace if `RUST_BACKTRACE=1` is set.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
 | CtStart                   | 0x02         | `_`           | Opens a new span for tracing.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
diff --git a/extensions/native/circuit/src/utils.rs b/extensions/native/circuit/src/utils.rs
index 09635e1dc3..0c188998c7 100644
--- a/extensions/native/circuit/src/utils.rs
+++ b/extensions/native/circuit/src/utils.rs
@@ -7,8 +7,7 @@ use crate::{Native, NativeConfig};
 pub fn execute_program(program: Program<BabyBear>, input_stream: impl Into<Streams<BabyBear>>) {
     let system_config = SystemConfig::default()
         .with_public_values(4)
-        .with_max_segment_len((1 << 25) - 100)
-        .with_profiling();
+        .with_max_segment_len((1 << 25) - 100);
     let config = NativeConfig::new(system_config, Native);
     let executor = VmExecutor::<BabyBear, NativeConfig>::new(config);
 
diff --git a/extensions/native/compiler/tests/fri_ro_eval.rs b/extensions/native/compiler/tests/fri_ro_eval.rs
index 30c4c2f726..df012e8ab5 100644
--- a/extensions/native/compiler/tests/fri_ro_eval.rs
+++ b/extensions/native/compiler/tests/fri_ro_eval.rs
@@ -54,23 +54,11 @@ fn test_single_reduced_opening_eval() {
     });
     let expected_result = cur_ro;
 
-    // prints don't work?
-    /*builder.print_e(expected_result);
-    builder.print_e(expected_final_alpha_pow);
-
-    let two = builder.constant(F::TWO);
-    builder.print_f(two);
-    let ext_1210 = builder.constant(EF::from_base_slice(&[F::ONE, F::TWO, F::ONE, F::ZERO]));
-    builder.print_e(ext_1210);*/
-
     builder.assign(&cur_alpha_pow, initial_alpha_pow);
     let single_ro_eval_res = builder.fri_single_reduced_opening_eval(alpha, &mat_opening, &ps_at_z);
     let actual_result: Ext<_, _> = builder.uninit();
     builder.assign(&actual_result, single_ro_eval_res * cur_alpha_pow / (z - x));
 
-    //builder.print_e(actual_result);
-    //builder.print_e(actual_final_alpha_pow);
-
     builder.assert_ext_eq(expected_result, actual_result);
 
     builder.halt();
@@ -78,7 +66,6 @@ fn test_single_reduced_opening_eval() {
     let mut compiler = AsmCompiler::new(1);
     compiler.build(builder.operations);
     let asm_code = compiler.code();
-    // println!("{}", asm_code);
 
     let program = convert_program::<F, EF>(asm_code, CompilerOptions::default());
     execute_program(program, vec![]);

From 4fee261fa7ec545bd859ec43ad333afc8b527af4 Mon Sep 17 00:00:00 2001
From: Xinding Wei <weixinding@gmail.com>
Date: Thu, 23 Jan 2025 14:38:21 +0800
Subject: [PATCH 3/6] Reduce degree of FriReducedOpeningChip to 3

---
 extensions/native/circuit/src/fri/mod.rs | 63 ++++++++++++------------
 1 file changed, 32 insertions(+), 31 deletions(-)

diff --git a/extensions/native/circuit/src/fri/mod.rs b/extensions/native/circuit/src/fri/mod.rs
index fb3792996e..69fc28cfaa 100644
--- a/extensions/native/circuit/src/fri/mod.rs
+++ b/extensions/native/circuit/src/fri/mod.rs
@@ -108,7 +108,9 @@ const_assert_eq!(OVERALL_WIDTH, 26);
 #[repr(C)]
 #[derive(Debug, AlignedBorrow)]
 struct GeneralCols<T> {
-    enabled: T,
+    /// Whether the row is a workload row.
+    is_workload_row: T,
+    /// Whether the row is an instruction row.
     is_ins_row: T,
     timestamp: T,
 }
@@ -179,13 +181,16 @@ impl FriReducedOpeningAir {
     ) {
         let local: &GeneralCols<AB::Var> = local_slice[..GENERAL_WIDTH].borrow();
         let next: &GeneralCols<AB::Var> = next_slice[..GENERAL_WIDTH].borrow();
-        builder.assert_bool(local.enabled);
         builder.assert_bool(local.is_ins_row);
+        builder.assert_bool(local.is_workload_row);
+        // A row can either be an instruction row or a workload row.
+        builder.assert_bool(local.is_ins_row + local.is_workload_row);
         {
             // All enabled rows must be before disabled rows.
             let mut when_transition = builder.when_transition();
-            let mut when_disabled = when_transition.when_ne(local.enabled, AB::Expr::ONE);
-            when_disabled.assert_zero(next.enabled);
+            let mut when_disabled =
+                when_transition.when_ne(local.is_ins_row + local.is_workload_row, AB::Expr::ONE);
+            when_disabled.assert_zero(next.is_ins_row + next.is_workload_row);
         }
     }
 
@@ -199,8 +204,7 @@ impl FriReducedOpeningAir {
         let next: &PrefixCols<AB::Var> = next_slice[..PREFIX_WIDTH].borrow();
         let local_data = &local.prefix.data;
         let start_timestamp = next.general.timestamp;
-        let multiplicity =
-            local.prefix.general.enabled * (AB::Expr::ONE - local.prefix.general.is_ins_row);
+        let multiplicity = local.prefix.general.is_workload_row;
         // a_ptr/b_ptr/length/result
         let ptr_reads = AB::F::from_canonical_usize(4);
         // read a
@@ -211,7 +215,7 @@ impl FriReducedOpeningAir {
                 start_timestamp + ptr_reads,
                 &local.a_aux,
             )
-            .eval(builder, multiplicity.clone());
+            .eval(builder, multiplicity);
         // read b
         self.memory_bridge
             .read(
@@ -223,9 +227,7 @@ impl FriReducedOpeningAir {
             .eval(builder, multiplicity);
         {
             let mut when_transition = builder.when_transition();
-            let mut not_ins_row =
-                when_transition.when_ne(local.prefix.general.is_ins_row, AB::F::ONE);
-            let mut builder = not_ins_row.when(next.general.enabled);
+            let mut builder = when_transition.when(local.prefix.general.is_workload_row);
             // ATTENTION: degree of builder is 2
             // local.timestamp = next.timestamp + 2
             builder.assert_eq(
@@ -270,9 +272,10 @@ impl FriReducedOpeningAir {
         }
         {
             let mut when_first_row = builder.when_first_row();
-            let mut when_enabled = when_first_row.when(local.prefix.general.enabled);
+            let mut when_enabled = when_first_row
+                .when(local.prefix.general.is_ins_row + local.prefix.general.is_workload_row);
             // First row must be a workload row.
-            when_enabled.assert_zero(local.prefix.general.is_ins_row);
+            when_enabled.assert_one(local.prefix.general.is_workload_row);
             // Workload rows must start with the first element.
             when_enabled.assert_zero(local.prefix.data.idx);
             // local.result is all 0s.
@@ -292,18 +295,16 @@ impl FriReducedOpeningAir {
     ) {
         let local: &Instruction1Cols<AB::Var> = local_slice[..INS_1_WIDTH].borrow();
         let next: &Instruction2Cols<AB::Var> = next_slice[..INS_2_WIDTH].borrow();
+        // `is_ins_row` already indicates enabled.
         let mut is_ins_row = builder.when(local.prefix.general.is_ins_row);
         let mut is_first_ins = is_ins_row.when(local.prefix.a_or_is_first);
-        let mut next_enabled = is_first_ins.when(next.general.enabled);
-        // ATTENTION: degree of next_enabled is 3
-        next_enabled.assert_zero(next.is_first);
-        next_enabled.assert_one(next.general.is_ins_row);
+        // ATTENTION: degree of is_first_ins is 2
+        is_first_ins.assert_one(next.general.is_ins_row);
+        is_first_ins.assert_zero(next.is_first);
 
         let local_data = &local.prefix.data;
         let length = local.prefix.data.idx;
-        let multiplicity = local.prefix.general.enabled
-            * local.prefix.general.is_ins_row
-            * local.prefix.a_or_is_first;
+        let multiplicity = local.prefix.general.is_ins_row * local.prefix.a_or_is_first;
         let start_timestamp = local.prefix.general.timestamp;
         // 4 reads
         let write_timestamp =
@@ -381,28 +382,28 @@ impl FriReducedOpeningAir {
     ) {
         let local: &Instruction2Cols<AB::Var> = local_slice[..INS_2_WIDTH].borrow();
         let next: &WorkloadCols<AB::Var> = next_slice[..WL_WIDTH].borrow();
-
         {
             let mut last_row = builder.when_last_row();
-            let mut enabled = last_row.when(local.general.enabled);
+            let mut enabled =
+                last_row.when(local.general.is_ins_row + local.general.is_workload_row);
             // If the last row is enabled, it must be the second row of an instruction row. This
             // is a safeguard for edge cases.
             enabled.assert_one(local.general.is_ins_row);
-            enabled.assert_one(local.is_first);
+            enabled.assert_zero(local.is_first);
         }
         {
             let mut when_transition = builder.when_transition();
             let mut is_ins_row = when_transition.when(local.general.is_ins_row);
-            let mut not_first_row = is_ins_row.when_ne(local.is_first, AB::Expr::ONE);
-            // when_transition is necessary to check the next row is enabled.
-            let mut enabled = not_first_row.when(next.prefix.general.enabled);
+            let mut not_first_ins_row = is_ins_row.when_ne(local.is_first, AB::Expr::ONE);
+            // ATTENTION: degree of not_first_ins_row is 2
+            // Because all the followings assert 0, we don't need to check next.enabled.
             // The next row must be a workload row.
-            enabled.assert_zero(next.prefix.general.is_ins_row);
+            not_first_ins_row.assert_zero(next.prefix.general.is_ins_row);
             // The next row must have idx = 0.
-            enabled.assert_zero(next.prefix.data.idx);
+            not_first_ins_row.assert_zero(next.prefix.data.idx);
             // next.result is all 0s
             assert_array_eq(
-                &mut enabled,
+                &mut not_first_ins_row,
                 next.prefix.data.result,
                 [AB::Expr::ZERO; EXT_DEG],
             );
@@ -604,7 +605,7 @@ fn record_to_rows<F: PrimeField32>(
         *cols = WorkloadCols {
             prefix: PrefixCols {
                 general: GeneralCols {
-                    enabled: F::ONE,
+                    is_workload_row: F::ONE,
                     is_ins_row: F::ZERO,
                     timestamp: record.start_timestamp + F::from_canonical_usize((length - i) * 2),
                 },
@@ -635,7 +636,7 @@ fn record_to_rows<F: PrimeField32>(
         *cols = Instruction1Cols {
             prefix: PrefixCols {
                 general: GeneralCols {
-                    enabled: F::ONE,
+                    is_workload_row: F::ZERO,
                     is_ins_row: F::ONE,
                     timestamp: record.start_timestamp,
                 },
@@ -662,7 +663,7 @@ fn record_to_rows<F: PrimeField32>(
         let cols: &mut Instruction2Cols<F> = slice[start..start + INS_2_WIDTH].borrow_mut();
         *cols = Instruction2Cols {
             general: GeneralCols {
-                enabled: F::ONE,
+                is_workload_row: F::ZERO,
                 is_ins_row: F::ONE,
                 timestamp: record.start_timestamp,
             },

From ae71e736b1c3dc3029676e7a59f2618ae312ce63 Mon Sep 17 00:00:00 2001
From: Xinding Wei <weixinding@gmail.com>
Date: Thu, 23 Jan 2025 14:47:17 +0800
Subject: [PATCH 4/6] Revert degree changes

---
 crates/sdk/tests/integration_test.rs          |  4 +--
 crates/vm/src/arch/config.rs                  |  7 ++---
 crates/vm/tests/integration_test.rs           | 22 +++++++---------
 extensions/native/circuit/src/extension.rs    |  4 +--
 extensions/native/circuit/src/fri/tests.rs    | 26 +++----------------
 .../native/recursion/src/testing_utils.rs     |  4 +--
 6 files changed, 22 insertions(+), 45 deletions(-)

diff --git a/crates/sdk/tests/integration_test.rs b/crates/sdk/tests/integration_test.rs
index ef4489a633..7fa7cd7686 100644
--- a/crates/sdk/tests/integration_test.rs
+++ b/crates/sdk/tests/integration_test.rs
@@ -307,7 +307,7 @@ fn test_static_verifier_custom_pv_handler() {
 
     // Test setup
     println!("test setup");
-    let app_log_blowup = 2;
+    let app_log_blowup = 1;
     let app_config = small_test_app_config(app_log_blowup);
     let app_pk = Sdk.app_keygen(app_config.clone()).unwrap();
     let app_committed_exe = app_committed_exe_for_test(app_log_blowup);
@@ -358,7 +358,7 @@ fn test_static_verifier_custom_pv_handler() {
 
 #[test]
 fn test_e2e_proof_generation_and_verification() {
-    let app_log_blowup = 2;
+    let app_log_blowup = 1;
     let app_config = small_test_app_config(app_log_blowup);
     let app_pk = Sdk.app_keygen(app_config).unwrap();
     let params_reader = CacheHalo2ParamsReader::new_with_default_params_dir();
diff --git a/crates/vm/src/arch/config.rs b/crates/vm/src/arch/config.rs
index bc690b3fb5..2aa24a89c4 100644
--- a/crates/vm/src/arch/config.rs
+++ b/crates/vm/src/arch/config.rs
@@ -19,8 +19,9 @@ use super::{
 use crate::system::memory::BOUNDARY_AIR_OFFSET;
 
 const DEFAULT_MAX_SEGMENT_LEN: usize = (1 << 22) - 100;
-// FRI Reduced Opening Chip has degree = 4.
-const DEFAULT_MAX_CONSTRAINT_DEGREE: usize = 4;
+// sbox is decomposed to have this max degree for Poseidon2. We set to 3 so quotient_degree = 2
+// allows log_blowup = 1
+const DEFAULT_POSEIDON2_MAX_CONSTRAINT_DEGREE: usize = 3;
 /// Width of Poseidon2 VM uses.
 pub const POSEIDON2_WIDTH: usize = 16;
 /// Returns a Poseidon2 config for the VM.
@@ -167,7 +168,7 @@ impl SystemConfig {
 impl Default for SystemConfig {
     fn default() -> Self {
         Self::new(
-            DEFAULT_MAX_CONSTRAINT_DEGREE,
+            DEFAULT_POSEIDON2_MAX_CONSTRAINT_DEGREE,
             Default::default(),
             DEFAULT_MAX_NUM_PUBLIC_VALUES,
         )
diff --git a/crates/vm/tests/integration_test.rs b/crates/vm/tests/integration_test.rs
index ca3f40c034..0468806ba2 100644
--- a/crates/vm/tests/integration_test.rs
+++ b/crates/vm/tests/integration_test.rs
@@ -124,7 +124,7 @@ fn test_vm_override_executor_height() {
     ));
 
     // Test getting heights.
-    let vm_config = NativeConfig::aggregation(8, 5);
+    let vm_config = NativeConfig::aggregation(8, 3);
 
     let executor = SingleSegmentVmExecutor::new(vm_config.clone());
     let res = executor
@@ -211,7 +211,7 @@ fn test_vm_override_executor_height() {
 fn test_vm_1_optional_air() {
     // Aggregation VmConfig has Core/Poseidon2/FieldArithmetic/FieldExtension chips. The program only
     // uses Core and FieldArithmetic. All other chips should not have AIR proof inputs.
-    let config = NativeConfig::aggregation(4, 5);
+    let config = NativeConfig::aggregation(4, 3);
     let engine =
         BabyBearPoseidon2Engine::new(standard_fri_params_with_100_bits_conjectured_security(3));
     let vm = VirtualMachine::new(engine, config);
@@ -319,7 +319,7 @@ fn test_vm_initial_memory() {
         .into_iter()
         .collect();
 
-    let config = NativeConfig::aggregation(0, 5).with_continuations();
+    let config = NativeConfig::aggregation(0, 3).with_continuations();
     let exe = VmExe {
         program,
         pc_start: 0,
@@ -331,11 +331,9 @@ fn test_vm_initial_memory() {
 
 #[test]
 fn test_vm_1_persistent() {
-    let engine = BabyBearPoseidon2Engine::new(
-        FriParameters::standard_with_100_bits_conjectured_security(2),
-    );
+    let engine = BabyBearPoseidon2Engine::new(FriParameters::standard_fast());
     let config = NativeConfig {
-        system: SystemConfig::new(5, MemoryConfig::new(1, 1, 16, 10, 6, 64, 1024), 0),
+        system: SystemConfig::new(3, MemoryConfig::new(1, 1, 16, 10, 6, 64, 1024), 0),
         native: Default::default(),
     }
     .with_continuations();
@@ -445,7 +443,7 @@ fn test_vm_continuations() {
     let n = 200000;
     let program = gen_continuation_test_program(n);
     let config = NativeConfig {
-        system: SystemConfig::new(5, MemoryConfig::default(), 0).with_max_segment_len(200000),
+        system: SystemConfig::new(3, MemoryConfig::default(), 0).with_max_segment_len(200000),
         native: Default::default(),
     }
     .with_continuations();
@@ -475,13 +473,11 @@ fn test_vm_continuations_recover_state() {
     let n = 2000;
     let program = gen_continuation_test_program(n);
     let config = NativeConfig {
-        system: SystemConfig::new(5, MemoryConfig::default(), 0).with_max_segment_len(500),
+        system: SystemConfig::new(3, MemoryConfig::default(), 0).with_max_segment_len(500),
         native: Default::default(),
     }
     .with_continuations();
-    let engine = BabyBearPoseidon2Engine::new(
-        FriParameters::standard_with_100_bits_conjectured_security(2),
-    );
+    let engine = BabyBearPoseidon2Engine::new(FriParameters::standard_fast());
     let vm = VirtualMachine::new(engine, config.clone());
     let pk = vm.keygen();
     let segments = vm
@@ -733,7 +729,7 @@ fn test_vm_field_extension_arithmetic_persistent() {
 
     let program = Program::from_instructions(&instructions);
     let config = NativeConfig {
-        system: SystemConfig::new(5, MemoryConfig::new(1, 1, 16, 10, 6, 64, 1024), 0)
+        system: SystemConfig::new(3, MemoryConfig::new(1, 1, 16, 10, 6, 64, 1024), 0)
             .with_continuations(),
         native: Default::default(),
     };
diff --git a/extensions/native/circuit/src/extension.rs b/extensions/native/circuit/src/extension.rs
index fee30c886c..91a91cd8a3 100644
--- a/extensions/native/circuit/src/extension.rs
+++ b/extensions/native/circuit/src/extension.rs
@@ -58,10 +58,10 @@ impl NativeConfig {
         self
     }
 
-    pub fn aggregation(num_public_values: usize, max_constraint_degree: usize) -> Self {
+    pub fn aggregation(num_public_values: usize, poseidon2_max_constraint_degree: usize) -> Self {
         Self {
             system: SystemConfig::new(
-                max_constraint_degree,
+                poseidon2_max_constraint_degree,
                 MemoryConfig {
                     max_access_adapter_n: 8,
                     ..Default::default()
diff --git a/extensions/native/circuit/src/fri/tests.rs b/extensions/native/circuit/src/fri/tests.rs
index ae757d3e8c..41395855f8 100644
--- a/extensions/native/circuit/src/fri/tests.rs
+++ b/extensions/native/circuit/src/fri/tests.rs
@@ -7,12 +7,7 @@ use openvm_stark_backend::{
     utils::disable_debug_builder,
     verifier::VerificationError,
 };
-use openvm_stark_sdk::{
-    config::{baby_bear_poseidon2::BabyBearPoseidon2Engine, FriParameters},
-    engine::StarkFriEngine,
-    p3_baby_bear::BabyBear,
-    utils::create_seeded_rng,
-};
+use openvm_stark_sdk::{engine::StarkFriEngine, p3_baby_bear::BabyBear, utils::create_seeded_rng};
 use rand::Rng;
 
 use super::{super::field_extension::FieldExtension, elem_to_ext, FriReducedOpeningChip, EXT_DEG};
@@ -123,15 +118,8 @@ fn fri_mat_opening_air_test() {
         assert_eq!(result, tester.read(address_space, result_pointer));
     }
 
-    let mut tester = tester.build_babybear_poseidon2().load(chip).finalize();
-    // Degree needs >= 4
-    tester
-        .test::<BabyBearPoseidon2Engine, _>(|| {
-            BabyBearPoseidon2Engine::new(
-                FriParameters::standard_with_100_bits_conjectured_security(2),
-            )
-        })
-        .expect("Verification failed");
+    let mut tester = tester.build().load(chip).finalize();
+    tester.simple_test().expect("Verification failed");
 
     disable_debug_builder();
     // negative test pranking each value
@@ -148,13 +136,7 @@ fn fri_mat_opening_air_test() {
 
         // Run a test after pranking each row
         assert_eq!(
-            tester
-                .test::<BabyBearPoseidon2Engine, _>(|| {
-                    BabyBearPoseidon2Engine::new(
-                        FriParameters::standard_with_100_bits_conjectured_security(2),
-                    )
-                })
-                .err(),
+            tester.simple_test().err(),
             Some(VerificationError::OodEvaluationMismatch),
             "Expected constraint to fail"
         );
diff --git a/extensions/native/recursion/src/testing_utils.rs b/extensions/native/recursion/src/testing_utils.rs
index 4d6e484304..bbdbde05a4 100644
--- a/extensions/native/recursion/src/testing_utils.rs
+++ b/extensions/native/recursion/src/testing_utils.rs
@@ -66,9 +66,7 @@ pub mod inner {
         fri_params: FriParameters,
     ) {
         let vparams = test_proof_input
-            .run_test(&BabyBearPoseidon2Engine::new(
-                FriParameters::standard_with_100_bits_conjectured_security(2),
-            ))
+            .run_test(&BabyBearPoseidon2Engine::new(FriParameters::standard_fast()))
             .unwrap();
 
         recursive_stark_test(

From d7e4e5eeed51bb68225445a9fe024c29652a5cb4 Mon Sep 17 00:00:00 2001
From: Xinding Wei <weixinding@gmail.com>
Date: Thu, 23 Jan 2025 14:52:02 +0800
Subject: [PATCH 5/6] Fix lint

---
 extensions/native/circuit/src/fri/tests.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extensions/native/circuit/src/fri/tests.rs b/extensions/native/circuit/src/fri/tests.rs
index 41395855f8..eef99be6ce 100644
--- a/extensions/native/circuit/src/fri/tests.rs
+++ b/extensions/native/circuit/src/fri/tests.rs
@@ -7,7 +7,7 @@ use openvm_stark_backend::{
     utils::disable_debug_builder,
     verifier::VerificationError,
 };
-use openvm_stark_sdk::{engine::StarkFriEngine, p3_baby_bear::BabyBear, utils::create_seeded_rng};
+use openvm_stark_sdk::{p3_baby_bear::BabyBear, utils::create_seeded_rng};
 use rand::Rng;
 
 use super::{super::field_extension::FieldExtension, elem_to_ext, FriReducedOpeningChip, EXT_DEG};

From 0ff18c5666d1b2e95a2f5d3e34b493bc5b325c00 Mon Sep 17 00:00:00 2001
From: Xinding Wei <weixinding@gmail.com>
Date: Thu, 23 Jan 2025 15:29:46 +0800
Subject: [PATCH 6/6] Update ISA docs

---
 docs/specs/ISA.md | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/docs/specs/ISA.md b/docs/specs/ISA.md
index 67128173a1..697ad53997 100644
--- a/docs/specs/ISA.md
+++ b/docs/specs/ISA.md
@@ -572,14 +572,12 @@ size `CHUNK`, depending on the output size of the corresponding opcode.
 
 ### Verification
 
-We have the following special opcode tailored to optimize verification. Due to already having a large number of
-operands,
-the address space is fixed to be `AS::Native = 5`.
-
-| Name                                                                                                                                                                                                                         | Operands        | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
-|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| **VERIFY_BATCH** `[CHUNK, PID]` <br/><br/> Here `CHUNK` and `PID` are **constants** that determine different opcodes. `PID` is an internal identifier for particular Poseidon2 constants dependent on the field (see below). | `a,b,c,d,e,f,g` | Computes `mmcs::verify_batch`. In the native address space, `[a], [b], [d], [e], [f]` should be the array start pointers for the dimensions array, the opened values array (which contains more arrays), the proof (which contains arrays of length `CHUNK`) and the commitment (which is an array of length `CHUNK`). `[c]` should be the length of the opened values array (and so should be equal to the length of the dimensions array as well). `g` should be the reciprocal of the size (in field elements) of the values contained in the opened values array: if the opened values array contains field elements, `g` should be 1; if the opened values array contains extension field elements, `g` should be 1/4. |
-| **FRI_REDUCED_OPENING**                                                                                                                                                                                                      | `a,b,c,d,e,f`   | Let `length = [e]_d`, `a_ptr = [a]_d`, `b_ptr = [b]_d`, `alpha = [f:EXT_DEG]_d`. `a_ptr` is the address of Felt array `a_arr` and `b_ptr` is the address of Ext array `b_arr`. Compute `sum((b_arr[i] - a_arr[i]) * alpha ^ i)` for `i=0..length` and write the value into `[c:EXT_DEG]_d`.                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+We have the following special opcode tailored to optimize verification.
+
+| Name                                                                                                                                                                                                                         | Operands        | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| **VERIFY_BATCH** `[CHUNK, PID]` <br/><br/> Here `CHUNK` and `PID` are **constants** that determine different opcodes. `PID` is an internal identifier for particular Poseidon2 constants dependent on the field (see below). | `a,b,c,d,e,f,g` | Due to already having a large number of operands,  the address space is fixed to be `AS::Native = 5`. Computes `mmcs::verify_batch`. In the native address space, `[a], [b], [d], [e], [f]` should be the array start pointers for the dimensions array, the opened values array (which contains more arrays), the proof (which contains arrays of length `CHUNK`) and the commitment (which is an array of length `CHUNK`). `[c]` should be the length of the opened values array (and so should be equal to the length of the dimensions array as well). `g` should be the reciprocal of the size (in field elements) of the values contained in the opened values array: if the opened values array contains field elements, `g` should be 1; if the opened values array contains extension field elements, `g` should be 1/4. |
+| **FRI_REDUCED_OPENING**                                                                                                                                                                                                      | `a,b,c,d,e,f`   | Let `length = [e]_d`, `a_ptr = [a]_d`, `b_ptr = [b]_d`, `alpha = [f:EXT_DEG]_d`. `a_ptr` is the address of Felt array `a_arr` and `b_ptr` is the address of Ext array `b_arr`. Compute `sum((b_arr[i] - a_arr[i]) * alpha ^ i)` for `i=0..length` and write the value into `[c:EXT_DEG]_d`.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
 
 ## Phantom Sub-Instructions