diff --git a/.pins/torchao-pin.txt b/.pins/torchao-pin.txt
new file mode 100644
index 000000000..a3402d40c
--- /dev/null
+++ b/.pins/torchao-pin.txt
@@ -0,0 +1 @@
+85d03de43160328eaf350e7ec3877d3d7b57da50
diff --git a/generate.py b/generate.py
index fc48375d2..57004b057 100644
--- a/generate.py
+++ b/generate.py
@@ -395,6 +395,13 @@ def decode_n_tokens(
                 )
                 input_pos += 1
                 break
+            if _i == 1:
+                t0 = time.time()
+            if _i == num_new_tokens - 2 and _i > 1:
+                t1 = time.time()
+                print(f"\nTime to generate {num_new_tokens-2} tokens: {t1-t0}")
+                print(f"\nTokens/sec to generate {num_new_tokens-2} tokens: {(num_new_tokens-2) / (t1-t0)}")
+
 
         if not encountered_eos:
             eos_token = torch.tensor(
diff --git a/quantization/quantize.py b/quantization/quantize.py
index 8efc4fa08..a1232327e 100644
--- a/quantization/quantize.py
+++ b/quantization/quantize.py
@@ -92,9 +92,21 @@ def quantize_model(
 
         try:
             # Easier to ask forgiveness than permission
-            quant_handler = ao_quantizer_class_dict[quantizer](
-                groupsize=q_kwargs["groupsize"], device=device, precision=precision
-            )
+            if quantizer == "linear:a8wlow":
+                quant_handler = ao_quantizer_class_dict[quantizer](
+                    device=device,
+                    precision=precision,
+                    bitwidth=q_kwargs.get("bitwidth", 4),
+                    groupsize=q_kwargs.get("groupsize", 128),
+                    has_weight_zeros=q_kwargs.get("has_weight_zeros", False),
+                    squeeze_unsqueeze_dim0=True,
+                )
+            else:
+                quant_handler = ao_quantizer_class_dict[quantizer](
+                    groupsize=q_kwargs["groupsize"],
+                    device=device,
+                    precision=precision,
+                )
         except TypeError as e:
             if "unexpected keyword argument 'device'" in str(e):
                 quant_handler = ao_quantizer_class_dict[quantizer](
@@ -581,3 +593,33 @@ def quantized_model(self) -> nn.Module:
     "linear:int4": Int4WeightOnlyQuantizer,
     "linear:a8w4dq": Int8DynActInt4WeightQuantizer,
 }
+
+try:
+    import importlib.util
+    import sys
+    import os
+    torchao_build_path = f"{os.getcwd()}/torchao-build"
+
+    # Load quantizer
+    torchao_experimental_spec = importlib.util.spec_from_file_location(
+        "torchao_experimental",
+        f"{torchao_build_path}/src/ao/torchao/experimental/kernels/cpu/linear/examples/torch_custom_op/torch_custom_op.py",
+    )
+    torchao_experimental = importlib.util.module_from_spec(torchao_experimental_spec)
+    sys.modules["torchao_experimental"] = torchao_experimental
+    torchao_experimental_spec.loader.exec_module(torchao_experimental)
+    from torchao_experimental import Int8DynActLowbitWeightQuantizer
+    ao_quantizer_class_dict["linear:a8wlow"] = Int8DynActLowbitWeightQuantizer
+
+    # Try loading custom op
+    try:
+        import glob
+        libs = glob.glob(f"{torchao_build_path}/cmake-out/liblowbit_op_aten.*")
+        libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs))
+        torch.ops.load_library(libs[0])
+    except Exception as e:
+        print("Failed to load custom ops : ", e)
+        print("Slow fallback kernels will be used.")
+
+except Exception as e:
+    print(f"Failed to use torchao_experimental kernels: {e}")
diff --git a/runner/aoti.cmake b/runner/aoti.cmake
index 156e9bcce..24ec6e505 100644
--- a/runner/aoti.cmake
+++ b/runner/aoti.cmake
@@ -28,3 +28,6 @@ if(Torch_FOUND)
   target_link_libraries(aoti_run "${TORCH_LIBRARIES}" m)
   set_property(TARGET aoti_run PROPERTY CXX_STANDARD 17)
 endif()
+
+
+target_link_libraries(aoti_run "${TORCHCHAT_ROOT}/torchao-build/cmake-out/liblowbit_op_aten${CMAKE_SHARED_LIBRARY_SUFFIX}")
diff --git a/runner/et.cmake b/runner/et.cmake
index 7fc16b1f2..6eeb8f59b 100644
--- a/runner/et.cmake
+++ b/runner/et.cmake
@@ -129,3 +129,5 @@ if(executorch_FOUND)
 else()
   MESSAGE(WARNING "ExecuTorch package not found")
 endif()
+
+target_link_libraries(et_run PRIVATE "${TORCHCHAT_ROOT}/torchao-build/cmake-out/liblowbit_op_executorch${CMAKE_SHARED_LIBRARY_SUFFIX}")
diff --git a/scripts/build_native.sh b/scripts/build_native.sh
index 6ceea0aee..7e7574aed 100755
--- a/scripts/build_native.sh
+++ b/scripts/build_native.sh
@@ -60,6 +60,10 @@ if [ -z "${ET_BUILD_DIR}" ]; then
   ET_BUILD_DIR="et-build"
 fi
 
+if [ -z "${TORCHAO_BUILD_DIR}" ]; then
+  TORCHAO_BUILD_DIR="torchao-build"
+fi
+
 source "$TORCHCHAT_ROOT/scripts/install_utils.sh"
 
 pushd ${TORCHCHAT_ROOT}
@@ -70,6 +74,10 @@ if [[ "$TARGET" == "et" ]]; then
   install_pip_dependencies
   clone_executorch
   install_executorch_libs false
+
+  EXECUTORCH_INCLUDE_DIRS=${TORCHCHAT_ROOT}/et-build/src
+  EXECUTORCH_LIBRARIES=${TORCHCHAT_ROOT}/et-build/install/lib/libexecutorch_no_prim_ops.a
+  install_torchao_custom_executorch_ops
 fi
 popd
 
diff --git a/scripts/build_torchao_custom_ops.sh b/scripts/build_torchao_custom_ops.sh
new file mode 100644
index 000000000..cf0626afb
--- /dev/null
+++ b/scripts/build_torchao_custom_ops.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+if [ -z "${TORCHCHAT_ROOT}" ]; then
+  # Get the absolute path of the current script
+  SCRIPT_PATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
+  # Get the absolute path of the parent directory
+  TORCHCHAT_ROOT="$(dirname "$SCRIPT_PATH")"
+fi
+
+if [ -z "${TORCHAO_BUILD_DIR}" ]; then
+  TORCHAO_BUILD_DIR="torchao-build"
+fi
+
+source "$TORCHCHAT_ROOT/scripts/install_utils.sh"
+
+find_cmake_prefix_path
+clone_torchao
+install_torchao_custom_aten_ops
diff --git a/scripts/install_utils.sh b/scripts/install_utils.sh
index 3b3ad4926..a7badf720 100644
--- a/scripts/install_utils.sh
+++ b/scripts/install_utils.sh
@@ -124,3 +124,54 @@
 install_executorch_libs() {
   install_executorch_python_libs $1
 }
+
+clone_torchao() {
+  echo "Cloning torchao to ${TORCHCHAT_ROOT}/${TORCHAO_BUILD_DIR}/src"
+  rm -rf ${TORCHCHAT_ROOT}/${TORCHAO_BUILD_DIR}/src
+  mkdir -p ${TORCHCHAT_ROOT}/${TORCHAO_BUILD_DIR}/src
+  pushd ${TORCHCHAT_ROOT}/${TORCHAO_BUILD_DIR}/src
+  echo $PWD
+
+  git clone https://github.com/pytorch/ao.git
+  pushd ao
+  git checkout $(cat ${TORCHCHAT_ROOT}/.pins/torchao-pin.txt)
+  popd
+
+  popd
+}
+
+install_torchao_custom_aten_ops() {
+  echo "Installing custom torchao ops"
+  pushd ${TORCHCHAT_ROOT}/${TORCHAO_BUILD_DIR}/src/ao/torchao/experimental/kernels/cpu/linear/examples/torch_custom_op
+  export TORCHAO_INCLUDE_DIRS=${TORCHCHAT_ROOT}/${TORCHAO_BUILD_DIR}/src/ao
+
+  if [ "${CMAKE_OUT_DIR}" == "" ]; then
+    CMAKE_OUT_DIR="${TORCHCHAT_ROOT}/${TORCHAO_BUILD_DIR}/cmake-out"
+  fi
+
+  cmake -DTORCHAO_INCLUDE_DIRS=${TORCHAO_INCLUDE_DIRS} \
+    -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \
+    -DPLATFORM="ATEN" \
+    -S . \
+    -B ${CMAKE_OUT_DIR} -G Ninja
+  cmake --build ${CMAKE_OUT_DIR}
+}
+
+install_torchao_custom_executorch_ops() {
+  echo "Installing custom torchao ops"
+  pushd ${TORCHCHAT_ROOT}/${TORCHAO_BUILD_DIR}/src/ao/torchao/experimental/kernels/cpu/linear/examples/torch_custom_op
+  export TORCHAO_INCLUDE_DIRS=${TORCHCHAT_ROOT}/${TORCHAO_BUILD_DIR}/src/ao
+
+  if [ "${CMAKE_OUT_DIR}" == "" ]; then
+    CMAKE_OUT_DIR="${TORCHCHAT_ROOT}/${TORCHAO_BUILD_DIR}/cmake-out"
+  fi
+
+  cmake -DTORCHAO_INCLUDE_DIRS=${TORCHAO_INCLUDE_DIRS} \
+    -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \
+    -DEXECUTORCH_INCLUDE_DIRS=${EXECUTORCH_INCLUDE_DIRS} \
+    -DEXECUTORCH_LIBRARIES=${EXECUTORCH_LIBRARIES} \
+    -DPLATFORM="EXECUTORCH" \
+    -S . \
+    -B ${CMAKE_OUT_DIR} -G Ninja
+  cmake --build ${CMAKE_OUT_DIR}
+}