diff --git a/graph_net/test/ast_graph_variable_rename_test.sh b/graph_net/test/ast_graph_variable_rename_test.sh
index 08990a5ae..5667837b1 100755
--- a/graph_net/test/ast_graph_variable_rename_test.sh
+++ b/graph_net/test/ast_graph_variable_rename_test.sh
@@ -28,7 +28,7 @@ EOF
 ) \
 2>&1 | tee "$RENAMED_PATH/graph_rename.log"
 
-python3 -m graph_net.torch.test_compiler \
+python3 -m graph_net_bench.torch.test_compiler \
     --model-path-prefix $GRAPH_NET_ROOT \
     --allow-list $model_list \
     --compiler graph_variable_renamer_validator \
diff --git a/graph_net/test/fx_graph_module_unserialize_test.sh b/graph_net/test/fx_graph_module_unserialize_test.sh
index 41a21aa92..171f3c625 100755
--- a/graph_net/test/fx_graph_module_unserialize_test.sh
+++ b/graph_net/test/fx_graph_module_unserialize_test.sh
@@ -55,7 +55,7 @@ EOF
 )
 
 
-python3 -m graph_net.torch.test_compiler \
+python3 -m graph_net_bench.torch.test_compiler \
    --model-path-prefix $GRAPH_NET_ROOT \
    --allow-list $model_list \
    --compiler range_decomposer_validator \
diff --git a/graph_net/test/graph_variable_rename_test.sh b/graph_net/test/graph_variable_rename_test.sh
index b58f34b88..1ceaa33bc 100755
--- a/graph_net/test/graph_variable_rename_test.sh
+++ b/graph_net/test/graph_variable_rename_test.sh
@@ -28,7 +28,7 @@ EOF
 ) \
 2>&1 | tee "$RENAMED_PATH/graph_rename.log"
 
-python3 -m graph_net.torch.test_compiler \
+python3 -m graph_net_bench.torch.test_compiler \
    --model-path-prefix $GRAPH_NET_ROOT \
    --allow-list $model_list \
    --compiler graph_variable_renamer_validator \
diff --git a/graph_net/test/typical_sequence_decomposer_test.sh b/graph_net/test/typical_sequence_decomposer_test.sh
index 59810756e..5f9b8c4a1 100755
--- a/graph_net/test/typical_sequence_decomposer_test.sh
+++ b/graph_net/test/typical_sequence_decomposer_test.sh
@@ -82,7 +82,7 @@ EOF
 )
 
 
-python3 -m graph_net.torch.test_compiler \
+python3 -m graph_net_bench.torch.test_compiler \
    --model-path-prefix $GRAPH_NET_ROOT \
    --allow-list $model_list \
    --compiler range_decomposer_validator \
diff --git a/graph_net/torch/backend/graph_compiler_backend.py b/graph_net/torch/backend/graph_compiler_backend.py
deleted file mode 100644
index 44fb9cc44..000000000
--- a/graph_net/torch/backend/graph_compiler_backend.py
+++ /dev/null
@@ -1,6 +0,0 @@
-class GraphCompilerBackend:
-    def __call__(self, model):
-        raise NotImplementedError()
-
-    def synchronize(self):
-        raise NotImplementedError()
diff --git a/graph_net/torch/backend/nope_backend.py b/graph_net/torch/backend/nope_backend.py
deleted file mode 100644
index 688fd8e12..000000000
--- a/graph_net/torch/backend/nope_backend.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import torch
-from .graph_compiler_backend import GraphCompilerBackend
-
-
-class NopeBackend(GraphCompilerBackend):
-    def __call__(self, model):
-        return model
-
-    def synchronize(self):
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
diff --git a/graph_net/torch/fx_graph_module_util.py b/graph_net/torch/fx_graph_module_util.py
index 3ad882f22..22baa34e1 100644
--- a/graph_net/torch/fx_graph_module_util.py
+++ b/graph_net/torch/fx_graph_module_util.py
@@ -1,5 +1,6 @@
 import os
 import inspect
+from graph_net.torch.utils import get_named_tensors
 from graph_net.tensor_meta import TensorMeta
 from graph_net.imp_util import load_module
 from dataclasses import asdict
@@ -38,8 +39,6 @@ def _get_tensor_metas(model_path):
 
 def _create_inputs_by_metas(module, tensor_metas, use_dummy_inputs):
     tensor_meta_attrs_list = [asdict(tensor_meta) for tensor_meta in tensor_metas]
-    from graph_net.torch.utils import get_named_tensors
-
     named_tensors = get_named_tensors(tensor_meta_attrs_list, use_dummy_inputs)
     name2tensor = {k: v for k, v in named_tensors}
     return tuple(
diff --git a/graph_net/torch/static_to_dynamic.py b/graph_net/torch/static_to_dynamic.py
index 54a4de3b7..eb4822a72 100644
--- a/graph_net/torch/static_to_dynamic.py
+++ b/graph_net/torch/static_to_dynamic.py
@@ -1,9 +1,11 @@
 import traceback
 import logging
 import torch
-from graph_net.torch.utils import get_named_tensors
+from graph_net.torch.utils import (
+    get_named_tensors,
+    apply_templates,
+)
 from torch.fx.passes.shape_prop import ShapeProp
-from graph_net.torch.utils import apply_templates
 from pathlib import Path
 import inspect
 from graph_net.torch.fx_graph_parse_util import parse_sole_graph_module
diff --git a/graph_net/torch/test_compiler.py b/graph_net/torch/test_compiler.py
deleted file mode 100755
index 5be2f5b61..000000000
--- a/graph_net/torch/test_compiler.py
+++ /dev/null
@@ -1,542 +0,0 @@
-from . import utils
-import subprocess
-import argparse
-import importlib.util
-import torch
-from pathlib import Path
-from typing import Type
-import sys
-import os
-import os.path
-import traceback
-import json
-import random
-import numpy as np
-import platform
-import base64
-from graph_net.torch.backend.graph_compiler_backend import GraphCompilerBackend
-from graph_net.torch.backend.tvm_backend import TvmBackend
-from graph_net.torch.backend.xla_backend import XlaBackend
-from graph_net.torch.backend.inductor_backend import InductorBackend
-from graph_net.torch.backend.tensorrt_backend import TensorRTBackend
-from graph_net.torch.backend.blade_disc_backend import BladeDISCBackend
-from graph_net.torch.backend.nope_backend import NopeBackend
-from graph_net.torch.backend.unstable_to_stable_backend import UnstableToStableBackend
-from graph_net.torch.backend.range_decomposer_validator_backend import (
-    RangeDecomposerValidatorBackend,
-)
-from graph_net.torch.backend.graph_variable_renamer_validator_backend import (
-    GraphVariableRenamerValidatorBackend,
-)
-from graph_net_bench import test_compiler_util
-from graph_net import model_path_util
-from graph_net_bench import path_utils
-
-
-registry_backend = {
-    "tvm": TvmBackend(),
-    "xla": XlaBackend(),
-    "inductor": InductorBackend(),
-    "tensorrt": TensorRTBackend(),
-    "bladedisc": BladeDISCBackend(),
-    "nope": NopeBackend(),
-    "unstable_to_stable": UnstableToStableBackend(),
-    "range_decomposer_validator": RangeDecomposerValidatorBackend(),
-    "graph_variable_renamer_validator": GraphVariableRenamerValidatorBackend(),
-}
-
-
-def set_seed(random_seed):
-    random.seed(random_seed)
-    np.random.seed(random_seed)
-    torch.manual_seed(random_seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(random_seed)
-        torch.cuda.manual_seed_all(random_seed)
-
-
-def get_hardward_name(args):
-    hardware_name = "unknown"
-    if "cuda" in args.device:
-        hardware_name = torch.cuda.get_device_name(args.device)
-    elif args.device == "cpu":
-        hardware_name = platform.processor()
-    return hardware_name
-
-
-def get_compile_framework_version(args):
-    if args.compiler in ["inductor", "nope", "unstable_to_stable"]:
-        return torch.__version__
-    elif args.compiler in ["tvm", "xla", "tensorrt", "bladedisc"]:
-        # Assuming compiler object has a version attribute
-        return f"{args.compiler.capitalize()} {args.compiler.version}"
-    return "unknown"
-
-
-def load_class_from_file(
-    args: argparse.Namespace, class_name: str, device: str
-) -> Type[torch.nn.Module]:
= f"{args.model_path}/model.py" - file = Path(file_path).resolve() - module_name = file.stem - - with open(file_path, "r", encoding="utf-8") as f: - model_code = f.read() - model_code = utils.modify_code_by_device(model_code, device) - spec = importlib.util.spec_from_loader(module_name, loader=None) - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - compiled_code = compile(model_code, filename=file, mode="exec") - exec(compiled_code, module.__dict__) - - model_class = getattr(module, class_name, None) - setattr(model_class, "__graph_net_file_path__", file_path) - setattr(model_class, "__graph_net_device__", device) - return model_class - - -def convert_to_dict(config_str): - if config_str is None or config_str == "None": - return {} - config_str = base64.b64decode(config_str).decode("utf-8") - config = json.loads(config_str) - assert isinstance(config, dict), f"config should be a dict. {config_str=}" - return config - - -def get_compiler_backend(args) -> GraphCompilerBackend: - assert args.compiler in registry_backend, f"Unknown compiler: {args.compiler}" - backend = registry_backend[args.compiler] - if args.config is not None: - backend.config = convert_to_dict(args.config) - return backend - - -def get_model(args): - device = "xla" if args.compiler == "xla" else args.device - - # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') - model_class = load_class_from_file(args, class_name="GraphModule", device=device) - model = model_class().to(torch.device(args.device)) - return model - - -def get_input_dict(args): - inputs_params = utils.load_converted_from_text(f"{args.model_path}") - params = inputs_params["weight_info"] - for tensor_meta in params.values(): - if "device" in tensor_meta["info"]: - tensor_meta["info"]["device"] = args.device - return { - k: utils.replay_tensor(v).to(torch.device(args.device)) - for k, v in params.items() - } - - -def measure_performance(model_call, args, compiler): - stats = {} - outs = model_call() - - # Warmup runs - for _ in range(args.warmup): - model_call() - compiler.synchronize() - - hardware_name = get_hardward_name(args) - print( - f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}", - file=sys.stderr, - flush=True, - ) - - if "cuda" in args.device: - """ - Acknowledgement: We evaluate the performance on both end-to-end and GPU-only timings, - With reference to methods only based on CUDA events from KernelBench in https://github.com/ScalingIntelligence/KernelBench - """ - - e2e_times = [] - gpu_times = [] - - for i in range(args.trials): - # End-to-end timing (naive_timer) - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - # GPU-only timing (CUDA Events) - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - start_event.record() - - model_call() - - end_event.record() - compiler.synchronize() - - gpu_time_ms = start_event.elapsed_time(end_event) - e2e_times.append(duration_box.value) - gpu_times.append(gpu_time_ms) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", - file=sys.stderr, - flush=True, - ) - - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) - - else: # CPU or other devices - e2e_times = [] - for i in range(args.trials): - duration_box = 
test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - model_call() - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", - file=sys.stderr, - flush=True, - ) - e2e_times.append(duration_box.value) - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - - return outs, stats - - -def test_single_model(args): - compiler = get_compiler_backend(args) - input_dict = get_input_dict(args) - model = get_model(args) - model_path = os.path.normpath(args.model_path) - test_compiler_util.print_with_log_prompt( - "[Processing]", model_path, args.log_prompt - ) - test_compiler_util.print_basic_config( - args, get_hardward_name(args), get_compile_framework_version(args) - ) - - runtime_seed = 1024 - eager_failure = False - expected_out = None - eager_time_stats = {} - - try: - - def eager_model_call(): - return model(**input_dict) - - expected_out, eager_time_stats = measure_performance( - eager_model_call, args, compiler - ) - - torch.manual_seed(runtime_seed) - if not isinstance(expected_out, tuple): - expected_out = (expected_out,) - except (TypeError, RuntimeError) as e: - print(f"Eager model execution failed: {str(e)}", file=sys.stderr) - eager_failure = True - - compiled_failure = False - compiled_model = None - compiled_time_stats = {} - - try: - compiled_model = compiler(model) - torch.manual_seed(runtime_seed) - - def compiled_model_call(): - return compiled_model(**input_dict) - - compiled_out, compiled_time_stats = measure_performance( - compiled_model_call, args, compiler - ) - - if not isinstance(compiled_out, tuple): - compiled_out = (compiled_out,) - if args.compiler == "xla": - compiled_out = tuple(item.to("cpu").to("cuda") for item in compiled_out) - except (TypeError, RuntimeError) as e: - print(f"Compiled model execution failed: {str(e)}", file=sys.stderr) - compiled_failure = True - print("\n--- Full Traceback ---") - traceback.print_exc() - print(f"debug-model-execution {type(e).__name__} {args.model_path}", flush=True) - except Exception as e: - compiled_failure = True - print("\n--- Full Traceback ---") - traceback.print_exc() - print(f"debug-model-execution {type(e).__name__} {args.model_path}", flush=True) - - if eager_failure: - print(f"{args.log_prompt} [Result] status: failed", file=sys.stderr, flush=True) - print( - f"{args.log_prompt} [Fail due to eager model execution error.]", - file=sys.stderr, - flush=True, - ) - elif compiled_failure: - print(f"{args.log_prompt} [Result] status: failed", file=sys.stderr, flush=True) - print( - f"{args.log_prompt} [Fail due to compiled model execution error.]", - file=sys.stderr, - flush=True, - ) - else: - compare_correctness(expected_out, compiled_out, args) - - print( - f"{args.log_prompt} [Result] status: success", file=sys.stderr, flush=True - ) - - test_compiler_util.print_times_and_speedup( - args, eager_time_stats, compiled_time_stats - ) - - -def print_and_store_cmp(key, cmp_func, args, expected_out, compiled_out, **kwargs): - cmp_ret = cmp_func(expected_out, compiled_out, **kwargs) - print( - f"{args.log_prompt} [Correctness]{key}: {cmp_ret}", - file=sys.stderr, - flush=True, - ) - return cmp_ret - - -def compare_correctness(expected_out, compiled_out, args): - eager_dtypes = [ - ( - str(x.dtype).replace("torch.", "") - if isinstance(x, torch.Tensor) - else type(x).__name__ - ) - for x in expected_out - ] - compiled_dtypes = [ - ( - str(x.dtype).replace("torch.", "") - if isinstance(x, torch.Tensor) - else type(x).__name__ - ) - for x in compiled_out - ] 
- - # datatype check - type_match = test_compiler_util.check_output_datatype( - args, eager_dtypes, compiled_dtypes - ) - - if type_match: - test_compiler_util.check_equal( - args, - expected_out, - compiled_out, - cmp_equal_func=get_cmp_equal, - ) - - test_compiler_util.check_allclose( - args, - expected_out, - compiled_out, - cmp_all_close_func=get_cmp_all_close, - cmp_max_diff_func=get_cmp_max_diff, - cmp_mean_diff_func=get_cmp_mean_diff, - ) - - -def get_cmp_equal(expected_out, compiled_out): - return " ".join( - str(int(torch.equal(a, b))) for a, b in zip(expected_out, compiled_out) - ) - - -def get_cmp_all_close(expected_out, compiled_out, atol, rtol): - return " ".join( - str(int(torch.allclose(a, b, atol=atol, rtol=rtol))) - for a, b in zip(expected_out, compiled_out) - ) - - -def get_cmp_max_diff(expected_out, compiled_out): - return " ".join( - # Transform to float to handle LongTensor output of some models, which cannnot be processed with torch.max(). - str(torch.max(torch.abs(a.float() - b.float())).item()) - for a, b in zip(expected_out, compiled_out) - ) - - -def get_cmp_mean_diff(expected_out, compiled_out): - return " ".join( - # To handle LongTensor - str(torch.mean(torch.abs(a.float() - b.float())).item()) - for a, b in zip(expected_out, compiled_out) - ) - - -def get_cmp_diff_count(expected_out, compiled_out, atol, rtol): - results = [] - for a, b in zip(expected_out, compiled_out): - # To handle LongTensor - if a.is_floating_point() and b.is_floating_point(): - diff_count = torch.sum(~torch.isclose(a, b, atol=atol, rtol=rtol)).item() - else: - diff_count = torch.sum(a != b).item() - results.append(str(diff_count)) - return " ".join(results) - - -def test_multi_models(args): - test_samples = model_path_util.get_allow_samples(args.allow_list) - - sample_idx = 0 - failed_samples = [] - module_name = os.path.splitext(os.path.basename(__file__))[0] - for model_path in path_utils.get_recursively_model_path(args.model_path): - if test_samples is None or os.path.abspath(model_path) in test_samples: - print( - f"[{sample_idx}] {module_name}, model_path: {model_path}", - file=sys.stderr, - flush=True, - ) - cmd = " ".join( - [ - sys.executable, - f"-m graph_net.torch.{module_name}", - f"--model-path {model_path}", - f"--compiler {args.compiler}", - f"--device {args.device}", - f"--warmup {args.warmup}", - f"--trials {args.trials}", - f"--log-prompt {args.log_prompt}", - f"--config {args.config}", - ] - ) - try: - process = subprocess.Popen(cmd, shell=True) - cmd_ret = process.wait() - except KeyboardInterrupt: - print("KeyboardInterrupt") - sys.exit(1) - except Exception: - print("\n--- Full Traceback ---") - traceback.print_exc() - if cmd_ret != 0: - failed_samples.append(model_path) - sample_idx += 1 - - print( - f"Totally {sample_idx} verified samples, failed {len(failed_samples)} samples.", - file=sys.stderr, - flush=True, - ) - for model_path in failed_samples: - print(f"- {model_path}", file=sys.stderr, flush=True) - - -def test_multi_models_with_prefix(args): - assert os.path.isdir(args.model_path_prefix) - assert os.path.isfile(args.allow_list) - test_samples = model_path_util.get_allow_samples(args.allow_list) - py_module_name = os.path.splitext(os.path.basename(__file__))[0] - for rel_model_path in test_samples: - model_path = os.path.join(args.model_path_prefix, rel_model_path) - if not os.path.exists(model_path): - continue - if not os.path.exists(os.path.join(model_path, "model.py")): - continue - cmd = " ".join( - [ - sys.executable, - f"-m 
graph_net.torch.{py_module_name}", - f"--model-path {model_path}", - f"--compiler {args.compiler}", - f"--device {args.device}", - f"--warmup {args.warmup}", - f"--trials {args.trials}", - f"--log-prompt {args.log_prompt}", - f"--config {args.config}", - ] - ) - try: - process = subprocess.Popen(cmd, shell=True) - process.wait() - except KeyboardInterrupt: - print("KeyboardInterrupt") - sys.exit(1) - except Exception: - print("\n--- Full Traceback ---") - traceback.print_exc() - - -def main(args): - if args.model_path_prefix is not None: - test_multi_models_with_prefix(args) - return - assert os.path.isdir(args.model_path) - - initalize_seed = 123 - set_seed(random_seed=initalize_seed) - - if path_utils.is_single_model_dir(args.model_path): - test_single_model(args) - else: - test_multi_models(args) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Test compiler performance.") - parser.add_argument( - "--model-path", - type=str, - required=False, - default=None, - help="Path to model file(s), each subdirectory containing graph_net.json will be regarded as a model", - ) - parser.add_argument( - "--compiler", - type=str, - required=False, - default="inductor", - help="Path to customized compiler python file", - ) - parser.add_argument( - "--device", - type=str, - required=False, - default="cuda", - help="Device for testing the compiler (e.g., 'cpu' or 'cuda')", - ) - parser.add_argument( - "--warmup", type=int, required=False, default=3, help="Number of warmup steps" - ) - parser.add_argument( - "--trials", type=int, required=False, default=5, help="Number of timing trials" - ) - parser.add_argument( - "--log-prompt", - type=str, - required=False, - default="graph-net-test-compiler-log", - help="Log prompt for performance log filtering.", - ) - parser.add_argument( - "--allow-list", - type=str, - required=False, - default=None, - help="Path to samples list, each line contains a sample path", - ) - parser.add_argument( - "--model-path-prefix", - type=str, - required=False, - default=None, - help="Prefix path to model path list", - ) - parser.add_argument( - "--config", - type=str, - required=False, - default=None, - help="base64 encode configuration json.", - ) - args = parser.parse_args() - main(args=args) diff --git a/graph_net/torch/test_reference_device.py b/graph_net/torch/test_reference_device.py index a6fe7e9e0..f022d2ba5 100644 --- a/graph_net/torch/test_reference_device.py +++ b/graph_net/torch/test_reference_device.py @@ -10,7 +10,7 @@ from graph_net_bench import path_utils from graph_net_bench import test_compiler_util from graph_net import model_path_util -from graph_net.torch import test_compiler +from graph_net_bench.torch import test_compiler def get_reference_log_path(reference_dir, model_path): diff --git a/graph_net/torch/test_target_device.py b/graph_net/torch/test_target_device.py index 3e0a09daa..ec2085a32 100644 --- a/graph_net/torch/test_target_device.py +++ b/graph_net/torch/test_target_device.py @@ -8,7 +8,7 @@ from graph_net_bench import path_utils from graph_net_bench import test_compiler_util from graph_net import model_path_util -from graph_net.torch import test_compiler, test_reference_device +from graph_net_bench.torch import test_compiler, test_reference_device def parse_config_from_reference_log(log_path): diff --git a/graph_net/torch/utils.py b/graph_net/torch/utils.py old mode 100755 new mode 100644 index 62837fc8c..b397a9b6d --- a/graph_net/torch/utils.py +++ b/graph_net/torch/utils.py @@ -221,21 +221,6 @@ def 
@@ -221,21 +221,6 @@ def load_converted_from_text(file_path):
     }
 
 
-def convert_tensor_meta_attrs_list_to_named_tensors(tensor_meta_attrs_list):
-    tensors_wrappers = convert_tensor_meta_attrs_list_to_tensors_wrappers(
-        tensor_meta_attrs_list
-    )
-    ret = []
-    for i, tensors_wrapper in enumerate(tensors_wrappers):
-        name = tensors_wrapper["name"]
-        # shape = tensors_wrapper["info"]['shape']
-        # logging.warning(f"before replay_tensor {i=} {shape=}")
-        tensor = replay_tensor(tensors_wrapper)
-        # logging.warning(f"after replay_tensor {i=} {shape=}")
-        ret.append((name, tensor))
-    return ret
-
-
 def get_named_tensors(tensor_meta_attrs_list, use_dummy_inputs):
     tensors_wrappers = convert_tensor_meta_attrs_list_to_tensors_wrappers(
         tensor_meta_attrs_list
@@ -324,10 +309,6 @@ def _get_classes(file_path):
     yield from inspect.getmembers(unnamed, inspect.isclass)
 
 
-def extract_dynamic_shapes(example_inputs):
-    pass
-
-
 def replay_tensor(info):
     device = info["info"]["device"]
     dtype = info["info"]["dtype"]
diff --git a/graph_net/torch/backend/blade_disc_backend.py b/graph_net_bench/torch/backend/blade_disc_backend.py
similarity index 94%
rename from graph_net/torch/backend/blade_disc_backend.py
rename to graph_net_bench/torch/backend/blade_disc_backend.py
index 5af7b8490..42803a12c 100644
--- a/graph_net/torch/backend/blade_disc_backend.py
+++ b/graph_net_bench/torch/backend/blade_disc_backend.py
@@ -28,6 +28,9 @@ def compile(self, module, *args, **kwargs):
 
 
 class BladeDISCBackend(GraphCompilerBackend):
+    def __init__(self, config):
+        super().__init__(config)
+
     def __call__(self, model):
         return BladeDISCCompiledModule(model)
 
diff --git a/graph_net/torch/backend/graph_variable_renamer_validator_backend.py b/graph_net_bench/torch/backend/graph_variable_renamer_validator_backend.py
similarity index 95%
rename from graph_net/torch/backend/graph_variable_renamer_validator_backend.py
rename to graph_net_bench/torch/backend/graph_variable_renamer_validator_backend.py
index 902dd761c..ab2018b7d 100755
--- a/graph_net/torch/backend/graph_variable_renamer_validator_backend.py
+++ b/graph_net_bench/torch/backend/graph_variable_renamer_validator_backend.py
@@ -4,6 +4,7 @@
 from graph_net.tensor_meta import TensorMeta
 import os
 import importlib.util
+from .graph_compiler_backend import GraphCompilerBackend
 
 
 class RenamedModelAdapter(torch.nn.Module):
@@ -27,7 +28,10 @@ def _convert_by_name_mapping(self, kwargs):
         return new_kwargs
 
 
-class GraphVariableRenamerValidatorBackend:
+class GraphVariableRenamerValidatorBackend(GraphCompilerBackend):
+    def __init__(self, config):
+        super().__init__(config)
+
     def _get_rename_mapping(self, model_dir: Path):
         mapping = {}
         for meta_file in ["input_meta.py", "weight_meta.py"]:
diff --git a/graph_net/torch/backend/inductor_backend.py b/graph_net_bench/torch/backend/inductor_backend.py
similarity index 82%
rename from graph_net/torch/backend/inductor_backend.py
rename to graph_net_bench/torch/backend/inductor_backend.py
index e39a9d08f..5200e3032 100644
--- a/graph_net/torch/backend/inductor_backend.py
+++ b/graph_net_bench/torch/backend/inductor_backend.py
@@ -3,6 +3,9 @@
 
 
 class InductorBackend(GraphCompilerBackend):
+    def __init__(self, config):
+        super().__init__(config)
+
     def __call__(self, model):
         return torch.compile(model, backend="inductor")
 
diff --git a/graph_net/torch/backend/range_decomposer_validator_backend.py b/graph_net_bench/torch/backend/range_decomposer_validator_backend.py
similarity index 95%
rename from graph_net/torch/backend/range_decomposer_validator_backend.py
rename to graph_net_bench/torch/backend/range_decomposer_validator_backend.py
index 375c562c9..c260f47b8 100644
--- a/graph_net/torch/backend/range_decomposer_validator_backend.py
+++ b/graph_net_bench/torch/backend/range_decomposer_validator_backend.py
@@ -4,6 +4,7 @@
 import os
 import importlib.util
 from typing import List
+from .graph_compiler_backend import GraphCompilerBackend
 
 
 class ComposedModel(nn.Module):
@@ -37,7 +38,10 @@ def _convert_inputs(self, subgraph, input_kwargs):
         }
 
 
-class RangeDecomposerValidatorBackend:
+class RangeDecomposerValidatorBackend(GraphCompilerBackend):
+    def __init__(self, config):
+        super().__init__(config)
+
     def _load_model_instance(self, path: str, device: str) -> torch.nn.Module:
         class_name = "GraphModule"
         model_file = os.path.join(path, "model.py")
diff --git a/graph_net/torch/backend/tensorrt_backend.py b/graph_net_bench/torch/backend/tensorrt_backend.py
similarity index 88%
rename from graph_net/torch/backend/tensorrt_backend.py
rename to graph_net_bench/torch/backend/tensorrt_backend.py
index 5fa8524b3..e0490c48e 100644
--- a/graph_net/torch/backend/tensorrt_backend.py
+++ b/graph_net_bench/torch/backend/tensorrt_backend.py
@@ -8,6 +8,9 @@
 
 
 class TensorRTBackend(GraphCompilerBackend):
+    def __init__(self, config):
+        super().__init__(config)
+
     def __call__(self, model):
         if torch_tensorrt is None:
             raise ImportError("torch_tensorrt not installed")
diff --git a/graph_net/torch/backend/tvm_backend.py b/graph_net_bench/torch/backend/tvm_backend.py
similarity index 96%
rename from graph_net/torch/backend/tvm_backend.py
rename to graph_net_bench/torch/backend/tvm_backend.py
index 4577bfcae..435999f18 100644
--- a/graph_net/torch/backend/tvm_backend.py
+++ b/graph_net_bench/torch/backend/tvm_backend.py
@@ -1,6 +1,5 @@
 import torch
 import inspect
-import numpy as np
 from .graph_compiler_backend import GraphCompilerBackend
 
 try:
@@ -59,6 +58,9 @@ def compile(self, module, **kwargs):
 
 
 class TvmBackend(GraphCompilerBackend):
+    def __init__(self, config):
+        super().__init__(config)
+
     def __call__(self, model, **kwargs):
         if torch.cuda.is_available():
             device = "cuda"
@@ -75,5 +77,5 @@ def version(self):
             from importlib.metadata import version
 
             return version("tvm")
-        except:
+        except ImportError:
             return "unknown"
diff --git a/graph_net/torch/backend/unstable_to_stable_backend.py b/graph_net_bench/torch/backend/unstable_to_stable_backend.py
similarity index 99%
rename from graph_net/torch/backend/unstable_to_stable_backend.py
rename to graph_net_bench/torch/backend/unstable_to_stable_backend.py
index c85497ab5..bca0792c7 100644
--- a/graph_net/torch/backend/unstable_to_stable_backend.py
+++ b/graph_net_bench/torch/backend/unstable_to_stable_backend.py
@@ -2,10 +2,13 @@
 import torch
 import sys
 from .graph_compiler_backend import GraphCompilerBackend
-from ..fx_graph_serialize_util import serialize_graph_module_to_str
+from graph_net.torch.fx_graph_serialize_util import serialize_graph_module_to_str
 
 
 class UnstableToStableBackend(GraphCompilerBackend):
+    def __init__(self, config):
+        super().__init__(config)
+
     def __call__(self, model):
         # Perform unstable API check before running the model
         unstable_api = os.getenv("DISALLOWED_UNSTABLE_API", "").strip()
diff --git a/graph_net/torch/backend/xla_backend.py b/graph_net_bench/torch/backend/xla_backend.py
similarity index 93%
rename from graph_net/torch/backend/xla_backend.py
rename to graph_net_bench/torch/backend/xla_backend.py
index 2c4d253b1..4793d8687 100644
--- a/graph_net/torch/backend/xla_backend.py
+++ b/graph_net_bench/torch/backend/xla_backend.py
@@ -25,6 +25,9 @@ def forward(self, **kwargs):
 
 
 class XlaBackend(GraphCompilerBackend):
+    def __init__(self, config):
+        super().__init__(config)
+
     def __call__(self, model):
         if torch_xla is None:
             raise ImportError("torch_xla not installed")
diff --git a/graph_net_bench/torch/test_compiler.py b/graph_net_bench/torch/test_compiler.py
index cf801db12..08f0ac63d 100755
--- a/graph_net_bench/torch/test_compiler.py
+++ b/graph_net_bench/torch/test_compiler.py
@@ -15,15 +15,37 @@
 import platform
 import base64
 from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend
+from graph_net_bench.torch.backend.tvm_backend import TvmBackend
+from graph_net_bench.torch.backend.xla_backend import XlaBackend
+from graph_net_bench.torch.backend.inductor_backend import InductorBackend
+from graph_net_bench.torch.backend.tensorrt_backend import TensorRTBackend
+from graph_net_bench.torch.backend.blade_disc_backend import BladeDISCBackend
 from graph_net_bench.torch.backend.nope_backend import NopeBackend
 from graph_net_bench.torch.backend.pass_mgr_backend import PassMgrBackend
+from graph_net_bench.torch.backend.unstable_to_stable_backend import (
+    UnstableToStableBackend,
+)
+from graph_net_bench.torch.backend.range_decomposer_validator_backend import (
+    RangeDecomposerValidatorBackend,
+)
+from graph_net_bench.torch.backend.graph_variable_renamer_validator_backend import (
+    GraphVariableRenamerValidatorBackend,
+)
 from graph_net_bench import test_compiler_util
 from graph_net_bench import path_utils
 
 
 compiler_backend_name2class = {
+    "tvm": TvmBackend,
+    "xla": XlaBackend,
+    "inductor": InductorBackend,
+    "tensorrt": TensorRTBackend,
+    "bladedisc": BladeDISCBackend,
     "nope": NopeBackend,
     "pass_mgr": PassMgrBackend,
+    "unstable_to_stable": UnstableToStableBackend,
+    "range_decomposer_validator": RangeDecomposerValidatorBackend,
+    "graph_variable_renamer_validator": GraphVariableRenamerValidatorBackend,
 }
diff --git a/graph_net_bench/torch/utils.py b/graph_net_bench/torch/utils.py
index 62837fc8c..c937ff4de 100755
--- a/graph_net_bench/torch/utils.py
+++ b/graph_net_bench/torch/utils.py
@@ -7,17 +7,6 @@
 kLiteralTensorSize = 64
 
 
-def apply_templates(forward_code: str) -> str:
-    tab = "    "
-    forward_code = f"\n{tab}".join(forward_code.split("\n"))
-    imports = "import torch"
-    if "device" in forward_code:
-        imports += "\n\nfrom torch import device"
-    if "inf" in forward_code:
-        imports += "\n\nfrom torch import inf"
-    return f"{imports}\n\nclass GraphModule(torch.nn.Module):\n{tab}{forward_code}"
-
-
 def get_limited_precision_float_str(value):
     if not isinstance(value, float):
         return value
@@ -28,230 +17,6 @@ def get_limited_precision_float_str(value):
     return f"{value:.3f}"
 
 
-def convert_state_and_inputs_impl(state_dict, example_inputs):
-    def tensor_info(tensor):
-        is_float = tensor.dtype.is_floating_point
-        mean = float(tensor.mean().item()) if is_float else None
-        std = None
-        if is_float:
-            if tensor.numel() <= 1:
-                std = 0.0
-            else:
-                std = float(tensor.std().item())
-        return {
-            "shape": list(tensor.shape),
-            "dtype": str(tensor.dtype),
-            "device": str(tensor.device),
-            "mean": get_limited_precision_float_str(mean),
-            "std": get_limited_precision_float_str(std),
-        }
-
-    def process_tensor(tensor):
-        if not isinstance(tensor, torch.Tensor):
-            return {"type": "unknown", "value": tensor}
-
-        info = tensor_info(tensor)
-        if tensor.dtype in [torch.int8, torch.int16, torch.int32, torch.int64]:
-            if tensor.numel() < kLiteralTensorSize:
-                return {
"type": "small_int_tensor", - "data": tensor.clone(), - "info": info, - } - else: - return { - "type": "big_int_tensor_by_range", - "min_val": tensor.min().item(), - "max_val": tensor.max().item(), - "info": info, - } - elif tensor.numel() < kLiteralTensorSize: - return {"type": "small_tensor", "data": tensor.clone(), "info": info} - else: - return {"type": "random_tensor", "info": info} - - if isinstance(example_inputs, torch.Tensor): - processed_inputs = process_tensor(example_inputs) - elif isinstance(example_inputs, (list, tuple)): - processed_inputs = [process_tensor(t) for t in example_inputs] - else: - processed_inputs = {"type": "unknown", "value": example_inputs} - - def handle_named_tensors(tensor): - if not isinstance(tensor, torch.Tensor): - return {"type": "unknown", "value": tensor} - info = tensor_info(tensor) - if tensor.dtype in [torch.int8, torch.int16, torch.int32, torch.int64]: - if tensor.numel() < kLiteralTensorSize: - return { - "info": info, - "data": tensor.clone(), - "type": "small_int_tensor", - } - else: - return { - "info": info, - "min_val": tensor.min().item(), - "max_val": tensor.max().item(), - "type": "big_int_tensor_by_range", - } - if tensor.numel() < kLiteralTensorSize: - return {"info": info, "data": tensor.clone(), "type": "small_tensor"} - else: - return {"info": info, "data": None, "type": "random_tensor"} - - processed_weights = { - key: handle_named_tensors(tensor) for key, tensor in state_dict.items() - } - - # dynamic_shapes = extract_dynamic_shapes(example_inputs) - return { - "input_info": processed_inputs, - "weight_info": processed_weights, - "dynamic_shapes": None, - } - - -def convert_state_and_inputs(state_dict, example_inputs): - return convert_state_and_inputs_impl(state_dict, example_inputs) - - -def save_constraints_text(converted, file_path): - lines = [] - if converted["dynamic_shapes"] is not None: - raise NotImplementedError("Handling constraints is not implemented yet.") - with open(file_path, "w") as f: - f.write("\n".join(lines)) - - -def save_converted_to_text(converted, file_path): - def format_data(data): - if data is None: - return "None" - elif isinstance(data, torch.Tensor): - if data.dtype.is_floating_point: - - def float_to_str(x): - if math.isinf(x): - return "float('inf')" if x > 0 else "float('-inf')" - if math.isnan(x): - return "float('nan')" - return f"{x:.6f}" - - return "[{}]".format( - ", ".join(float_to_str(x) for x in data.flatten().tolist()) - ) - else: - return "[{}]".format(", ".join(f"{x}" for x in data.flatten().tolist())) - else: - return repr(data) - - def process_tensor_info(tensor_info, name_prefix="example_input"): - tensor_type = tensor_info.get("type") - info = tensor_info.get("info", {}) - dtype = info.get("dtype", "torch.float") - shape = info.get("shape", []) - device = info.get("device", "cpu") - mean = info.get("mean", 0.0) - std = info.get("std", 1.0) - uid = f"{name_prefix}_tensor_meta_{tensor_info.get('name', '')}" - - lines = [ - (f"class {uid}:"), - (f"\tname = \"{tensor_info.get('name', '')}\""), - (f"\tshape = {shape}"), - (f'\tdtype = "{dtype}"'), - (f'\tdevice = "{device}"'), - (f"\tmean = {get_limited_precision_float_str(mean)}"), - (f"\tstd = {get_limited_precision_float_str(std)}"), - ] - if tensor_type == "big_int_tensor_by_range": - lines.append(f"\tmin_val = {tensor_info['min_val']}") - lines.append(f"\tmax_val = {tensor_info['max_val']}") - elif "data" in tensor_info: - data_list = ( - tensor_info["data"].flatten() - if isinstance(tensor_info["data"], torch.Tensor) - else 
tensor_info["data"] - ) - lines.append(f"\tdata = {format_data(data_list)}") - - lines.append("") - return lines - - input_infos = converted["input_info"] - if isinstance(input_infos, dict): - input_infos = [input_infos] - - input_lines = [] - for idx, input_info in enumerate(input_infos): - input_info["name"] = f"input_{idx}" - input_lines.extend(process_tensor_info(input_info, name_prefix="Program_input")) - - with open(f"{file_path}/input_meta.py", "w") as f: - f.write("\n".join(input_lines)) - - weight_lines = [] - for name, weight_info in converted["weight_info"].items(): - weight_info["name"] = name - weight_lines.extend( - process_tensor_info(weight_info, name_prefix="Program_weight") - ) - - with open(f"{file_path}/weight_meta.py", "w") as f: - f.write("\n".join(weight_lines)) - - -def load_model_inputs_converted_from_text(file_path): - return load_converted_from_text(file_path) - - -def load_converted_from_text(file_path): - input_info = list(convert_meta_classes_to_tensors(f"{file_path}/input_meta.py")) - - weight_info = { - data["name"]: data - for data in convert_meta_classes_to_tensors(f"{file_path}/weight_meta.py") - } - - return { - "input_info": input_info, - "weight_info": weight_info, - "dynamic_shapes": None, - } - - -def convert_tensor_meta_attrs_list_to_named_tensors(tensor_meta_attrs_list): - tensors_wrappers = convert_tensor_meta_attrs_list_to_tensors_wrappers( - tensor_meta_attrs_list - ) - ret = [] - for i, tensors_wrapper in enumerate(tensors_wrappers): - name = tensors_wrapper["name"] - # shape = tensors_wrapper["info"]['shape'] - # logging.warning(f"before replay_tensor {i=} {shape=}") - tensor = replay_tensor(tensors_wrapper) - # logging.warning(f"after replay_tensor {i=} {shape=}") - ret.append((name, tensor)) - return ret - - -def get_named_tensors(tensor_meta_attrs_list, use_dummy_inputs): - tensors_wrappers = convert_tensor_meta_attrs_list_to_tensors_wrappers( - tensor_meta_attrs_list - ) - ret = [] - for i, tensors_wrapper in enumerate(tensors_wrappers): - name = tensors_wrapper["name"] - # shape = tensors_wrapper["info"]['shape'] - if use_dummy_inputs: - tensor = get_dummy_tensor(tensors_wrapper) - else: - tensor = replay_tensor(tensors_wrapper) - ret.append((name, tensor)) - return ret - - def convert_meta_classes_to_tensors(file_path): tensor_meta_attrs_list = [ { @@ -324,8 +89,19 @@ def _get_classes(file_path): yield from inspect.getmembers(unnamed, inspect.isclass) -def extract_dynamic_shapes(example_inputs): - pass +def load_converted_from_text(file_path): + input_info = list(convert_meta_classes_to_tensors(f"{file_path}/input_meta.py")) + + weight_info = { + data["name"]: data + for data in convert_meta_classes_to_tensors(f"{file_path}/weight_meta.py") + } + + return { + "input_info": input_info, + "weight_info": weight_info, + "dynamic_shapes": None, + } def replay_tensor(info):