From e80082c4354b3d18cc1f63354f9bebf1e83117ce Mon Sep 17 00:00:00 2001 From: cangtianhuang Date: Sat, 2 Aug 2025 12:13:12 +0800 Subject: [PATCH 1/5] update --- tools/api_tracer/framework_dialect.py | 60 ++++++++++++++++++--------- tools/api_tracer/test_infer.py | 8 ++-- tools/api_tracer/test_train.py | 38 +++++++---------- 3 files changed, 59 insertions(+), 47 deletions(-) diff --git a/tools/api_tracer/framework_dialect.py b/tools/api_tracer/framework_dialect.py index b2fcb570..52b72f01 100644 --- a/tools/api_tracer/framework_dialect.py +++ b/tools/api_tracer/framework_dialect.py @@ -19,9 +19,15 @@ class TracingHook(abc.ABC): """钩子的抽象基类""" - def __init__(self, serializer: "ConfigSerializer", level: int): + def __init__( + self, + serializer: "ConfigSerializer", + level: int, + dialect: Optional["FrameworkDialect"] = None, + ): self.serializer = serializer self.level = level + self.dialect = dialect @abc.abstractmethod def install(self): @@ -40,8 +46,7 @@ def __init__( level: int, dialect: "FrameworkDialect", ): - super().__init__(serializer, level) - self.dialect = dialect + super().__init__(serializer, level, dialect) self._original_apis: Dict[str, Any] = {} self._module_cache: Dict[str, Any] = {} @@ -85,9 +90,12 @@ def wrapper(*args, **kwargs): return wrapper def install(self): + if self.dialect is None: + return api_list = self.dialect.discover_apis() + self.dialect.discover_custom_ops() - # with open(os.path.join(os.path.dirname(__file__), "trace_output", "api_list.yaml"), "w") as f: + # api_path = os.path.join(os.path.dirname(__file__), "trace_output/api_list.yaml") + # with open(api_path, "w") as f: # yaml.dump(api_list, f) print(f"[SetattrHook] Attempting to patch {len(api_list)} APIs...") @@ -153,7 +161,8 @@ def install(self): f"[SetattrHook] Patched {patched_count} APIs. Skipped {skipped_count} non-writable APIs." ) - # with open(os.path.join(os.path.dirname(__file__), "trace_output", "api_list_wrap.yaml"), "w") as f: + # api_path = os.path.join(os.path.dirname(__file__), "trace_output/api_list_wrap.yaml") + # with open(api_path, "w") as f: # yaml.dump(list(self._original_apis.keys()), f) def uninstall(self): @@ -171,9 +180,13 @@ def uninstall(self): class TorchFunctionModeTracer(torch.overrides.TorchFunctionMode): - def __init__(self, serializer: "ConfigSerializer", level: int): + def __init__( + self, serializer: "ConfigSerializer", level: int, dialect: "FrameworkDialect" + ): self.serializer = serializer self.level = level + self.disable_torch_api_list = dialect.disable_torch_api_list + self.target_apis = dialect.target_apis # skip these for duplicate property access of paddle.Tensor in SetattrHook # (SetattrHook and TorchFunctionHook are installed at the same time) @@ -209,9 +222,11 @@ def __torch_function__(self, func, types, args=(), kwargs=None): class TorchFunctionHook(TracingHook): - def __init__(self, serializer: "ConfigSerializer", level: int): - super().__init__(serializer, level) - self.tracing_mode = TorchFunctionModeTracer(serializer, level) + def __init__( + self, serializer: "ConfigSerializer", level: int, dialect: "FrameworkDialect" + ): + super().__init__(serializer, level, dialect) + self.tracing_mode = TorchFunctionModeTracer(serializer, level, dialect) def install(self): print(f"[TorchFunctionHook] Enabling __torch_function__ tracing mode...") @@ -225,9 +240,13 @@ def uninstall(self): class TorchDispatchModeTracer(TorchDispatchMode): - def __init__(self, serializer: "ConfigSerializer", level: int): + def __init__( + self, serializer: "ConfigSerializer", level: int, dialect: "FrameworkDialect" + ): self.serializer = serializer self.level = level + self.disable_torch_api_list = dialect.disable_torch_api_list + self.target_apis = dialect.target_apis def __torch_dispatch__(self, func, types, args=(), kwargs=None): kwargs = kwargs or {} @@ -240,9 +259,11 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None): class TorchDispatchHook(TracingHook): - def __init__(self, serializer: "ConfigSerializer", level: int): - super().__init__(serializer, level) - self.tracing_mode = TorchDispatchModeTracer(serializer, level) + def __init__( + self, serializer: "ConfigSerializer", level: int, dialect: "FrameworkDialect" + ): + super().__init__(serializer, level, dialect) + self.tracing_mode = TorchDispatchModeTracer(serializer, level, dialect) def install(self): print(f"[TorchDispatchHook] Enabling __torch_dispatch__ tracing mode...") @@ -394,13 +415,14 @@ class PyTorchDialect(FrameworkDialect): # recommended to skip "__call__", "__format__", - "__instancecheck__", "__iter__", "__repr__", "__str__", + "__instancecheck__", "__subclasscheck__", "__subclasshook__", - # optional to skip + "__getstate__", + "__setstate__", "__enter__", "__exit__", } @@ -414,12 +436,13 @@ class PyTorchDialect(FrameworkDialect): "torch.cuda._sanitizer._TensorsAccessed", "torch.xpu._gpu_trace.CallbackRegistry", "torch.TypedStorage", + # "torch.optim.Optimizer", # methods "torch.autograd.function._is_setup_context_defined", + "torch.distributed.reduce_op", "torch.fx.experimental.unification.multipledispatch.dispatcher.str_signature", "torch.nn.functional.handle_torch_function", "torch.nn.functional.has_torch_function_unary", - "torch.distributed.reduce_op", } def get_framework_name(self) -> str: @@ -584,10 +607,7 @@ def get_hooks(self, serializer, levels: List[int], **kwargs) -> List[TracingHook for level in levels: hook_class = hook_map.get(level) if hook_class: - if level == 0: - hooks.append(hook_class(serializer, level, self)) - else: - hooks.append(hook_class(serializer, level)) + hooks.append(hook_class(serializer, level, self)) else: raise ValueError(f"Invalid level: {level}") return hooks diff --git a/tools/api_tracer/test_infer.py b/tools/api_tracer/test_infer.py index f07fd2f8..f30de4e3 100644 --- a/tools/api_tracer/test_infer.py +++ b/tools/api_tracer/test_infer.py @@ -1,6 +1,7 @@ import json import os import sys +import traceback import yaml @@ -58,11 +59,10 @@ def run_inference_test(model_name: str): print("\n--- Generated Response ---") print(response) print("--------------------------\n") - - except Exception as e: - print(f"An error occurred during inference for {model_name}: {e}") - finally: print(f"✅ Test for {model_name} finished.") + except Exception as e: + traceback.print_exc() + print(f"❌ An error occurred during inference for {model_name}: {e}") def main(): diff --git a/tools/api_tracer/test_train.py b/tools/api_tracer/test_train.py index d652cf22..aba18142 100644 --- a/tools/api_tracer/test_train.py +++ b/tools/api_tracer/test_train.py @@ -1,4 +1,6 @@ import os +import time +import traceback os.environ["HF_HOME"] = "tools/api_tracer/.huggingface" os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" @@ -28,7 +30,7 @@ def run_training_test(model_name: str): print(f"🚀 Running training test for: {model_name})") output_path = f"tools/api_tracer/trace_output_test_train/{model_name}" - tracer = APITracer("torch", output_path=output_path, levels=[0, 1]) + tracer = APITracer("torch", output_path=output_path, levels=[0]) try: model = AutoModelForCausalLM.from_pretrained( @@ -48,10 +50,7 @@ def run_training_test(model_name: str): f.write(f"Model: {model.__class__}\n") f.write(f"Tokenizer: {tokenizer.__class__}\n") - dataset = load_dataset( - "lmsys/chatbot_arena_conversations", split="train", streaming=True - ) - dataset_sample = dataset.take(500) + dataset = load_dataset("lmsys/chatbot_arena_conversations", split="train[:500]") def preprocess_function(examples): all_texts = [] @@ -68,10 +67,11 @@ def preprocess_function(examples): all_texts.append(text_b) return tokenizer(all_texts, truncation=True, max_length=512) - tokenized_dataset = dataset_sample.map( + tokenized_dataset = dataset.map( preprocess_function, batched=True, - remove_columns=next(iter(dataset_sample)).keys(), + batch_size=100, + remove_columns=next(iter(dataset)).keys(), ) save_model_path = f"{output_path}/finetuned-arena" @@ -87,6 +87,7 @@ def preprocess_function(examples): max_steps=5, gradient_checkpointing=True, ) + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) trainer = Trainer( @@ -99,13 +100,10 @@ def preprocess_function(examples): with tracer: trainer.train() - final_model_path = f"{output_path}/finetuned-final" - trainer.save_model(final_model_path) - tokenizer.save_pretrained(final_model_path) - except Exception as e: - print(f"An error occurred during training for {model_name}: {e}") - finally: print(f"✅ Test for {model_name} finished.") + except Exception as e: + traceback.print_exc() + print(f"❌ An error occurred during training for {model_name}: {e}") def run_training_test_vision(model_name: str): @@ -114,8 +112,6 @@ def run_training_test_vision(model_name: str): tracer = APITracer("torch", output_path=output_path) try: - tracer.start() - model = AutoModelForImageTextToText.from_pretrained( model_name, torch_dtype=torch.bfloat16, @@ -222,17 +218,13 @@ def preprocess_function(examples): train_dataset=tokenized_dataset, data_collator=data_collator, ) - trainer.train() - final_model_path = f"{output_path}/finetuned-final" - trainer.save_model(final_model_path) - processor.save_pretrained(final_model_path) + with tracer: + trainer.train() - except Exception as e: - print(f"An error occurred during training for {model_name}: {e}") - finally: - tracer.stop() print(f"✅ Test for {model_name} finished.") + except Exception as e: + print(f"❌ An error occurred during training for {model_name}: {e}") def main(): From 7272fddda1d6426f68306c61446a27ddd44ef8de Mon Sep 17 00:00:00 2001 From: cangtianhuang Date: Sat, 2 Aug 2025 16:53:54 +0800 Subject: [PATCH 2/5] upgrade hooks --- tools/api_tracer/framework_dialect.py | 96 ++++++++++++++++----------- 1 file changed, 58 insertions(+), 38 deletions(-) diff --git a/tools/api_tracer/framework_dialect.py b/tools/api_tracer/framework_dialect.py index 52b72f01..cc7bba7d 100644 --- a/tools/api_tracer/framework_dialect.py +++ b/tools/api_tracer/framework_dialect.py @@ -5,7 +5,7 @@ import os import pkgutil import traceback -from functools import partial +import types from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union import torch @@ -114,29 +114,40 @@ def install(self): original_api = getattr(parent_obj, func_name) wrapper = None - if isinstance(original_api, property): - if original_api.fget and original_api.fset: - wrapped_getter = self._create_wrapper( - f"{api_name}.fget", - original_api.fget, - self.serializer, - self.level, - ) - wrapper = property( - wrapped_getter, - original_api.fset, - original_api.fdel, - original_api.__doc__, - ) + if isinstance( + original_api, + ( + types.FunctionType, + types.BuiltinFunctionType, + types.MethodType, + types.BuiltinMethodType, + ), + ): + wrapped_func = self._create_wrapper( + api_name, original_api, self.serializer, self.level + ) elif isinstance(original_api, (classmethod, staticmethod)): original_func = original_api.__func__ wrapped_func = self._create_wrapper( api_name, original_func, self.serializer, self.level ) wrapper = type(original_api)(wrapped_func) - elif callable(original_api): - wrapper = self._create_wrapper( - api_name, original_api, self.serializer, self.level + elif ( + isinstance(original_api, property) + and original_api.fget + and original_api.fset + ): + wrapped_getter = self._create_wrapper( + f"{api_name}.fget", + original_api.fget, + self.serializer, + self.level, + ) + wrapper = property( + wrapped_getter, + original_api.fset, + original_api.fdel, + original_api.__doc__, ) if wrapper: @@ -185,8 +196,8 @@ def __init__( ): self.serializer = serializer self.level = level - self.disable_torch_api_list = dialect.disable_torch_api_list - self.target_apis = dialect.target_apis + self.disable_torch_api_list = getattr(dialect, "disable_torch_api_list", False) + self.target_apis = getattr(dialect, "target_apis", []) # skip these for duplicate property access of paddle.Tensor in SetattrHook # (SetattrHook and TorchFunctionHook are installed at the same time) @@ -245,8 +256,8 @@ def __init__( ): self.serializer = serializer self.level = level - self.disable_torch_api_list = dialect.disable_torch_api_list - self.target_apis = dialect.target_apis + self.disable_torch_api_list = getattr(dialect, "disable_torch_api_list", False) + self.target_apis = getattr(dialect, "target_apis", []) def __torch_dispatch__(self, func, types, args=(), kwargs=None): kwargs = kwargs or {} @@ -436,13 +447,13 @@ class PyTorchDialect(FrameworkDialect): "torch.cuda._sanitizer._TensorsAccessed", "torch.xpu._gpu_trace.CallbackRegistry", "torch.TypedStorage", - # "torch.optim.Optimizer", # methods "torch.autograd.function._is_setup_context_defined", "torch.distributed.reduce_op", "torch.fx.experimental.unification.multipledispatch.dispatcher.str_signature", "torch.nn.functional.handle_torch_function", "torch.nn.functional.has_torch_function_unary", + "torch.optim.Optimizer.profile_hook_step", } def get_framework_name(self) -> str: @@ -503,7 +514,17 @@ def discover_apis(self) -> List[str]: continue if full_name in self.IGNORE_CLASSES_OR_METHODS: continue - if callable(obj) and not inspect.isclass(obj): + if isinstance( + obj, + ( + types.FunctionType, + types.BuiltinFunctionType, + types.MethodType, + types.BuiltinMethodType, + staticmethod, + classmethod, + ), + ): api_set.add(full_name) elif inspect.isclass(obj): # custom op class should be skip @@ -513,23 +534,22 @@ def discover_apis(self) -> List[str]: if cls_member_name in self.IGNORE_ATTRIBUTES: continue full_cls_name = f"{full_name}.{cls_member_name}" - if inspect.ismethod(cls_member) or inspect.isfunction( - cls_member + if isinstance( + cls_member, + ( + types.FunctionType, + types.BuiltinFunctionType, + types.MethodType, + types.BuiltinMethodType, + staticmethod, + classmethod, + ), ): api_set.add(full_cls_name) - elif isinstance(cls_member, (staticmethod, classmethod)): - api_set.add(full_cls_name) - elif isinstance(cls_member, property): - if cls_member.fget and cls_member.fset: - api_set.add(full_cls_name) - elif isinstance(cls_member, partial): - if hasattr( - cls_member.func, "__module__" - ) and cls_member.func.__module__.startswith("torch"): - api_set.add(full_cls_name) elif ( - hasattr(cls_member, "__isabstractmethod__") - and cls_member.__isabstractmethod__ + isinstance(cls_member, property) + and cls_member.fget + and cls_member.fset ): api_set.add(full_cls_name) except Exception as e: From 4ea3617537b343ec658def612652595b930e6377 Mon Sep 17 00:00:00 2001 From: cangtianhuang Date: Sat, 2 Aug 2025 16:54:04 +0800 Subject: [PATCH 3/5] update test --- tools/api_tracer/test_train.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tools/api_tracer/test_train.py b/tools/api_tracer/test_train.py index aba18142..0e6c170c 100644 --- a/tools/api_tracer/test_train.py +++ b/tools/api_tracer/test_train.py @@ -30,7 +30,7 @@ def run_training_test(model_name: str): print(f"🚀 Running training test for: {model_name})") output_path = f"tools/api_tracer/trace_output_test_train/{model_name}" - tracer = APITracer("torch", output_path=output_path, levels=[0]) + tracer = APITracer("torch", output_path=output_path, levels=[0, 1]) try: model = AutoModelForCausalLM.from_pretrained( @@ -74,14 +74,12 @@ def preprocess_function(examples): remove_columns=next(iter(dataset)).keys(), ) - save_model_path = f"{output_path}/finetuned-arena" training_args = TrainingArguments( - output_dir=save_model_path, per_device_train_batch_size=1, gradient_accumulation_steps=16, learning_rate=2e-5, logging_steps=20, - save_steps=5, + save_strategy="no", bf16=True, report_to="none", max_steps=5, @@ -194,14 +192,12 @@ def preprocess_function(examples): batch_size=4, ) - save_model_path = f"{output_path}/finetuned-turingeye" training_args = TrainingArguments( - output_dir=save_model_path, per_device_train_batch_size=1, gradient_accumulation_steps=4, learning_rate=1e-5, logging_steps=5, - save_steps=20, + save_strategy="no", bf16=True, report_to="none", max_steps=20, From f1d2bbc1ee9427b5412898528aecae0aa93aa821 Mon Sep 17 00:00:00 2001 From: cangtianhuang Date: Sat, 2 Aug 2025 16:54:41 +0800 Subject: [PATCH 4/5] update readme and .gitignore --- .gitignore | 4 +++- README.md | 2 +- engineV2-README.md | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index e4c58aa1..a8d5308a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,6 @@ __pycache__ log.log run.sh tester/api_config/**/test_log* -tools/api_tracer/.huggingface \ No newline at end of file +tester/api_config/api_config* +tools/api_tracer/.huggingface +tools/api_tracer/trace_output* \ No newline at end of file diff --git a/README.md b/README.md index 982dbf6d..8ec10a3c 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ paddle.concat(tuple(Tensor([31376, 768],"float32"),Tensor([1, 768],"float32"),), ``` - 安装第三方库: ```bash - pip install pebble pynvml pandas + pip install func_timeout pandas pebble pynvml pyyaml ``` 4. PaddlePaddle 与 PyTorch 的部分依赖项可能发生冲突,请先安装 *paddlepaddle-gpu* 再安装 *torch*,重新安装请在 pip 后添加 `--force-reinstall` 参数,仅更新 paddle 请添加 `--no-deps` 参数;engineV2 建议使用 python>=3.10 diff --git a/engineV2-README.md b/engineV2-README.md index 7f092ba6..c210742a 100644 --- a/engineV2-README.md +++ b/engineV2-README.md @@ -22,7 +22,7 @@ ```bash pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu118/ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 - pip install pebble pynvml pandas + pip install func_timeout pandas pebble pynvml pyyaml ``` 2. 克隆 PaddleAPITest 仓库并进入项目目录 ```bash From 7ca03a745be36c559c900c680698864dcf552692 Mon Sep 17 00:00:00 2001 From: cangtianhuang Date: Sat, 2 Aug 2025 17:41:51 +0800 Subject: [PATCH 5/5] refine --- .gitignore | 2 +- tools/error_stat/csv_stat_stable.py | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index a8d5308a..851c6790 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,4 @@ run.sh tester/api_config/**/test_log* tester/api_config/api_config* tools/api_tracer/.huggingface -tools/api_tracer/trace_output* \ No newline at end of file +tools/api_tracer/trace_output* diff --git a/tools/error_stat/csv_stat_stable.py b/tools/error_stat/csv_stat_stable.py index d2ada87c..8c507f6d 100644 --- a/tools/error_stat/csv_stat_stable.py +++ b/tools/error_stat/csv_stat_stable.py @@ -1,6 +1,6 @@ # 整理 stable*.csv 精度统计数据,产出:stable_full.csv、stable_stat.csv、stable_stat_api.csv # @author: cangtianhuang -# @date: 2025-07-26 +# @date: 2025-07-30 import glob from collections import defaultdict @@ -10,7 +10,7 @@ import numpy as np import pandas as pd -TEST_LOG_PATH = Path("tester/api_config/stable_csv") +TEST_LOG_PATH = Path("tester/api_config/test_log") OUTPUT_PATH = TEST_LOG_PATH OUTPUT_PATH.mkdir(parents=True, exist_ok=True) @@ -50,6 +50,10 @@ def process_chunk(chunk): comp = row["comp"] max_abs_diff = row["max_abs_diff"] max_rel_diff = row["max_rel_diff"] + + if np.isinf(max_rel_diff): + max_rel_diff = max_abs_diff + stats[(api, dtype, comp)]["abs_diffs"].append(max_abs_diff) stats[(api, dtype, comp)]["rel_diffs"].append(max_rel_diff) api_stats[api][dtype][comp] += 1 @@ -137,6 +141,18 @@ def parallel_process_csv(file_path, chunk_size=2000000): abs_diffs = np.array(stats[(api, dtype, comp)]["abs_diffs"], dtype=np.float64) rel_diffs = np.array(stats[(api, dtype, comp)]["rel_diffs"], dtype=np.float64) + count = len(abs_diffs) + + if not np.any(np.isnan(abs_diffs)): + abs_quantile = np.quantile(abs_diffs, 0.99) + filtered_abs = abs_diffs[abs_diffs <= abs_quantile] + abs_diffs = filtered_abs if len(filtered_abs) > 0 else abs_diffs + + if not np.any(np.isnan(rel_diffs)): + rel_quantile = np.quantile(rel_diffs, 0.99) + filtered_rel = rel_diffs[rel_diffs <= rel_quantile] + rel_diffs = filtered_rel if len(filtered_rel) > 0 else rel_diffs + stats_data.append( { "API": api, @@ -148,7 +164,7 @@ def parallel_process_csv(file_path, chunk_size=2000000): "rel_min": f"{np.min(rel_diffs):.6e}", "rel_max": f"{np.max(rel_diffs):.6e}", "rel_mean": f"{np.mean(rel_diffs):.6e}", - "count": len(abs_diffs), + "count": count, } )