
Commit 1b2368f

Clean up logs (#398)
Signed-off-by: Xiang Xu <xiangxu@google.com>
1 parent: 314adcc

File tree: 5 files changed, +6 −25 lines


README.md

Lines changed: 0 additions & 6 deletions

@@ -40,7 +40,6 @@ Run `Llama 3.1 8B` offline inference on 4 TPU chips:
 HF_TOKEN=<huggingface_token> python tpu_commons/examples/offline_inference.py \
     --model=meta-llama/Llama-3.1-8B \
     --tensor_parallel_size=4 \
-    --task=generate \
     --max_model_len=1024
 ```

@@ -51,7 +50,6 @@ Run `Llama 3.1 8B Instruct` offline inference on 4 TPU chips in disaggregated mo
 ```
 PREFILL_SLICES=2 DECODE_SLICES=2 HF_TOKEN=<huggingface_token> \
 python tpu_commons/examples/offline_inference.py \
-    --task=generate \
     --model=meta-llama/Meta-Llama-3-8B-Instruct \
     --max_model_len=1024 \
     --max_num_seqs=8

@@ -80,7 +78,6 @@ Run `Llama 3.1 70B Instruct` offline inference on 4 hosts (v6e-16) in interleave
 HF_TOKEN=<huggingface_token> python /workspace/tpu_commons/examples/offline_inference.py \
     --model=meta-llama/Llama-3.1-70B \
     --tensor_parallel_size=16 \
-    --task=generate \
     --max_model_len=1024
 ```

@@ -94,7 +91,6 @@ export HF_TOKEN=<huggingface_token>
 python tpu_commons/examples/offline_inference.py \
     --model=meta-llama/Llama-3.1-8B \
     --tensor_parallel_size=4 \
-    --task=generate \
     --max_model_len=1024
 ```

@@ -106,7 +102,6 @@ export HF_TOKEN=<huggingface_token>
 python vllm/examples/offline_inference/basic/generate.py \
     --model=Qwen/Qwen3-30B-A3B \
     --tensor_parallel_size=4 \
-    --task=generate \
     --max_model_len=1024 \
     --enable-expert-parallel
 ```

@@ -205,7 +200,6 @@ docker run \
 python /workspace/tpu_commons/examples/offline_inference.py \
     --model=meta-llama/Llama-3.1-8B \
     --tensor_parallel_size=4 \
-    --task=generate \
     --max_model_len=1024 \
 ```

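Every README hunk drops the same `--task=generate` flag from the example commands; the flag appears redundant for these generate-task models. For orientation, here is a minimal sketch of roughly what an offline-inference entry point like `tpu_commons/examples/offline_inference.py` does with vLLM's public Python API; the prompt and sampling settings are illustrative, not the script's actual contents:

```python
# Minimal offline-generation sketch using vLLM's Python API.
# The README's CLI flags map onto LLM(...) keyword arguments.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.1-8B",  # --model
    tensor_parallel_size=4,           # --tensor_parallel_size
    max_model_len=1024,               # --max_model_len
)
outputs = llm.generate(["The capital of France is"],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```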
tpu_commons/models/jax/model_loader.py

Lines changed: 1 addition & 1 deletion

@@ -220,7 +220,7 @@ def get_model(
     mesh: Mesh,
 ) -> Any:
     impl = os.getenv("MODEL_IMPL_TYPE", "flax_nnx").lower()
-    logger.info(f"Loading model, implementation type={impl}")
+    logger.info(f"Loading model with MODEL_IMPL_TYPE={impl}")
     if impl == "flax_nnx":
         return get_flax_model(vllm_config, rng, mesh)
     elif impl == "vllm":

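The reworded log line names the `MODEL_IMPL_TYPE` environment variable that drives the dispatch. A standalone sketch of that selection scheme, assuming only the `flax_nnx` and `vllm` values visible in this hunk are meaningful (the `vllm` loader name below is hypothetical, not shown in the diff):

```python
import os

# Same selection scheme as model_loader.get_model: MODEL_IMPL_TYPE defaults
# to "flax_nnx" and is lower-cased before comparison.
impl = os.getenv("MODEL_IMPL_TYPE", "flax_nnx").lower()
print(f"Loading model with MODEL_IMPL_TYPE={impl}")  # the new log message

if impl == "flax_nnx":
    loader = "get_flax_model"   # stand-in for the real loader call
elif impl == "vllm":
    loader = "get_vllm_model"   # hypothetical name; not shown in this hunk
else:
    raise ValueError(f"Unknown MODEL_IMPL_TYPE: {impl}")
```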
tpu_commons/models/jax/utils/weight_utils.py

Lines changed: 1 addition & 1 deletion

@@ -57,7 +57,7 @@ def hf_model_weights_iterator(
     weights_files = []
     weights_location = "local"
     if os.path.isdir(model_name_or_path):
-        logger.info(f"Loading weights locally from: {model_name_or_path}")
+        logger.info(f"Found weights from local: {model_name_or_path}")
         weights_files = glob.glob(
             os.path.join(model_name_or_path, HF_WEIGHTS_FORMAT))
     elif file_utils.is_gcs_path(model_name_or_path):

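The branch touched here discovers weight shards for a local checkout by globbing a fixed pattern. A self-contained sketch of that lookup; `HF_WEIGHTS_FORMAT` is defined elsewhere in `weight_utils.py`, so the value below is an assumption:

```python
import glob
import os

HF_WEIGHTS_FORMAT = "*.safetensors"  # assumed value; the real constant lives in weight_utils.py

def find_local_weight_files(model_name_or_path: str) -> list[str]:
    """Return local weight shard paths, or [] if the argument is not a directory."""
    if os.path.isdir(model_name_or_path):
        return glob.glob(os.path.join(model_name_or_path, HF_WEIGHTS_FORMAT))
    return []
```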
tpu_commons/platforms/tpu_jax.py

Lines changed: 4 additions & 3 deletions

@@ -68,6 +68,7 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,

     @classmethod
     def get_device_name(cls, device_id: int = 0) -> str:
+        logger.info(jax.lib.xla_bridge.get_backend().platform_version)
         try:
             if envs.VLLM_TPU_USING_PATHWAYS:
                 return jax.local_devices()[0].device_kind

@@ -174,14 +175,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:

         multihost_backend = os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()
         if not multihost_backend:  # Single host
-            logger.warning(
-                "JAX requires to use uniproc_executor for single host.")
+            logger.info("Force using UniProcExecutor for JAX on single host.")
             parallel_config.distributed_executor_backend = "uni"
         elif multihost_backend == "ray":
             from tpu_commons.executors.ray_distributed_executor import \
                 RayDistributedExecutor
             parallel_config.distributed_executor_backend = RayDistributedExecutor
-            logger.info("Using Ray as the TPU multihost backend. ")
+            logger.info(
+                "Force using RayDistributedExecutor for JAX on single host.")
         else:
             logger.warning(
                 f"Unknown TPU multihost backend: {multihost_backend}. "

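The second hunk's logic reduces to an environment-variable switch over `TPU_MULTIHOST_BACKEND`. A minimal sketch of that decision, with string names standing in for the executor classes the real code assigns:

```python
import os

def pick_executor_backend() -> str:
    # Mirrors check_and_update_config: an empty TPU_MULTIHOST_BACKEND means
    # single host (UniProcExecutor, "uni"); "ray" selects RayDistributedExecutor.
    multihost_backend = os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()
    if not multihost_backend:
        return "uni"
    if multihost_backend == "ray":
        return "ray"
    # The real code only logs a warning on unknown values; this sketch raises.
    raise ValueError(f"Unknown TPU multihost backend: {multihost_backend}")
```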
tpu_commons/runner/jax/tpu_jax_runner.py

Lines changed: 0 additions & 14 deletions

@@ -84,7 +84,6 @@ def __init__(

         self.maybe_forbid_compile = runner_utils.ForbidCompile(
         ) if envs.VLLM_XLA_CHECK_RECOMPILATION else nullcontext()
-        logger.info("TPUModelRunner created!")

     def _verify_chunked_prefill_config(self):
         if (self.scheduler_config.max_num_batched_tokens

@@ -106,9 +105,6 @@ def _init_mesh(self) -> None:
             sharding_strategy = \
                 self.vllm_config.additional_config["sharding"]["sharding_strategy"]
         except KeyError:
-            logger.warning(
-                f"No sharding strategy passed! Using default of full model parallelism={len(self.devices)}"
-            )
             sharding_strategy = {"tensor_parallelism": len(self.devices)}

         if os.getenv("NEW_MODEL_DESIGN", False):

@@ -120,20 +116,12 @@ def _init_mesh(self) -> None:
         try:
             dp = sharding_strategy["data_parallelism"]
         except KeyError:
-            logger.warning(
-                "No data parallelism passed! Using default value of 1")
             dp = 1
-
         try:
             tp = sharding_strategy["tensor_parallelism"]
         except KeyError:
-            logger.warning(
-                f"No tensor parallelism passed! Using default value of {len(self.devices)}"
-            )
             tp = len(self.devices)

-        tp = sharding_strategy["tensor_parallelism"]
-
         axis_names = ("data", "model")
         mesh_shape = (dp, tp)

@@ -277,8 +265,6 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(self.kv_caches)

-        logger.info(jax.lib.xla_bridge.get_backend().platform_version)
-
     def _precompile_backbone(self) -> None:
         for num_tokens in self.num_tokens_paddings:
             input_ids = np.ones((num_tokens, ), dtype=np.int32)

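With the warnings removed, `_init_mesh` now defaults silently: dp falls back to 1 and tp to the full device count. Note the third hunk also deletes a stray unconditional `tp = sharding_strategy["tensor_parallelism"]` that would have raised KeyError whenever the key was absent, bypassing the fallback above it. A sketch of the defaulting plus the mesh construction it feeds, assuming a flat device list (the helper name is illustrative):

```python
import jax
import numpy as np
from jax.sharding import Mesh

def make_mesh(sharding_strategy: dict, devices: list) -> Mesh:
    # Defaults match _init_mesh: data_parallelism -> 1,
    # tensor_parallelism -> len(devices); axes are ("data", "model").
    dp = sharding_strategy.get("data_parallelism", 1)
    tp = sharding_strategy.get("tensor_parallelism", len(devices))
    device_grid = np.asarray(devices).reshape(dp, tp)
    return Mesh(device_grid, axis_names=("data", "model"))

mesh = make_mesh({}, jax.devices())  # dp=1, tp=<all local devices>
```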