diff --git a/skyrl-train/skyrl_train/generators/skyrl_gym_generator.py b/skyrl-train/skyrl_train/generators/skyrl_gym_generator.py index 23bcaf6695..eacf37677f 100644 --- a/skyrl-train/skyrl_train/generators/skyrl_gym_generator.py +++ b/skyrl-train/skyrl_train/generators/skyrl_gym_generator.py @@ -289,19 +289,15 @@ async def agent_loop( if retokenize_chat_history: reward_out = per_step_rewards[-1][0] else: - if all(reward is None for reward, _ in per_step_rewards[:-1]): - # If all rewards besides the last one are None (i.e. per-trajectory reward), we keep it as a float - reward_out = per_step_rewards[-1][0] - else: - # Otherwise build token-level rewards placed at assistant turn boundaries - token_level_rewards: List[float] = [0.0] * len(response_ids) - for step_reward, idx in per_step_rewards: - if step_reward is None: - continue - if idx >= len(response_ids): - break - token_level_rewards[idx] += step_reward - reward_out = token_level_rewards + # Build token-level rewards placed at assistant turn boundaries + token_level_rewards: List[float] = [0.0] * len(response_ids) + for step_reward, idx in per_step_rewards: + if step_reward is None: + continue + if idx >= len(response_ids): + break + token_level_rewards[idx] += step_reward + reward_out = token_level_rewards return AgentLoopOutput( response_ids=response_ids, diff --git a/skyrl-train/tests/cpu/generators/test_skyrl_gym_generator.py b/skyrl-train/tests/cpu/generators/test_skyrl_gym_generator.py index 29a5c6fef4..3f8865cf3c 100644 --- a/skyrl-train/tests/cpu/generators/test_skyrl_gym_generator.py +++ b/skyrl-train/tests/cpu/generators/test_skyrl_gym_generator.py @@ -965,9 +965,8 @@ def close(self): @pytest.mark.asyncio @patch("skyrl_gym.make") -@pytest.mark.parametrize("is_per_turn_reward", [True, False]) async def test_agent_loop_token_level_rewards_multi_turn_conversation_format( - mock_make, mock_tokenizer, mock_llm, mock_env_cfg, is_per_turn_reward + mock_make, mock_tokenizer, mock_llm, mock_env_cfg ): """use_conversation_multi_turn=True; verify rewards placed at ends of assistant segments before observations.""" mock_tokenizer.eos_token_id = 4 @@ -1008,10 +1007,7 @@ def step(self, action): self.turns += 1 if self.turns == 1: return BaseTextEnvStepOutput( - observations=[{"role": "user", "content": "obs1"}], - reward=0.5 if is_per_turn_reward else None, - done=False, - metadata={}, + observations=[{"role": "user", "content": "obs1"}], reward=0.5, done=False, metadata={} ) else: return BaseTextEnvStepOutput(observations=[], reward=0.25, done=True, metadata={}) @@ -1050,16 +1046,11 @@ def close(self): # Response ids layout: step1 assistant (4 incl. eos) + obs(2) + step2 assistant(4 incl. eos) = 10 assert len(out.response_ids) == 10 - if is_per_turn_reward: - # Rewards at indices: 3 (end of step1 assistant), 9 (end of step2 assistant) - expected = [0.0] * 10 - expected[3] = 0.5 - expected[9] = 0.25 - assert isinstance(out.reward, list) - else: - # Per-trajectory reward is a single float - expected = 0.25 - assert isinstance(out.reward, float) + # Rewards at indices: 3 (end of step1 assistant), 9 (end of step2 assistant) + expected = [0.0] * 10 + expected[3] = 0.5 + expected[9] = 0.25 + assert isinstance(out.reward, list) assert out.reward == expected assert out.stop_reason == "stop"