Commit: fix tests
dongruoping committed Feb 11, 2021
1 parent 3fb14b9 commit 4e4ecad
Showing 6 changed files with 80 additions and 26 deletions.
gym-unity/gym_unity/tests/test_gym.py (6 additions, 1 deletion)
@@ -246,7 +246,12 @@ def create_mock_vector_steps(specs, num_agents=1, number_visual_observations=0):
     ] * number_visual_observations
     rewards = np.array(num_agents * [1.0])
     agents = np.array(range(0, num_agents))
-    return DecisionSteps(obs, rewards, agents, None), TerminalSteps.empty(specs)
+    group_id = np.array(num_agents * [0])
+    group_rewards = np.array(num_agents * [0.0])
+    return (
+        DecisionSteps(obs, rewards, agents, None, group_id, group_rewards),
+        TerminalSteps.empty(specs),
+    )
 
 
 def setup_mock_unityenvironment(mock_env, mock_spec, mock_decision, mock_termination):
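The gym wrapper tests drive everything through this mock, so a quick sanity check of its new return shape can help when reading the rest of the diff. A minimal sketch, assuming a BehaviorSpec-like `specs` object as used elsewhere in this test module (the usage below is illustrative, not part of the commit):

    decision_steps, terminal_steps = create_mock_vector_steps(specs)
    assert len(decision_steps) == 1                    # one agent by default
    assert (decision_steps.group_reward == 0.0).all()  # zero-filled group reward
    assert len(terminal_steps) == 0                    # terminal batch starts empty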
ml-agents-envs/mlagents_envs/base_env.py (10 additions, 10 deletions)
@@ -54,10 +54,10 @@ class DecisionStep(NamedTuple):
 
     obs: List[np.ndarray]
     reward: float
-    group_reward: float
     agent_id: AgentId
     action_mask: Optional[List[np.ndarray]]
     group_id: int
+    group_reward: float
 
 
 class DecisionSteps(Mapping):
@@ -83,13 +83,13 @@ class DecisionSteps(Mapping):
     this simulation step.
     """
 
-    def __init__(self, obs, reward, group_reward, agent_id, action_mask, group_id):
+    def __init__(self, obs, reward, agent_id, action_mask, group_id, group_reward):
         self.obs: List[np.ndarray] = obs
         self.reward: np.ndarray = reward
-        self.group_reward: np.ndarray = group_reward
         self.agent_id: np.ndarray = agent_id
         self.action_mask: Optional[List[np.ndarray]] = action_mask
         self.group_id: np.ndarray = group_id
+        self.group_reward: np.ndarray = group_reward
         self._agent_id_to_index: Optional[Dict[AgentId, int]] = None
 
     @property
@@ -128,10 +128,10 @@ def __getitem__(self, agent_id: AgentId) -> DecisionStep:
         return DecisionStep(
             obs=agent_obs,
             reward=self.reward[agent_index],
-            group_reward=self.group_reward[agent_index],
             agent_id=agent_id,
             action_mask=agent_mask,
             group_id=group_id,
+            group_reward=self.group_reward[agent_index],
         )
 
     def __iter__(self) -> Iterator[Any]:
@@ -149,10 +149,10 @@ def empty(spec: "BehaviorSpec") -> "DecisionSteps":
         return DecisionSteps(
             obs=obs,
             reward=np.zeros(0, dtype=np.float32),
-            group_reward=np.zeros(0, dtype=np.float32),
             agent_id=np.zeros(0, dtype=np.int32),
             action_mask=None,
             group_id=np.zeros(0, dtype=np.int32),
+            group_reward=np.zeros(0, dtype=np.float32),
         )
 
 
@@ -170,10 +170,10 @@ class TerminalStep(NamedTuple):
 
     obs: List[np.ndarray]
     reward: float
-    group_reward: float
     interrupted: bool
     agent_id: AgentId
     group_id: int
+    group_reward: float
 
 
 class TerminalSteps(Mapping):
@@ -194,13 +194,13 @@ class TerminalSteps(Mapping):
     across simulation steps.
     """
 
-    def __init__(self, obs, reward, group_reward, interrupted, agent_id, group_id):
+    def __init__(self, obs, reward, interrupted, agent_id, group_id, group_reward):
         self.obs: List[np.ndarray] = obs
         self.reward: np.ndarray = reward
-        self.group_reward: np.ndarray = group_reward
         self.interrupted: np.ndarray = interrupted
         self.agent_id: np.ndarray = agent_id
         self.group_id: np.ndarray = group_id
+        self.group_reward: np.ndarray = group_reward
         self._agent_id_to_index: Optional[Dict[AgentId, int]] = None
 
     @property
@@ -235,10 +235,10 @@ def __getitem__(self, agent_id: AgentId) -> TerminalStep:
         return TerminalStep(
             obs=agent_obs,
             reward=self.reward[agent_index],
-            group_reward=self.group_reward[agent_index],
             interrupted=self.interrupted[agent_index],
             agent_id=agent_id,
             group_id=group_id,
+            group_reward=self.group_reward[agent_index],
         )
 
     def __iter__(self) -> Iterator[Any]:
@@ -256,10 +256,10 @@ def empty(spec: "BehaviorSpec") -> "TerminalSteps":
         return TerminalSteps(
             obs=obs,
             reward=np.zeros(0, dtype=np.float32),
-            group_reward=np.zeros(0, dtype=np.float32),
             interrupted=np.zeros(0, dtype=np.bool),
             agent_id=np.zeros(0, dtype=np.int32),
             group_id=np.zeros(0, dtype=np.int32),
+            group_reward=np.zeros(0, dtype=np.float32),
         )
 
 
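Both constructors are called positionally throughout the codebase, so the new order (obs, reward, agent_id, action_mask, group_id, group_reward) is the contract this commit settles on. A minimal sketch of constructing and indexing a batch under the new signature (array values here are illustrative, not from the commit):

    import numpy as np
    from mlagents_envs.base_env import DecisionSteps

    # Hypothetical two-agent batch; group_reward is now the last positional argument.
    obs = [np.zeros((2, 4), dtype=np.float32)]  # one obs array, shape (n_agents, obs_size)
    reward = np.array([1.0, 0.5], dtype=np.float32)
    agent_id = np.array([0, 1], dtype=np.int32)
    group_id = np.array([0, 0], dtype=np.int32)
    group_reward = np.array([0.0, 0.0], dtype=np.float32)

    steps = DecisionSteps(obs, reward, agent_id, None, group_id, group_reward)
    step = steps[1]  # per-agent DecisionStep via __getitem__
    print(step.reward, step.group_reward)  # 0.5 0.0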
ml-agents-envs/mlagents_envs/rpc_utils.py (2 additions, 2 deletions)
@@ -366,18 +366,18 @@ def steps_from_proto(
         DecisionSteps(
             decision_obs_list,
             decision_rewards,
-            decision_group_rewards,
             decision_agent_id,
             action_mask,
             decision_group_id,
+            decision_group_rewards,
         ),
         TerminalSteps(
             terminal_obs_list,
             terminal_rewards,
-            terminal_group_rewards,
             max_step,
             terminal_agent_id,
             terminal_group_id,
+            terminal_group_rewards,
         ),
     )
 
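steps_from_proto passes these arguments positionally, so the call sites must track the constructor order exactly; with the old order, decision_group_rewards would now bind silently to the agent_id parameter. A keyword-argument form would make the pairing explicit (hypothetical rewrite for illustration, not part of the commit):

    DecisionSteps(
        obs=decision_obs_list,
        reward=decision_rewards,
        agent_id=decision_agent_id,
        action_mask=action_mask,
        group_id=decision_group_id,
        group_reward=decision_group_rewards,
    )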
ml-agents-envs/mlagents_envs/tests/test_steps.py (4 additions)
@@ -16,6 +16,8 @@ def test_decision_steps():
         reward=np.array(range(3), dtype=np.float32),
         agent_id=np.array(range(10, 13), dtype=np.int32),
         action_mask=[np.zeros((3, 4), dtype=np.bool)],
+        group_id=np.array(range(3), dtype=np.int32),
+        group_reward=np.array(range(3), dtype=np.float32),
     )
 
     assert ds.agent_id_to_index[10] == 0
@@ -51,6 +53,8 @@ def test_terminal_steps():
         reward=np.array(range(3), dtype=np.float32),
         agent_id=np.array(range(10, 13), dtype=np.int32),
         interrupted=np.array([1, 0, 1], dtype=np.bool),
+        group_id=np.array(range(3), dtype=np.int32),
+        group_reward=np.array(range(3), dtype=np.float32),
     )
 
     assert ts.agent_id_to_index[10] == 0
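Beyond construction, these tests still only assert on agent_id_to_index; a follow-up check on the new per-agent fields could look like this (illustrative, not in the diff, reusing the `ds` built above with group_id and group_reward set to range(3)):

    step = ds[11]  # agent_id 11 maps to index 1
    assert step.group_id == 1
    assert step.group_reward == 1.0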
ml-agents/mlagents/trainers/tests/mock_brain.py (8 additions, 2 deletions)
@@ -43,15 +43,21 @@ def create_mock_steps(
     reward = np.array(num_agents * [1.0], dtype=np.float32)
     interrupted = np.array(num_agents * [False], dtype=np.bool)
     agent_id = np.arange(num_agents, dtype=np.int32)
+    group_id = np.array(num_agents * [0], dtype=np.int32)
+    group_reward = np.array(num_agents * [0.0], dtype=np.float32)
     behavior_spec = BehaviorSpec(observation_specs, action_spec)
     if done:
         return (
             DecisionSteps.empty(behavior_spec),
-            TerminalSteps(obs_list, reward, interrupted, agent_id),
+            TerminalSteps(
+                obs_list, reward, interrupted, agent_id, group_id, group_reward
+            ),
         )
     else:
         return (
-            DecisionSteps(obs_list, reward, agent_id, action_mask),
+            DecisionSteps(
+                obs_list, reward, agent_id, action_mask, group_id, group_reward
+            ),
             TerminalSteps.empty(behavior_spec),
         )
 
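The zero-filled group_id and group_reward keep the single-agent trainer tests numerically unchanged while satisfying the new six-argument constructors. The done branch's TerminalSteps, written out standalone (a sketch with illustrative values, not from the commit):

    num_agents = 2
    obs_list = [np.zeros((num_agents, 4), dtype=np.float32)]
    reward = np.array(num_agents * [1.0], dtype=np.float32)
    interrupted = np.array(num_agents * [False], dtype=np.bool)
    agent_id = np.arange(num_agents, dtype=np.int32)
    group_id = np.array(num_agents * [0], dtype=np.int32)       # all agents in the null group
    group_reward = np.array(num_agents * [0.0], dtype=np.float32)
    terminal = TerminalSteps(obs_list, reward, interrupted, agent_id, group_id, group_reward)
    assert not terminal.interrupted.any()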
ml-agents/mlagents/trainers/tests/simple_test_envs.py (50 additions, 11 deletions)
@@ -165,13 +165,17 @@ def _reset_agent(self, name):
         self.agent_id[name] = self.agent_id[name] + 1
 
     def _make_batched_step(
-        self, name: str, done: bool, reward: float
+        self, name: str, done: bool, reward: float, group_reward: float
     ) -> Tuple[DecisionSteps, TerminalSteps]:
         m_vector_obs = self._make_obs(self.goal[name])
         m_reward = np.array([reward], dtype=np.float32)
         m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
+        m_group_id = np.array([0], dtype=np.int32)
+        m_group_reward = np.array([group_reward], dtype=np.float32)
         action_mask = self._generate_mask()
-        decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
+        decision_step = DecisionSteps(
+            m_vector_obs, m_reward, m_agent_id, action_mask, m_group_id, m_group_reward
+        )
         terminal_step = TerminalSteps.empty(self.behavior_spec)
         if done:
             self.final_rewards[name].append(self.rewards[name])
@@ -182,24 +186,45 @@ def _make_batched_step(
                 new_done,
                 new_agent_id,
                 new_action_mask,
+                new_group_id,
+                new_group_reward,
             ) = self._construct_reset_step(name)
 
             decision_step = DecisionSteps(
-                new_vector_obs, new_reward, new_agent_id, new_action_mask
+                new_vector_obs,
+                new_reward,
+                new_agent_id,
+                new_action_mask,
+                new_group_id,
+                new_group_reward,
             )
             terminal_step = TerminalSteps(
-                m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
+                m_vector_obs,
+                m_reward,
+                np.array([False], dtype=np.bool),
+                m_agent_id,
+                m_group_id,
+                m_group_reward,
             )
         return (decision_step, terminal_step)
 
     def _construct_reset_step(
         self, name: str
-    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
         new_reward = np.array([0.0], dtype=np.float32)
         new_done = np.array([False], dtype=np.bool)
         new_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
         new_action_mask = self._generate_mask()
-        return new_reward, new_done, new_agent_id, new_action_mask
+        new_group_id = np.array([0], dtype=np.int32)
+        new_group_reward = np.array([0.0], dtype=np.float32)
+        return (
+            new_reward,
+            new_done,
+            new_agent_id,
+            new_action_mask,
+            new_group_id,
+            new_group_reward,
+        )
 
     def step(self) -> None:
         assert all(action is not None for action in self.action.values())
Expand All @@ -208,12 +233,12 @@ def step(self) -> None:
done = self._take_action(name)
reward = self._compute_reward(name, done)
self.rewards[name] += reward
self.step_result[name] = self._make_batched_step(name, done, reward)
self.step_result[name] = self._make_batched_step(name, done, reward, 0.0)

def reset(self) -> None: # type: ignore
for name in self.names:
self._reset_agent(name)
self.step_result[name] = self._make_batched_step(name, False, 0.0)
self.step_result[name] = self._make_batched_step(name, False, 0.0, 0.0)

@property
def reset_parameters(self) -> Dict[str, str]:
@@ -231,14 +256,16 @@ def __init__(self, brain_names, action_sizes=(1, 0), step_size=0.2):
         self.num_show_steps = 2
 
     def _make_batched_step(
-        self, name: str, done: bool, reward: float
+        self, name: str, done: bool, reward: float, group_reward: float
     ) -> Tuple[DecisionSteps, TerminalSteps]:
         recurrent_obs_val = (
             self.goal[name] if self.step_count[name] <= self.num_show_steps else 0
         )
         m_vector_obs = self._make_obs(recurrent_obs_val)
         m_reward = np.array([reward], dtype=np.float32)
         m_agent_id = np.array([self.agent_id[name]], dtype=np.int32)
+        m_group_id = np.array([0], dtype=np.int32)
+        m_group_reward = np.array([group_reward], dtype=np.float32)
         action_mask = self._generate_mask()
         decision_step = DecisionSteps(m_vector_obs, m_reward, m_agent_id, action_mask)
         terminal_step = TerminalSteps.empty(self.behavior_spec)
@@ -254,12 +281,24 @@ def _make_batched_step(
                 new_done,
                 new_agent_id,
                 new_action_mask,
+                new_group_id,
+                new_group_reward,
             ) = self._construct_reset_step(name)
             decision_step = DecisionSteps(
-                new_vector_obs, new_reward, new_agent_id, new_action_mask
+                new_vector_obs,
+                new_reward,
+                new_agent_id,
+                new_action_mask,
+                new_group_id,
+                new_group_reward,
             )
             terminal_step = TerminalSteps(
-                m_vector_obs, m_reward, np.array([False], dtype=np.bool), m_agent_id
+                m_vector_obs,
+                m_reward,
+                np.array([False], dtype=np.bool),
+                m_agent_id,
+                m_group_id,
+                m_group_reward,
             )
         return (decision_step, terminal_step)
 
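Note that step() and reset() always pass 0.0 for the new group_reward argument, so these simple environments never actually exercise group rewards. A variant that did would thread a real value through at the call site (hypothetical sketch, not part of the commit):

    # Hypothetical: award the whole group a shared bonus when an episode ends.
    def step(self) -> None:
        assert all(action is not None for action in self.action.values())
        for name in self.names:
            done = self._take_action(name)
            reward = self._compute_reward(name, done)
            self.rewards[name] += reward
            group_bonus = 1.0 if done else 0.0  # shared, not per-agent
            self.step_result[name] = self._make_batched_step(name, done, reward, group_bonus)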
