sb3_independent.py
import argparse

import gym
import supersuit as ss
import torch
import torch.nn.functional as F

# pip install git+https://github.com/Rohan138/marl-baselines3
from marl_baselines3 import IndependentPPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env.vec_monitor import VecMonitor
from torch import nn

from social_dilemmas.envs.pettingzoo_env import parallel_env

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def parse_args():
    parser = argparse.ArgumentParser("MARL-Baselines3 PPO with Independent Learning")
    parser.add_argument(
        "--env-name",
        type=str,
        default="harvest",
        choices=["harvest", "cleanup"],
        help="The SSD environment to use",
    )
    parser.add_argument(
        "--num-agents",
        type=int,
        default=5,
        help="The number of agents",
    )
    parser.add_argument(
        "--rollout-len",
        type=int,
        default=1000,
        help="Length of training rollouts AND length at which the env is reset",
    )
    parser.add_argument(
        "--total-timesteps",
        type=int,
        default=int(5e8),
        help="Total number of environment timesteps to train for",
    )
    parser.add_argument(
        "--use-collective-reward",
        type=bool,
        default=False,
        help="Give each agent the collective reward across all agents",
    )
    parser.add_argument(
        "--inequity-averse-reward",
        type=bool,
        default=False,
        help="Use inequity-averse rewards from 'Inequity aversion "
        "improves cooperation in intertemporal social dilemmas'",
    )
    parser.add_argument(
        "--alpha",
        type=float,
        default=5,
        help="Disadvantageous inequity aversion factor (envy)",
    )
    parser.add_argument(
        "--beta",
        type=float,
        default=0.05,
        help="Advantageous inequity aversion factor (guilt)",
    )
    args = parser.parse_args()
    return args
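
# Example invocations (illustrative sketches only; the script path and any flag
# values beyond the defaults defined in parse_args() above are assumptions):
#
#   python sb3_independent.py --env-name harvest --num-agents 5
#   python sb3_independent.py --env-name cleanup --rollout-len 1000 --total-timesteps 500000000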


# Use this with lambda wrapper returning observations only
class CustomCNN(BaseFeaturesExtractor):
    """
    :param observation_space: (gym.Space)
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of units in the last layer.
    """

    def __init__(
        self,
        observation_space: gym.spaces.Box,
        features_dim=128,
        view_len=7,
        num_frames=6,
        fcnet_hiddens=[1024, 128],
    ):
        super(CustomCNN, self).__init__(observation_space, features_dim)
        # Observations arrive channels-last (H x W x C) from the SuperSuit wrappers;
        # forward() permutes them to channels-first before the conv layer.
        flat_out = num_frames * 6 * (view_len * 2 - 1) ** 2
        self.conv = nn.Conv2d(
            in_channels=num_frames * 3,  # e.g. 18 x 15 x 15 with num_frames=6, view_len=7
            out_channels=num_frames * 6,  # e.g. 36 x 13 x 13 after the 3x3 valid conv
            kernel_size=3,
            stride=1,
            padding="valid",
        )
        self.fc1 = nn.Linear(in_features=flat_out, out_features=fcnet_hiddens[0])
        self.fc2 = nn.Linear(in_features=fcnet_hiddens[0], out_features=fcnet_hiddens[1])

    def forward(self, observations) -> torch.Tensor:
        # Convert from B x H x W x C to B x C x H x W (channels-first for Conv2d)
        observations = observations.permute(0, 3, 1, 2)
        features = torch.flatten(F.relu(self.conv(observations)), start_dim=1)
        features = F.relu(self.fc1(features))
        features = F.relu(self.fc2(features))
        return features
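
# Minimal shape-check sketch (an illustrative example, not part of the training
# script): feeding a random channels-last batch through CustomCNN with the
# defaults above (view_len=7 -> 15x15 view, num_frames=6 -> 18 stacked channels)
# should yield a (batch, features_dim) tensor.
#
#   import numpy as np
#   obs_space = gym.spaces.Box(low=0, high=255, shape=(15, 15, 18), dtype=np.uint8)
#   extractor = CustomCNN(obs_space, features_dim=128, view_len=7, num_frames=6)
#   dummy = torch.zeros(2, 15, 15, 18)
#   assert extractor(dummy).shape == (2, 128)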


def main(args):
    # Config
    env_name = args.env_name
    num_agents = args.num_agents
    rollout_len = args.rollout_len
    total_timesteps = args.total_timesteps
    use_collective_reward = args.use_collective_reward
    inequity_averse_reward = args.inequity_averse_reward
    alpha = args.alpha
    beta = args.beta

    # Training
    num_cpus = 4  # number of cpus
    num_envs = 12  # number of parallel multi-agent environments
    num_frames = 6  # number of frames to stack together; use >4 to avoid automatic VecTransposeImage
    features_dim = (
        128  # output layer of cnn extractor AND shared layer for policy and value functions
    )
    fcnet_hiddens = [1024, 128]  # Two hidden layers for cnn extractor
    ent_coef = 0.001  # entropy coefficient in loss
    batch_size = rollout_len * num_envs // 2  # This is from the rllib baseline implementation
    lr = 0.0001
    n_epochs = 30
    gae_lambda = 1.0
    gamma = 0.99
    target_kl = 0.01
    grad_clip = 40
    verbose = 3

    env = parallel_env(
        max_cycles=rollout_len,
        env=env_name,
        num_agents=num_agents,
        use_collective_reward=use_collective_reward,
        inequity_averse_reward=inequity_averse_reward,
        alpha=alpha,
        beta=beta,
    )
    env = ss.observation_lambda_v0(env, lambda x, _: x["curr_obs"], lambda s: s["curr_obs"])
    env = ss.frame_stack_v1(env, num_frames)
    env = ss.pettingzoo_env_to_vec_env_v1(env)
    env = ss.concat_vec_envs_v1(
        env, num_vec_envs=num_envs, num_cpus=num_cpus, base_class="stable_baselines3"
    )
    env = VecMonitor(env)

    policy_kwargs = dict(
        features_extractor_class=CustomCNN,
        features_extractor_kwargs=dict(
            features_dim=features_dim, num_frames=num_frames, fcnet_hiddens=fcnet_hiddens
        ),
        net_arch=[features_dim],
    )

    tensorboard_log = f"./results/sb3/{env_name}_ppo_independent"

    model = IndependentPPO(
        "CnnPolicy",
        num_agents=num_agents,
        env=env,
        learning_rate=lr,
        n_steps=rollout_len,
        batch_size=batch_size,
        n_epochs=n_epochs,
        gamma=gamma,
        gae_lambda=gae_lambda,
        ent_coef=ent_coef,
        max_grad_norm=grad_clip,
        target_kl=target_kl,
        policy_kwargs=policy_kwargs,
        tensorboard_log=tensorboard_log,
        verbose=verbose,
    )
    model.learn(total_timesteps=total_timesteps)

    logdir = model.logger.dir
    model.save(logdir)
    del model
    model = IndependentPPO.load(  # noqa: F841
        logdir, "CnnPolicy", num_agents, env, rollout_len, policy_kwargs, tensorboard_log, verbose
    )


if __name__ == "__main__":
    args = parse_args()
    main(args)
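
# Training curves are written under tensorboard_log; to inspect them (assuming
# TensorBoard is installed in the same environment):
#
#   tensorboard --logdir ./results/sb3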