agent.py
""" Contains the implementation of a generic temporal difference learning
agent."""
# ----- generic imports -----
import numpy as np
import random
import copy
class Agent:
    """ A base class for temporal difference learning agents.

    An agent always has observability of all nodes of the MAS, as the Q-table
    and policies are learned on the joint state and action space. Depending on
    its type, an agent may control the actions of only a subset of the nodes,
    so other agents need to communicate their actions. This is useful for
    centralized and joint learners. Independent learners should override the
    __init__ function and define the state-action space accordingly.

    Note: This is an interface, do not create objects of this class.

    Attributes:
        state_space (tuple of int): the dimensions of the state space; each
            node of the MAS adds one dimension
        action_space (tuple of int): the dimensions of the action space; each
            node controlled by the agent contributes two dimensions (execute
            and off-load)
        learn_space (tuple of int): the dimensions of the Q-table
        Qtable (array of float): an array of dimension learn_space
        nodes (list of Node): all nodes of the MAS
        current_state (list of int): of dimension state_space
        current_action (list of int): the action the agent intends to execute
            in the next step
        learn_parameters (dict of str: float): the values of the
            hyperparameters, including the learning rate, discount factor and
            exploration rate
        log (dict): used for logging information for debugging and plotting
    """
    def __init__(self, nodes, epsilon, alpha, gamma):
        """ Initializes an agent.

        Args:
            nodes (list of :obj:`Node`): nodes comprising the MAS
            epsilon (float): exploration rate
            alpha (float): learning rate
            gamma (float): discount factor
        """
        # ----- initialize Qtable -----
        self.nodes = nodes

        # define state space
        state_space = []
        for node in nodes:
            state_range = node.capacity + 2
            state_space.append(state_range)  # [0, capacity]
        self.state_space = tuple(state_space)

        # define action space
        action_space = []
        for node in nodes:
            action_space.append(2)  # execute action
            action_space.append(len(node.neighbors))  # off-load actions
        self.action_space = tuple(action_space)

        self.learn_space = self.state_space + self.action_space
        self.Qtable = np.random.uniform(low=0, high=0.001,
                                        size=self.learn_space)

        # initialize current state and action
        self.current_state = [0] * len(nodes)  # initial state
        self.current_action = [0, 0] * len(nodes)

        # initialize learning parameters
        self.learn_parameters = {"epsilon": epsilon, "alpha": alpha,
                                 "gamma": gamma}

        # initialize dictionary for logging
        self.log = {"updates": []}
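
        # Example with hypothetical numbers (not from the original code): for
        # a MAS of 2 nodes, each with capacity 3 and 2 neighbors,
        # state_space = (5, 5) and action_space = (2, 2, 2, 2), so Qtable has
        # shape (5, 5, 2, 2, 2, 2).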
    def execute_policy(self, attack_actions, evaluation=False):
        """ Chooses the action to perform based on the policy, the exploration
        scheme and the presence of attackers.

        This function is used both during training, when no adversaries can be
        present, and during evaluation, when no exploration should take place.

        Note: the policy is defined over the action space of defenders. Except
        for minimaxQ, where some nodes are defenders and some are opponents,
        all nodes are defenders. Do not confuse adversaries (present during
        evaluation) with minimaxQ opponents (present during training).

        Args:
            attack_actions (dict of int: list of int): has the form
                abs_idx: relative actions, i.e. an [execute, off-load] pair
                per attacked node
            evaluation (bool): indicates whether the policy is currently being
                deployed, in which case no exploration takes place

        Returns:
            a list of actions to execute, where the indices are absolute
        """
        # ----- epsilon-greedy policy -----
        x = random.uniform(0, 1)
        if x < self.learn_parameters["epsilon"] and not evaluation:
            # perform random move
            self.current_action = []
            for idx, node in enumerate(self.nodes):
                if node in self.control_nodes:
                    self.current_action.append(
                        random.randint(0, 1))  # execute action
                    self.current_action.append(
                        random.randint(0, len(node.neighbors) - 1))  # off-load action
        else:  # perform action based on current policy
            self.onpolicy_action()

        # ----- replace actions of attacked nodes -----
        control_abs_idxs = [node.idx for node in self.control_nodes]
        for key, value in attack_actions.items():
            # map to absolute node idx
            abs_key = self.nodes[key].idx
            if abs_key in control_abs_idxs:
                # map to position in current_action
                rel_key = control_abs_idxs.index(abs_key)
                self.current_action[rel_key * 2] = value[0]
                self.current_action[rel_key * 2 + 1] = value[1]

        # map off-load action from relative idxs to absolute idxs
        abs_action = copy.copy(self.current_action)
        for idx, node in enumerate(self.control_nodes):
            neighbors = node.neighbors
            off_action = self.current_action[idx * 2 + 1]
            abs_action[idx * 2 + 1] = neighbors[off_action]
        return abs_action
    def update_qvalue(self, reward, next_state, def_action):
        """ Updates a value in the Q-table of the agent.

        Note: the value of the target policy is not computed by this class.
        The agent needs to be informed about the actions it actually executed
        in the previous step, in case it had chosen an invalid action (e.g.
        executing a task when the state is 0).

        Args:
            reward (list of float): contains individual node rewards
            next_state (list of int): contains individual node loads
            def_action (list of int): actions executed in the previous step
        """
        # get current Q-value
        self.current_action = def_action
        current_entry = tuple(self.current_state + self.current_action)
        Qcurrent = self.Qtable[current_entry]

        # compute value of target policy
        target = self.compute_target(next_state)

        # update Q-value
        td_error = (sum(reward) + self.learn_parameters["gamma"] * target
                    - Qcurrent)
        self.Qtable[current_entry] = Qcurrent + \
            self.learn_parameters["alpha"] * td_error
    def update(self, reward, next_state, def_action=[],
               opponent_action=[], learn=True):
        """ Updates an agent after interaction with the environment.

        Args:
            reward (list of float): contains individual node rewards
            next_state (list of int): contains individual node loads
            def_action (list of int): contains actions of defenders
            opponent_action (list of int): contains actions of opponents
            learn (bool): indicates whether the Q-table will be updated
        """
        pass
    def compute_target(self, next_state):
        """ Computes the value of the target policy in the temporal
        difference learning update.
        """
        pass
    def onpolicy_action(self, deployment=False):
        """ Sets current_action to the greedy action according to the current
        policy.
        """
        pass
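

# ----- illustrative sketch (not part of the original interface) -----
# A minimal concrete subclass, assuming a Q-learning style target; the class
# name, the choice of controlling all nodes, and the greedy tie-breaking via
# np.argmax are illustrative assumptions, not prescribed by Agent.
class QLearningAgentSketch(Agent):
    """ Example centralized learner filling in the abstract methods. """

    def __init__(self, nodes, epsilon, alpha, gamma):
        super().__init__(nodes, epsilon, alpha, gamma)
        # execute_policy expects control_nodes; a centralized learner is
        # assumed here to control every node of the MAS
        self.control_nodes = nodes

    def compute_target(self, next_state):
        # Q-learning target: value of the greedy action in the next state,
        # i.e. the maximum over the action dimensions of the Q-table slice
        return np.max(self.Qtable[tuple(next_state)])

    def onpolicy_action(self, deployment=False):
        # greedy action for the current state: take the flat argmax of the
        # Q-table slice and unravel it into one index per action dimension
        action_values = self.Qtable[tuple(self.current_state)]
        flat_idx = np.argmax(action_values)
        self.current_action = list(np.unravel_index(flat_idx,
                                                    action_values.shape))

    def update(self, reward, next_state, def_action=[],
               opponent_action=[], learn=True):
        # apply the TD update, then move the agent to the observed next state
        if learn:
            self.update_qvalue(reward, next_state, def_action)
        self.current_state = list(next_state)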