-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathBS_brain.py
1451 lines (1230 loc) · 79 KB
/
BS_brain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# implement the DQN and GNN in Each D2D
# here we use D2D and V2V interchangeably
import numpy as np
import pickle
import keras
from keras.layers import Input, Dense, Layer, activations, Lambda, add
from keras.models import Model
from keras import backend as K
import tensorflow as tf
import datetime
import os
import random
# Define GNN layer
class GNNLayer(Layer):
def __init__(self, output_dim, activation=None, **kwargs):
self.output_dim = output_dim
self.activation = activations.get(activation)
super(GNNLayer, self).__init__(**kwargs)
def build(self, input_shape):
# Create a trainable weight variable for this layer.
self.W1 = self.add_weight(name='W1',
shape=(input_shape[0][1], self.output_dim),
initializer='glorot_uniform',
trainable=True)
self.W2 = self.add_weight(name='W2',
shape=(input_shape[1][1], self.output_dim),
initializer='glorot_uniform',
trainable=True)
self.W3 = self.add_weight(name='W3',
shape=(input_shape[2][1], self.output_dim),
initializer='glorot_uniform',
trainable=True)
self.b = self.add_weight(name='bias',
shape=(self.output_dim,),
initializer='zeros',
trainable=True)
super(GNNLayer, self).build(input_shape) # Be sure to call this at the end
def call(self, x):
assert isinstance(x, list)
a, b, c = x
output = K.dot(a, self.W1)+K.dot(b, self.W2)+K.dot(c, self.W3)
output = K.bias_add(output, self.b, data_format='channels_last')
if self.activation is not None:
output = self.activation(output)
return output
def compute_output_shape(self, input_shape):
assert isinstance(input_shape, list)
shape_a, shape_b, shape_c = input_shape
return (shape_a[0], self.output_dim)
# Define Aggregate layer
class AggLayer(Layer):
def __init__(self, output_dim, **kwargs):
self.output_dim = output_dim
super(AggLayer, self).__init__(**kwargs)
def build(self, input_shape):
super(AggLayer, self).build(input_shape) # Be sure to call this at the end
def call(self, x):
assert isinstance(x, list)
D1, D2, D3, D4, A = x
D = K.concatenate([D1, D2, D3, D4], axis=-1)
output = K.batch_dot(D, A, axes=[1, 1])
return [output[:, :self.output_dim], output[:, self.output_dim:2*self.output_dim],
output[:, 2*self.output_dim:3*self.output_dim], output[:, 3*self.output_dim:]]
def compute_output_shape(self, input_shape):
assert isinstance(input_shape, list)
shape_D1, shape_D2, shape_D3, shape_D4, shape_A = input_shape
return [(shape_A[0], self.output_dim), (shape_A[0], self.output_dim),
(shape_A[0], self.output_dim), (shape_A[0], self.output_dim)]
# define huber loss
def huber_loss(y_true, y_pred):
return tf.losses.huber_loss(y_true, y_pred)
class BS:
"""
Define the BS DNN class
"""
def __init__(self, num_d2d, input_node_info, input_edge_info, num_d2d_feedback, num_d2d_neighbor, num_ch):
self.num_D2D = num_d2d
self.num_Neighbor = num_d2d_neighbor # The number of D2D pairs o_k contains (always 1 in our case)
self.num_CH = num_ch # total channels
self.num_Feedback = num_d2d_feedback # each D2D GNN extracted feature size
self.input_node_Info = input_node_info # length of the observation of a D2D pair (node)
self.input_edge_Info = input_edge_info # length of the weight of a link (edge)
self.num_One_Node_Input = ((input_node_info - 1)*self.num_CH + 1)*self.num_Neighbor # each node observation size
self.num_One_Edge_Input = input_edge_info*self.num_CH # each edge observation size
self.num_One_D2D_Input = self.num_One_Node_Input+self.num_One_Edge_Input # Each D2D compression inputs
self.num_D2D_Input = num_d2d*self.num_One_D2D_Input+self.num_D2D**2 # Total D2D compression inputs
self.model = self._create_model()
self.target_model = self._create_model()
def _create_model(self):
# construct D2D GNN and DNN together
Num_Node_Input = self.num_One_Node_Input
Num_Edge_Input = self.num_One_Edge_Input
Num_D2D_Output = self.num_Feedback
Num_D2D = self.num_D2D
# implement GNN for 1st D2D
D1_Node_Input = Input(shape=(Num_Node_Input,), name='D1_Node_Input')
D1_Edge_Input = Input(shape=(Num_Edge_Input,), name='D1_Edge_Input')
D1_Neighbor_Input = Input(shape=(Num_D2D_Output,), name='D1_Neighbor_Input')
# Share Weights for different GNN layers of each D2D
GNN1 = GNNLayer(Num_D2D_Output, activation='relu', name='D1_GNN')
# implement GNN for 2nd D2D
D2_Node_Input = Input(shape=(Num_Node_Input,), name='D2_Node_Input')
D2_Edge_Input = Input(shape=(Num_Edge_Input,), name='D2_Edge_Input')
D2_Neighbor_Input = Input(shape=(Num_D2D_Output,), name='D2_Neighbor_Input')
# Share Weights for different GNN layers of each D2D
GNN2 = GNNLayer(Num_D2D_Output, activation='relu', name='D2_GNN')
# implement GNN for 3rd D2D
D3_Node_Input = Input(shape=(Num_Node_Input,), name='D3_Node_Input')
D3_Edge_Input = Input(shape=(Num_Edge_Input,), name='D3_Edge_Input')
D3_Neighbor_Input = Input(shape=(Num_D2D_Output,), name='D3_Neighbor_Input')
# Share Weights for different GNN layers of each D2D
GNN3 = GNNLayer(Num_D2D_Output, activation='relu', name='D3_GNN')
# implement GNN for 4th D2D
D4_Node_Input = Input(shape=(Num_Node_Input,), name='D4_Node_Input')
D4_Edge_Input = Input(shape=(Num_Edge_Input,), name='D4_Edge_Input')
D4_Neighbor_Input = Input(shape=(Num_D2D_Output,), name='D4_Neighbor_Input')
# Share Weights for different GNN layers of each D2D
GNN4 = GNNLayer(Num_D2D_Output, activation='relu', name='D4_GNN')
A = Input(shape=(Num_D2D*Num_D2D_Output, Num_D2D*Num_D2D_Output), name='Adjacency_Matrix')
Aggregate = AggLayer(Num_D2D_Output, name='Aggregate')
D1 = GNN1([D1_Node_Input, D1_Edge_Input, D1_Neighbor_Input])
D2 = GNN2([D2_Node_Input, D2_Edge_Input, D2_Neighbor_Input])
D3 = GNN3([D3_Node_Input, D3_Edge_Input, D3_Neighbor_Input])
D4 = GNN4([D4_Node_Input, D4_Edge_Input, D4_Neighbor_Input])
[D1_Agg, D2_Agg, D3_Agg, D4_Agg] = Aggregate([D1, D2, D3, D4, A])
D1 = GNNLayer(Num_D2D_Output, activation='relu')([keras.layers.concatenate([D1, D1_Node_Input]), D1_Edge_Input, D1_Agg])
D2 = GNNLayer(Num_D2D_Output, activation='relu')([keras.layers.concatenate([D2, D2_Node_Input]), D2_Edge_Input, D2_Agg])
D3 = GNNLayer(Num_D2D_Output, activation='relu')([keras.layers.concatenate([D3, D3_Node_Input]), D3_Edge_Input, D3_Agg])
D4 = GNNLayer(Num_D2D_Output, activation='relu')([keras.layers.concatenate([D4, D4_Node_Input]), D4_Edge_Input, D4_Agg])
[D1_Agg, D2_Agg, D3_Agg, D4_Agg] = Aggregate([D1, D2, D3, D4, A])
D1 = GNNLayer(Num_D2D_Output)([keras.layers.concatenate([D1, D1_Node_Input]), D1_Edge_Input, D1_Agg])
D2 = GNNLayer(Num_D2D_Output)([keras.layers.concatenate([D2, D2_Node_Input]), D2_Edge_Input, D2_Agg])
D3 = GNNLayer(Num_D2D_Output)([keras.layers.concatenate([D3, D3_Node_Input]), D3_Edge_Input, D3_Agg])
D4 = GNNLayer(Num_D2D_Output)([keras.layers.concatenate([D4, D4_Node_Input]), D4_Edge_Input, D4_Agg])
[D1_Agg, D2_Agg, D3_Agg, D4_Agg] = Aggregate([D1, D2, D3, D4, A])
D1_output = keras.layers.concatenate([D1, D1_Agg])
D2_output = keras.layers.concatenate([D2, D2_Agg])
D3_output = keras.layers.concatenate([D3, D3_Agg])
D4_output = keras.layers.concatenate([D4, D4_Agg])
# implement Decision DNN for 1st D2D
Num_D2D_Decide_Output = self.num_CH
D1_Decide_Input = keras.layers.concatenate([D1_Node_Input, D1_output], name='D1_Decide_Input')
D1_Decide = Dense(80, activation='relu')(D1_Decide_Input)
D1_Decide = Dense(40, activation='relu')(D1_Decide)
D1_Decide = Dense(20, activation='relu')(D1_Decide)
D1_Decide_Output = Dense(Num_D2D_Decide_Output, activation='linear', name='D1_Decide_Output')(D1_Decide)
# implement Decision DNN for 2nd D2D
D2_Decide_Input = keras.layers.concatenate([D2_Node_Input, D2_output], name='D2_Decide_Input')
D2_Decide = Dense(80, activation='relu')(D2_Decide_Input)
D2_Decide = Dense(40, activation='relu')(D2_Decide)
D2_Decide = Dense(20, activation='relu')(D2_Decide)
D2_Decide_Output = Dense(Num_D2D_Decide_Output, activation='linear', name='D2_Decide_Output')(D2_Decide)
# implement Decision DNN for 3rd D2D
D3_Decide_Input = keras.layers.concatenate([D3_Node_Input, D3_output], name='D3_Decide_Input')
D3_Decide = Dense(80, activation='relu')(D3_Decide_Input)
D3_Decide = Dense(40, activation='relu')(D3_Decide)
D3_Decide = Dense(20, activation='relu')(D3_Decide)
D3_Decide_Output = Dense(Num_D2D_Decide_Output, activation='linear', name='D3_Decide_Output')(D3_Decide)
# implement Decision DNN for 4th D2D
D4_Decide_Input = keras.layers.concatenate([D4_Node_Input, D4_output], name='D4_Decide_Input')
D4_Decide = Dense(80, activation='relu')(D4_Decide_Input)
D4_Decide = Dense(40, activation='relu')(D4_Decide)
D4_Decide = Dense(20, activation='relu')(D4_Decide)
D4_Decide_Output = Dense(Num_D2D_Decide_Output, activation='linear', name='D4_Decide_Output')(D4_Decide)
# Define the model
model = Model(inputs=[D1_Node_Input, D1_Edge_Input, D1_Neighbor_Input,
D2_Node_Input, D2_Edge_Input, D2_Neighbor_Input,
D3_Node_Input, D3_Edge_Input, D3_Neighbor_Input,
D4_Node_Input, D4_Edge_Input, D4_Neighbor_Input,
A],
outputs=[D1_Decide_Output, D2_Decide_Output, D3_Decide_Output, D4_Decide_Output])
# the default value of learning rate lr=0.001, change it if necessary
# rms = keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
adam = keras.optimizers.Adam(lr=0.001, beta_1=0.5, beta_2=0.999)
# Compile the model
model.compile(optimizer=adam, loss=huber_loss)
return model
def train_dnn(self, data_train, labels, batch_size):
# training method for DNNs
epochs = 1
Train_Result = self.model.fit(data_train, labels, batch_size=batch_size, epochs=epochs, verbose=0)
return Train_Result
def predict(self, data_test, target=False):
# predict the value
# target: True -> choose the target network; otherwise, choose the Q-function network
if target:
return self.target_model.predict(data_test)
else:
return self.model.predict(data_test)
def predict_one_step(self, data_test, target=False):
# one step predict
return self.predict(data_test, target=target)
def update_target_model(self):
# use current model weights to update target network
self.target_model.set_weights(self.model.get_weights())
# -------------------- MEMORY --------------------------
# define the memory class to replay
class Memory: # stored as ( s, a, r, s_ )
samples = []
def __init__(self, capacity):
self.capacity = capacity
# add sample
def add(self, sample):
self.samples.append(sample)
if len(self.samples) > self.capacity:
self.samples.pop(0)
def sample(self, n):
# choose the n samples from the memory
if len(self.samples) >= n:
Samples_Indices = np.random.choice(len(self.samples), n, replace=False)
Batch_Samples = np.array(self.samples)[Samples_Indices]
return Batch_Samples
else:
# repeated sample the current samples until we get n samples
Batch_Samples = []
while len(Batch_Samples) < n:
index = np.random.randint(0, len(self.samples))
Batch_Samples.append(self.samples[index])
return Batch_Samples
# ------------------------AGENT--------------------
MEMORY_CAPACITY = 1000000
UPDATE_TARGET_FREQUENCY = 500
MAX_EPSILON = 1
MIN_EPSILON = 0.01 # epsilon-greedy: the exploration rate epsilon decreases from MAX_EPSILON to MIN_EPSILON
class Agent:
"""
Define the BS Agent class
"""
def __init__(self, num_d2d, num_ch, num_neighbor, num_d2d_feedback, environment, curr_rl_config):
self.epsilon = MAX_EPSILON
self.num_step = 0
self.num_CH = num_ch
self.num_D2D = num_d2d
self.num_Neighbor = num_neighbor
self.num_Feedback = num_d2d_feedback
self.memory = Memory(MEMORY_CAPACITY)
# D2D inputs: V2V channel gain on all channels, Interference on all channels (other V2V and V2I),
# V2I channel gain on all channels, transmit power
self.input_Node_Info = 3
self.input_Edge_Info = 1
self.env = environment
self.brain = BS(self.num_D2D, self.input_Node_Info, self.input_Edge_Info, self.num_Feedback,
self.num_Neighbor, self.num_CH)
self.num_States = self.brain.num_D2D_Input # total D2D compression input
self.num_Actions = self.num_CH*self.num_Neighbor
self.action_all_with_power = np.zeros([self.num_D2D, self.num_Neighbor, self.num_CH], dtype='int32')
self.action_all_with_power_training = np.zeros([self.num_D2D, self.num_Neighbor, self.num_CH], dtype='int32')
self.batch_size = curr_rl_config.Batch_Size
self.gamma = curr_rl_config.Gamma
self.v2v_weight = curr_rl_config.v2v_weight
self.v2i_weight = curr_rl_config.v2i_weight
def select_action_while_training(self, state):
# according to current state, choose the proper action
num_D2D = self.num_D2D
num_neighbor = self.num_Neighbor
Action_Matrix = 100*np.ones((num_D2D, num_neighbor))
CH_Set = range(0, self.num_CH)
# anneal Epsilon linearly from MAX_EPSILON to MIN_EPSILON
Epsilon_decrease_percentage = 0.8
Epsilon_decrease_Episode = self.num_Episodes * Epsilon_decrease_percentage
Epsilon_decrease_Steps = Epsilon_decrease_Episode * self.num_Train_Step * self.num_transition
# num_transition: make several transitions before each training
Epsilon_decrease_per_Step = (MAX_EPSILON - MIN_EPSILON) / Epsilon_decrease_Steps
if self.num_step < Epsilon_decrease_Steps:
self.epsilon = MAX_EPSILON - Epsilon_decrease_per_Step*self.num_step
else:
self.epsilon = MIN_EPSILON
# track the training process
if self.num_step % 50000 == 0:
print('Current Epsilon while Training is ', self.epsilon, ' Current Training Step is ', self.num_step)
if np.random.random() < self.epsilon:
# generate action for each D2D randomly
for D2D_loop in range(num_D2D):
Action_Matrix[D2D_loop, :] = np.random.choice(CH_Set, num_neighbor)
else:
# choose the action index which maximize the Q Function of each D2D
Q_Pred = self.brain.predict_one_step(state, target=False)
D2D_Action = np.zeros((self.num_D2D, num_neighbor), int)
# get the action for each D2D from each D2D's DQN
for D_loop in range(self.num_D2D):
# use the current Q function to predict the max action
Action_Index = np.where(Q_Pred[D_loop][0] == np.max(Q_Pred[D_loop][0]))
if len(Action_Index) == 1:
D2D_Action[D_loop] = Action_Index[0][0]
else:
# when there are two actions leading to the same reward, we just choose one of them
D2D_Action[D_loop] = Action_Index[0]
print('While Training: Current Q Predict is', Q_Pred[D_loop][0], 'at the -', D_loop, '-D2D')
print(' Current Action is ', Action_Index)
Action_Matrix = D2D_Action
# return Action_Matrix
return Action_Matrix.astype(np.int)
def select_action_random(self, state):
# choose the action Randomly
num_D2D = self.num_D2D
num_neighbor = self.num_Neighbor
Action_Matrix = 100*np.ones((num_D2D, num_neighbor))
CH_Set = range(0, self.num_CH)
# generate action for each D2D randomly
for D2D_loop in range(num_D2D):
Action_Matrix[D2D_loop, :] = np.random.choice(CH_Set, num_neighbor)
return Action_Matrix.astype(np.int)
def act(self, actions):
# agent executes the action
# update current time step
self.num_step += 1
# take actions and get reward
[V2V_Rate, V2I_Rate, Interference] = self.env.compute_reward_with_channel_selection(actions)
# update the state
self.env.renew_positions()
self.env.renew_channels_fastfading()
self.env.Compute_Interference(actions)
return V2V_Rate, V2I_Rate, Interference
def dump_act(self, actions):
# take actions but do not update the state
# use for the comparing schemes, such as, random action scheme, ...
# take actions and get reward
[V2V_Rate, V2I_Rate, Interference] = self.env.compute_reward_with_channel_selection(actions)
return V2V_Rate, V2I_Rate, Interference
def train_observe(self, sample):
# Collect Data in (s, a, r, s_) format for training
self.memory.add(sample)
def get_state(self, idx):
# get state from the environment
# Input: idx[0] = target vehicle index, idx[1] = neighbor or receiver index
# to normalize channel gain and interference to a reasonable range
Constant_A = 80
Constant_B = 60
V2V_channel = (self.env.V2V_channels_with_fastfading[idx[0],
self.env.vehicles[idx[0]].destinations[idx[1]], :] - Constant_A)/Constant_B
V2I_channel = (self.env.V2I_channels_with_fastfading[idx[0], :] - Constant_A)/Constant_B
V2V_edge = (((np.sum(self.env.V2V_channels_with_fastfading[:,
self.env.vehicles[idx[0]].destinations[idx[1]], :], axis=0)
- self.env.V2V_channels_with_fastfading[self.env.vehicles[idx[0]].destinations[idx[1]],
self.env.vehicles[idx[0]].destinations[idx[1]], :]
)
- (self.num_D2D-1)*Constant_A)/Constant_B - V2V_channel)/(self.num_D2D-2)
return V2V_channel, V2I_channel, V2V_edge
def generate_d2d_transition(self, num_transitions):
# take action via the Epsilon-Greedy strategy, observe the transition (S, A, R, S_) while training,
# then add this transition to the Buffer
self.train_step = 0
self.random_action = False
Num_One_D2D_Input = self.brain.num_One_D2D_Input
Num_One_Node_Input = self.brain.num_One_Node_Input
Num_One_Edge_Input = self.brain.num_One_Edge_Input
Num_Neighbor_Input = self.brain.num_Feedback
CH_gain_Index = self.num_Neighbor * self.num_CH
CH_edge_Index = self.num_Neighbor * self.num_CH
# record the reward per transitions
Reward_Per_Transition = np.zeros(num_transitions)
# weight for the V2V sum rate
v2v_weight = self.v2v_weight
# weight for the V2I sum rate
v2i_weight = self.v2i_weight
# normalize rate for V2V rate if necessary
V2V_Rate_max = 1
# generate num_transitions of transitions
for self.train_step in range(num_transitions):
# initialize temp variables
if self.train_step == 0:
Train_D2D_CH_State = np.zeros((self.num_D2D, self.num_Neighbor, self.num_CH))
Train_D2D_edge_State = np.zeros((self.num_D2D, self.num_Neighbor, self.num_CH))
Train_D2D_V2I_CH_State = np.zeros((self.num_D2D, self.num_CH))
Fixed_Power = self.env.V2V_power_dB_List[self.env.fixed_v2v_power_index]
Train_D2D_Power_State = Fixed_Power * np.ones((self.num_D2D, self.num_Neighbor))
# Get all D2D channel, interference state and adjacency matrix for training, the real adjacency matrix (no transpose)
Adjacency_Matrix = np.ones((self.num_D2D, self.num_D2D))-np.eye(self.num_D2D)
for D2D_loop in range(self.num_D2D):
for veh_loop in range(self.num_D2D):
if D2D_loop == self.env.vehicles[veh_loop].destinations[0]:
Adjacency_Matrix[D2D_loop, veh_loop] = 0
for D2D_loop in range(self.num_D2D):
for Neighbor_loop in range(self.num_Neighbor):
# Input: index[0] = target vehicle index, index[1] = neighbor index
index = [D2D_loop, Neighbor_loop]
[V2V_channel, V2I_channel, V2V_edge] = self.get_state(index)
Train_D2D_CH_State[D2D_loop, Neighbor_loop, :] = V2V_channel
Train_D2D_edge_State[D2D_loop, Neighbor_loop, :] = V2V_edge
Train_D2D_V2I_CH_State[D2D_loop, :] = V2I_channel
# reshape the training data in (S, A, R, S_)
# reshape the States for all D2D
D2D_State = np.zeros((self.num_D2D, Num_One_D2D_Input))
for D2D_loop in range(self.num_D2D):
Current_CH_gain = np.reshape(Train_D2D_CH_State[D2D_loop, :, :], [1, CH_gain_Index])
D2D_State[D2D_loop, 0:CH_gain_Index] = Current_CH_gain
Current_V2I_gain = Train_D2D_V2I_CH_State[D2D_loop, :]
D2D_State[D2D_loop, CH_gain_Index:2*CH_gain_Index] = Current_V2I_gain
D2D_State[D2D_loop, 2*CH_gain_Index:2*CH_gain_Index+self.num_Neighbor] = Train_D2D_Power_State[D2D_loop, :]
Current_edge_gain = np.reshape(Train_D2D_edge_State[D2D_loop, :, :], [1, CH_edge_Index])
D2D_State[D2D_loop, 2*CH_gain_Index+self.num_Neighbor:] = Current_edge_gain
States = np.concatenate((np.reshape(D2D_State, [1, -1]), np.reshape(Adjacency_Matrix, [1, -1])), axis=-1)
# two different action selection strategies: Random Selection and Epsilon-Greedy Strategy
if self.random_action:
# Choose the actions randomly
Train_D2D_Action_Matrix = self.select_action_random(States)
else:
D1_State_Node = np.reshape(D2D_State[0, :Num_One_Node_Input], [1, Num_One_Node_Input])
D1_State_Edge = np.reshape(D2D_State[0, Num_One_Node_Input:], [1, Num_One_Edge_Input])
D1_State_Neighbor = np.zeros((1, Num_Neighbor_Input))
D2_State_Node = np.reshape(D2D_State[1, :Num_One_Node_Input], [1, Num_One_Node_Input])
D2_State_Edge = np.reshape(D2D_State[1, Num_One_Node_Input:], [1, Num_One_Edge_Input])
D2_State_Neighbor = np.zeros((1, Num_Neighbor_Input))
D3_State_Node = np.reshape(D2D_State[2, :Num_One_Node_Input], [1, Num_One_Node_Input])
D3_State_Edge = np.reshape(D2D_State[2, Num_One_Node_Input:], [1, Num_One_Edge_Input])
D3_State_Neighbor = np.zeros((1, Num_Neighbor_Input))
D4_State_Node = np.reshape(D2D_State[3, :Num_One_Node_Input], [1, Num_One_Node_Input])
D4_State_Edge = np.reshape(D2D_State[3, Num_One_Node_Input:], [1, Num_One_Edge_Input])
D4_State_Neighbor = np.zeros((1, Num_Neighbor_Input))
A_Matrix = np.reshape(np.kron(Adjacency_Matrix, np.eye(Num_Neighbor_Input)),
[1, self.num_D2D*Num_Neighbor_Input, self.num_D2D*Num_Neighbor_Input])
States_train = {'D1_Node_Input': D1_State_Node, 'D1_Edge_Input': D1_State_Edge,
'D1_Neighbor_Input': D1_State_Neighbor,
'D2_Node_Input': D2_State_Node, 'D2_Edge_Input': D2_State_Edge,
'D2_Neighbor_Input': D2_State_Neighbor,
'D3_Node_Input': D3_State_Node, 'D3_Edge_Input': D3_State_Edge,
'D3_Neighbor_Input': D3_State_Neighbor,
'D4_Node_Input': D4_State_Node, 'D4_Edge_Input': D4_State_Edge,
'D4_Neighbor_Input': D4_State_Neighbor,
'Adjacency_Matrix': A_Matrix
}
# choose action via Epsilon-Greedy strategy
Train_D2D_Action_Matrix = self.select_action_while_training(States_train)
Actions = np.reshape(Train_D2D_Action_Matrix, [1, -1])
# Take action and Get Reward
V2V_Rate, V2I_Rate, Interference = self.act(Train_D2D_Action_Matrix)
Train_D2D_Reward = np.sum(V2V_Rate, axis=1)
Train_BS_Reward = np.sum(Train_D2D_Reward)
Sum_V2I_Rate = np.sum(V2I_Rate)
Norm_BS_Reward = Train_BS_Reward/V2V_Rate_max
Reward = v2v_weight*Norm_BS_Reward + v2i_weight*Sum_V2I_Rate
# record the current reward
Reward_Per_Transition[self.train_step] = Reward
# Get NEXT state: all D2D channel and interference state for training
Next_Train_D2D_CH_State = np.zeros((self.num_D2D, self.num_Neighbor, self.num_CH))
Next_Train_D2D_edge_State = np.zeros((self.num_D2D, self.num_Neighbor, self.num_CH))
Next_Train_D2D_V2I_CH_State = np.zeros((self.num_D2D, self.num_CH))
for D2D_loop in range(self.num_D2D):
for Neighbor_loop in range(self.num_Neighbor):
# Input: indx[0] = target vehicle index, indx[1] = neighbor index
index = [D2D_loop, Neighbor_loop]
[V2V_channel, V2I_channel, V2V_edge] = self.get_state(index)
Next_Train_D2D_CH_State[D2D_loop, Neighbor_loop, :] = V2V_channel
Next_Train_D2D_edge_State[D2D_loop, Neighbor_loop, :] = V2V_edge
Next_Train_D2D_V2I_CH_State[D2D_loop, :] = V2I_channel
D2D_Next_State = np.zeros((self.num_D2D, Num_One_D2D_Input))
for D2D_loop in range(self.num_D2D):
Current_CH_gain = np.reshape(Next_Train_D2D_CH_State[D2D_loop, :, :], [1, CH_gain_Index])
D2D_Next_State[D2D_loop, 0:CH_gain_Index] = Current_CH_gain
Current_V2I_gain = Next_Train_D2D_V2I_CH_State[D2D_loop, :]
D2D_Next_State[D2D_loop, CH_gain_Index: 2*CH_gain_Index] = Current_V2I_gain
D2D_Next_State[D2D_loop, 2*CH_gain_Index: 2*CH_gain_Index+self.num_Neighbor] = Train_D2D_Power_State[D2D_loop, :]
Current_edge_gain = np.reshape(Next_Train_D2D_edge_State[D2D_loop, :, :], [1, CH_edge_Index])
D2D_Next_State[D2D_loop, 2*CH_gain_Index+self.num_Neighbor:] = Current_edge_gain
States_ = np.concatenate((np.reshape(D2D_Next_State, [1, -1]), np.reshape(Adjacency_Matrix, [1, -1])), axis=-1)
# sample in (s, a, r, s_) format for whole GNN-RL model
sample = [States, Actions, Reward, States_]
# add the sample (or transition) to the Buffer
self.train_observe(sample)
return Reward_Per_Transition
def replay(self):
# define the replay to generate training samples from Memory
Num_RL_Actions = self.num_Actions
Num_D2D = self.num_D2D
Num_One_D2D_Input = self.brain.num_One_D2D_Input
Num_One_Node_Input = self.brain.num_One_Node_Input
Num_One_Edge_Input = self.brain.num_One_Edge_Input
Num_Neighbor_Input = self.brain.num_Feedback
BATCH_SIZE = self.batch_size
GAMMA = self.gamma
# read samples from memory
batch = self.memory.sample(BATCH_SIZE)
batchLen = len(batch)
test_states = np.zeros((batchLen, Num_D2D, Num_One_D2D_Input))
test_adjacency_matrix = np.zeros((batchLen, Num_D2D, Num_D2D))
test_states_ = np.zeros((batchLen, Num_D2D, Num_One_D2D_Input))
test_adjacency_matrix_ = np.zeros((batchLen, Num_D2D, Num_D2D))
for Sample_loop in range(batchLen):
test_states[Sample_loop, :, :] = np.reshape(batch[Sample_loop][0][0, :Num_D2D*Num_One_D2D_Input],
[Num_D2D, Num_One_D2D_Input])
test_adjacency_matrix[Sample_loop, :, :] = np.reshape(batch[Sample_loop][0][0, Num_D2D*Num_One_D2D_Input:],
[Num_D2D, Num_D2D])
if batch[Sample_loop][3] is None:
test_states_[Sample_loop, :, :] = np.zeros((Num_D2D, Num_One_D2D_Input))
else:
test_states_[Sample_loop, :, :] = np.reshape(batch[Sample_loop][3][0, :Num_D2D*Num_One_D2D_Input],
[Num_D2D, Num_One_D2D_Input])
test_adjacency_matrix_ = test_adjacency_matrix
Num_Batch = batchLen
# initialize states
D1_State_Node = np.zeros([Num_Batch, Num_One_Node_Input])
D1_State_Edge = np.zeros([Num_Batch, Num_One_Edge_Input])
D1_State_Neighbor = np.zeros([Num_Batch, Num_Neighbor_Input])
D2_State_Node = np.zeros([Num_Batch, Num_One_Node_Input])
D2_State_Edge = np.zeros([Num_Batch, Num_One_Edge_Input])
D2_State_Neighbor = np.zeros([Num_Batch, Num_Neighbor_Input])
D3_State_Node = np.zeros([Num_Batch, Num_One_Node_Input])
D3_State_Edge = np.zeros([Num_Batch, Num_One_Edge_Input])
D3_State_Neighbor = np.zeros([Num_Batch, Num_Neighbor_Input])
D4_State_Node = np.zeros([Num_Batch, Num_One_Node_Input])
D4_State_Edge = np.zeros([Num_Batch, Num_One_Edge_Input])
D4_State_Neighbor = np.zeros([Num_Batch, Num_Neighbor_Input])
Adjacency_Matrix = np.kron(test_adjacency_matrix, np.eye(Num_Neighbor_Input))
# initialize next states
D1_State_Node_ = np.zeros([Num_Batch, Num_One_Node_Input])
D1_State_Edge_ = np.zeros([Num_Batch, Num_One_Edge_Input])
D1_State_Neighbor_ = np.zeros([Num_Batch, Num_Neighbor_Input])
D2_State_Node_ = np.zeros([Num_Batch, Num_One_Node_Input])
D2_State_Edge_ = np.zeros([Num_Batch, Num_One_Edge_Input])
D2_State_Neighbor_ = np.zeros([Num_Batch, Num_Neighbor_Input])
D3_State_Node_ = np.zeros([Num_Batch, Num_One_Node_Input])
D3_State_Edge_ = np.zeros([Num_Batch, Num_One_Edge_Input])
D3_State_Neighbor_ = np.zeros([Num_Batch, Num_Neighbor_Input])
D4_State_Node_ = np.zeros([Num_Batch, Num_One_Node_Input])
D4_State_Edge_ = np.zeros([Num_Batch, Num_One_Edge_Input])
D4_State_Neighbor_ = np.zeros([Num_Batch, Num_Neighbor_Input])
Adjacency_Matrix_ = np.kron(test_adjacency_matrix_, np.eye(Num_Neighbor_Input))
for Sample_loop in range(batchLen):
D1_State_Node[Sample_loop, :] = test_states[Sample_loop, 0, :Num_One_Node_Input]
D1_State_Edge[Sample_loop, :] = test_states[Sample_loop, 0, Num_One_Node_Input:]
D2_State_Node[Sample_loop, :] = test_states[Sample_loop, 1, :Num_One_Node_Input]
D2_State_Edge[Sample_loop, :] = test_states[Sample_loop, 1, Num_One_Node_Input:]
D3_State_Node[Sample_loop, :] = test_states[Sample_loop, 2, :Num_One_Node_Input]
D3_State_Edge[Sample_loop, :] = test_states[Sample_loop, 2, Num_One_Node_Input:]
D4_State_Node[Sample_loop, :] = test_states[Sample_loop, 3, :Num_One_Node_Input]
D4_State_Edge[Sample_loop, :] = test_states[Sample_loop, 3, Num_One_Node_Input:]
D1_State_Node_[Sample_loop, :] = test_states_[Sample_loop, 0, :Num_One_Node_Input]
D1_State_Edge_[Sample_loop, :] = test_states_[Sample_loop, 0, Num_One_Node_Input:]
D2_State_Node_[Sample_loop, :] = test_states_[Sample_loop, 1, :Num_One_Node_Input]
D2_State_Edge_[Sample_loop, :] = test_states_[Sample_loop, 1, Num_One_Node_Input:]
D3_State_Node_[Sample_loop, :] = test_states_[Sample_loop, 2, :Num_One_Node_Input]
D3_State_Edge_[Sample_loop, :] = test_states_[Sample_loop, 2, Num_One_Node_Input:]
D4_State_Node_[Sample_loop, :] = test_states_[Sample_loop, 3, :Num_One_Node_Input]
D4_State_Edge_[Sample_loop, :] = test_states_[Sample_loop, 3, Num_One_Node_Input:]
states = {'D1_Node_Input': D1_State_Node, 'D1_Edge_Input': D1_State_Edge,
'D1_Neighbor_Input': D1_State_Neighbor,
'D2_Node_Input': D2_State_Node, 'D2_Edge_Input': D2_State_Edge,
'D2_Neighbor_Input': D2_State_Neighbor,
'D3_Node_Input': D3_State_Node, 'D3_Edge_Input': D3_State_Edge,
'D3_Neighbor_Input': D3_State_Neighbor,
'D4_Node_Input': D4_State_Node, 'D4_Edge_Input': D4_State_Edge,
'D4_Neighbor_Input': D4_State_Neighbor,
'Adjacency_Matrix': Adjacency_Matrix
}
states_ = {'D1_Node_Input': D1_State_Node_, 'D1_Edge_Input': D1_State_Edge_,
'D1_Neighbor_Input': D1_State_Neighbor_,
'D2_Node_Input': D2_State_Node_, 'D2_Edge_Input': D2_State_Edge_,
'D2_Neighbor_Input': D2_State_Neighbor_,
'D3_Node_Input': D3_State_Node_, 'D3_Edge_Input': D3_State_Edge_,
'D3_Neighbor_Input': D3_State_Neighbor_,
'D4_Node_Input': D4_State_Node_, 'D4_Edge_Input': D4_State_Edge_,
'D4_Neighbor_Input': D4_State_Neighbor_,
'Adjacency_Matrix': Adjacency_Matrix_
}
p = self.brain.predict(states) # Q-function network
p_ = self.brain.predict(states_, target=True) # target network
# initialize the target value of Q-function -
y = np.zeros((Num_D2D, batchLen, Num_RL_Actions))
for batch_Loop in range(batchLen):
# fetch current sample(observation) from Replay Buffer
# observation = {S, A, R, S_}
o = batch[batch_Loop]
s = o[0] # get current state
a = o[1] # get current action
r = o[2] # get current reward
s_ = o[3] # get the NEXT state
# here each D2D has a Q function, and train these Q functions together via using the global reward
for D_loop in range(Num_D2D):
# get current action index
a_RL = a[0][D_loop]
# get the prediction of current D2D's Q function
t = p[D_loop][batch_Loop]
if s_ is None:
t[a_RL] = r
else:
# use target network to evaluate Q(s,a) value
# use the current D2D's target networks
t[a_RL] = r + GAMMA * np.amax(p_[D_loop][batch_Loop])
y[D_loop][batch_Loop] = t
D1_Data_Node_Train = D1_State_Node
D1_Data_Edge_Train = D1_State_Edge
D1_Data_Neighbor_Train = D1_State_Neighbor
D2_Data_Node_Train = D2_State_Node
D2_Data_Edge_Train = D2_State_Edge
D2_Data_Neighbor_Train = D2_State_Neighbor
D3_Data_Node_Train = D3_State_Node
D3_Data_Edge_Train = D3_State_Edge
D3_Data_Neighbor_Train = D3_State_Neighbor
D4_Data_Node_Train = D4_State_Node
D4_Data_Edge_Train = D4_State_Edge
D4_Data_Neighbor_Train = D4_State_Neighbor
Data_Adjacency_Matrix_Train = Adjacency_Matrix
# use the current samples to train RL DNN
x = {'D1_Node_Input': D1_Data_Node_Train, 'D1_Edge_Input': D1_Data_Edge_Train,
'D1_Neighbor_Input': D1_Data_Neighbor_Train,
'D2_Node_Input': D2_Data_Node_Train, 'D2_Edge_Input': D2_Data_Edge_Train,
'D2_Neighbor_Input': D2_Data_Neighbor_Train,
'D3_Node_Input': D3_Data_Node_Train, 'D3_Edge_Input': D3_Data_Edge_Train,
'D3_Neighbor_Input': D3_Data_Neighbor_Train,
'D4_Node_Input': D4_Data_Node_Train, 'D4_Edge_Input': D4_Data_Edge_Train,
'D4_Neighbor_Input': D4_Data_Neighbor_Train,
'Adjacency_Matrix': Data_Adjacency_Matrix_Train
}
y_train = {'D1_Decide_Output': y[0], 'D2_Decide_Output': y[1],
'D3_Decide_Output': y[2], 'D4_Decide_Output': y[3]}
# train the model
Train_Result = self.brain.train_dnn(x, y_train, BATCH_SIZE)
# initialize Q mean and Q max of Target value
Q_mean = np.zeros(Num_D2D)
Q_max_mean = np.zeros(Num_D2D)
# for original Q function value
Orig_Q_mean = np.zeros(Num_D2D)
Orig_Q_max_mean = np.zeros(Num_D2D)
for D_loop in range(Num_D2D):
# calculate the target Q value
Q_batch = np.sum(y[D_loop], axis=1) / Num_RL_Actions
Q_mean[D_loop] = np.sum(Q_batch) / batchLen
Q_max_batch = np.max(y[D_loop], axis=1)
Q_max_mean[D_loop] = np.sum(Q_max_batch) / batchLen
# record the original Q function value
Orig_Q_batch = np.sum(p[D_loop], axis=1) / Num_RL_Actions
Orig_Q_mean[D_loop] = np.sum(Orig_Q_batch) / batchLen
Orig_Q_max_batch = np.max(p[D_loop], axis=1)
Orig_Q_max_mean[D_loop] = np.sum(Orig_Q_max_batch) / batchLen
return Train_Result, Q_mean, Q_max_mean, Orig_Q_mean, Orig_Q_max_mean
def train(self, num_episodes, num_train_steps):
# to train model
self.num_Episodes = num_episodes
self.num_Train_Step = num_train_steps
BATCH_SIZE = self.batch_size
GAMMA = self.gamma
Num_D2D = self.num_D2D
# make several transitions before each training
self.num_transition = 50
# record the training loss
Train_Loss = np.ones((Num_D2D, num_episodes, num_train_steps))
# record the change of Target Q function -- Y
Train_Q_mean = np.zeros((Num_D2D, num_episodes, num_train_steps))
Train_Q_max_mean = np.zeros((Num_D2D, num_episodes, num_train_steps))
# record the original Q value
Orig_Train_Q_mean = np.zeros((Num_D2D, num_episodes, num_train_steps))
Orig_Train_Q_max_mean = np.zeros((Num_D2D, num_episodes, num_train_steps))
self.num_step = 0
# record the reward per episode
Reward_Per_Episode = np.zeros(num_episodes)
Reward_Per_Train_Step = np.zeros((num_episodes, num_train_steps, self.num_transition))
# track the simulation settings
current_datetime = datetime.datetime.now()
print(current_datetime.strftime('%Y/%m/%d %H:%M:%S'))
print("Training Parameters Settings in the Train Function are: ")
print('Number of feedback: ', self.num_Feedback)
print('Discount Factor Gamma: ', GAMMA)
print('Batch Size: ', BATCH_SIZE)
print('Training Episodes: ', self.num_Episodes)
print('Train Steps per Episode: ', self.num_Train_Step)
print('V2V Rate weight: ', self.v2v_weight)
print('V2I Rate weight: ', self.v2i_weight)
V2I_Weight = self.v2i_weight
# tracking the simulation
Train_Episode_Interval = 200
Train_Step_Interval = 10
# please set Save_Model_Interval and Evaluation_Episode_Interval to 5 if you want to run RL_Evaluated_main_Epsilon_DiffTrails.py
Save_Model_Interval = 5 # if you only want to save the final model, you can set Save_Model_Interval = num_episodes
# save results in their corresponding simulation parameter settings
curr_sim_set = 'Train-Result' + '-RealFB-' + str(self.num_Feedback) + '-Batch-' + str(BATCH_SIZE) \
+ '-Gamma-' + str(GAMMA) \
+ '-V2Iweight-' + str(V2I_Weight)
folder = os.getcwd() + '\\' + curr_sim_set + '\\'
if not os.path.exists(folder):
os.makedirs(folder)
print('Create the new folder while training to save results : ')
print(' --->:', folder)
curr_Result_Dir = folder
# main loop
for Episode_loop in range(self.num_Episodes):
# start a new game for each episode
self.env.new_random_game(self.num_D2D)
# tracking the training process
if (Episode_loop + 1) % Train_Episode_Interval == 0:
Current_DateTime = datetime.datetime.now()
print(Current_DateTime.strftime('%Y/%m/%d %H:%M:%S'))
print('Current Training Episode: ', Episode_loop + 1, ' / Total Training Episodes:', self.num_Episodes)
for Iteration_loop in range(self.num_Train_Step):
# Tracking the simulation
if (Episode_loop + 1) % Train_Episode_Interval == 0 and (Iteration_loop + 1) % Train_Step_Interval == 0:
Current_DateTime = datetime.datetime.now()
print(Current_DateTime.strftime('%Y/%m/%d %H:%M:%S'))
print('Current Training Step: ', Iteration_loop + 1, ' / Total Training Steps:', self.num_Train_Step)
# make several transitions then begin training
Reward_Per_Transition = self.generate_d2d_transition(self.num_transition)
# record the reward per train step
Reward_Per_Train_Step[Episode_loop, Iteration_loop, :] = Reward_Per_Transition
# train the model
[Train_Result, Q_mean, Q_max_mean, Orig_Q_mean, Orig_Q_max_mean] = self.replay()
# record the train loss, Q mean and Q max mean for each D2D
for D_loop in range(Num_D2D):
Loss_Index = 'D' + str(D_loop + 1) + '_Decide_Output_loss'
Train_Loss[D_loop, Episode_loop, Iteration_loop] = Train_Result.history[Loss_Index][0]
# record the target Q value
Train_Q_mean[D_loop, Episode_loop, Iteration_loop] = Q_mean[D_loop]
Train_Q_max_mean[D_loop, Episode_loop, Iteration_loop] = Q_max_mean[D_loop]
# record the original Q Value
Orig_Train_Q_mean[D_loop, Episode_loop, Iteration_loop] = Orig_Q_mean[D_loop]
Orig_Train_Q_max_mean[D_loop, Episode_loop, Iteration_loop] = Orig_Q_max_mean[D_loop]
# update target network
if self.num_step % UPDATE_TARGET_FREQUENCY == 0:
self.brain.update_target_model()
# compute the total reward for each episode
Reward_Per_Episode[Episode_loop] = np.sum(Reward_Per_Train_Step[Episode_loop, :, :])
# Save the model's weights of Q-Function Network and Target Network
if (Episode_loop + 1) % Save_Model_Interval == 0:
# record the current episode index
Curr_Train_Episode = Episode_loop + 1
model_dir = curr_Result_Dir
model_name = 'Q-Network_model_weights' + '-Episode-' + str(Curr_Train_Episode) \
+ '-Step-' + str(num_train_steps) + '-Batch-' + str(BATCH_SIZE) + '.h5'
model_para = model_dir + model_name
# save the weights of Q-Function Network
self.brain.model.save_weights(model_para)
print('Save Q-Function Network model weights after Training at Episode :', Curr_Train_Episode)
# save the Target Network's weights in case we need it
target_model_name = 'Target-Network_model_weights' + '-Episode-' + str(Curr_Train_Episode) \
+ '-Step-' + str(num_train_steps) + '-Batch-' + str(BATCH_SIZE) + '.h5'
target_model_para = model_dir + target_model_name
self.brain.target_model.save_weights(target_model_para)
print('Save Target Network model weights after Training at Episode :', Curr_Train_Episode)
# compute current train loss and Q values for each D2D
Curr_Train_Loss_per_Episode = np.zeros((Num_D2D, num_episodes))
Curr_Train_Q_mean_per_Episode = np.zeros((Num_D2D, num_episodes))
Curr_Train_Q_max_mean_per_Episode = np.zeros((Num_D2D, num_episodes))
Curr_Orig_Train_Q_mean_per_Episode = np.zeros((Num_D2D, num_episodes))
Curr_Orig_Train_Q_max_mean_per_Episode = np.zeros((Num_D2D, num_episodes))
for D_loop in range(Num_D2D):
# Train_Loss
Curr_Train_Loss_per_Episode[D_loop, :] = np.sum(Train_Loss[D_loop, :, :],
axis=1) / num_train_steps
# Target Q value
Curr_Train_Q_mean_per_Episode[D_loop, :] = np.sum(Train_Q_mean[D_loop, :, :],
axis=1) / num_train_steps
Curr_Train_Q_max_mean_per_Episode[D_loop, :] = np.sum(Train_Q_max_mean[D_loop, :, :],
axis=1) / num_train_steps
# Original Q value
Curr_Orig_Train_Q_mean_per_Episode[D_loop, :] = np.sum(Orig_Train_Q_mean[D_loop, :, :],
axis=1) / num_train_steps
Curr_Orig_Train_Q_max_mean_per_Episode[D_loop, :] = np.sum(Orig_Train_Q_max_mean[D_loop, :, :],
axis=1) / num_train_steps
Data_Dir = curr_Result_Dir
Data_Name = 'Temp-Training-Result' + '-Episode-' + str(Curr_Train_Episode) \
+ '-Step-' + str(num_train_steps) + '-Batch-' + str(BATCH_SIZE) + '.pkl'
Data_Para = Data_Dir + Data_Name
# open data file
file_to_open = open(Data_Para, 'wb')
# write D2D_Sample to data file
pickle.dump((Curr_Train_Episode,
Curr_Train_Loss_per_Episode, Train_Loss,
Curr_Train_Q_mean_per_Episode, Curr_Train_Q_max_mean_per_Episode,
Curr_Orig_Train_Q_mean_per_Episode, Curr_Orig_Train_Q_max_mean_per_Episode,
Reward_Per_Train_Step, Reward_Per_Episode), file_to_open)
file_to_open.close()
return Train_Loss, Reward_Per_Train_Step, Reward_Per_Episode, \
Train_Q_mean, Train_Q_max_mean, Orig_Train_Q_mean, Orig_Train_Q_max_mean
def generate_d2d_initial_states(self):
# generate initial states for RL to Test
Train_D2D_CH_State = np.zeros((self.num_D2D, self.num_Neighbor, self.num_CH))
Train_D2D_Edge_State = np.zeros((self.num_D2D, self.num_Neighbor, self.num_CH))
Train_D2D_V2I_CH_State = np.zeros((self.num_D2D, self.num_CH))
Fixed_Power = self.env.V2V_power_dB_List[self.env.fixed_v2v_power_index]
Train_D2D_Power_State = Fixed_Power * np.ones((self.num_D2D, self.num_Neighbor))
# Get all D2D channel, interference states and adjacency matrix for Testing, the real adjacency matrix (no transpose)
Adjacency_Matrix = np.ones((self.num_D2D, self.num_D2D)) - np.eye(self.num_D2D)
for D2D_loop in range(self.num_D2D):
for veh_loop in range(self.num_D2D):
if D2D_loop == self.env.vehicles[veh_loop].destinations[0]:
Adjacency_Matrix[D2D_loop, veh_loop] = 0
for D2D_loop in range(self.num_D2D):
for Neighbor_loop in range(self.num_Neighbor):
# Input: indx[0] = target vehicle index, indx[1] = neighbor index
index = [D2D_loop, Neighbor_loop]
[V2V_channel, V2I_channel, V2V_edge] = self.get_state(index)
Train_D2D_CH_State[D2D_loop, Neighbor_loop, :] = V2V_channel
Train_D2D_Edge_State[D2D_loop, Neighbor_loop, :] = V2V_edge
Train_D2D_V2I_CH_State[D2D_loop, :] = V2I_channel
# reshape the States for all D2D
Num_One_D2D_Input = self.brain.num_One_D2D_Input
Num_One_Node_Input = self.brain.num_One_Node_Input
Num_One_Edge_Input = self.brain.num_One_Edge_Input
Num_Neighbor_Input = self.brain.num_Feedback
D2D_State = np.zeros((self.num_D2D, Num_One_D2D_Input))
CH_gain_Index = self.num_Neighbor*self.num_CH
CH_edge_Index = self.num_Neighbor*self.num_CH
for D2D_loop in range(self.num_D2D):
Current_CH_gain = np.reshape(Train_D2D_CH_State[D2D_loop, :, :], [1, CH_gain_Index])
D2D_State[D2D_loop, 0:CH_gain_Index] = Current_CH_gain
Current_V2I_gain = Train_D2D_V2I_CH_State[D2D_loop, :]
D2D_State[D2D_loop, CH_gain_Index: 2 * CH_gain_Index] = Current_V2I_gain
D2D_State[D2D_loop, 2 * CH_gain_Index: 2 * CH_gain_Index + self.num_Neighbor] = Train_D2D_Power_State[D2D_loop, :]
Current_edge_gain = np.reshape(Train_D2D_Edge_State[D2D_loop, :, :], [1, CH_edge_Index])
D2D_State[D2D_loop, 2 * CH_gain_Index + self.num_Neighbor:] = Current_edge_gain
D1_Initial_State_Node = np.reshape(D2D_State[0, :Num_One_Node_Input], [1, Num_One_Node_Input])
D1_Initial_State_Edge = np.reshape(D2D_State[0, Num_One_Node_Input:], [1, Num_One_Edge_Input])
D1_Initial_State_Neighbor = np.zeros((1, Num_Neighbor_Input))
D2_Initial_State_Node = np.reshape(D2D_State[1, :Num_One_Node_Input], [1, Num_One_Node_Input])
D2_Initial_State_Edge = np.reshape(D2D_State[1, Num_One_Node_Input:], [1, Num_One_Edge_Input])
D2_Initial_State_Neighbor = np.zeros((1, Num_Neighbor_Input))
D3_Initial_State_Node = np.reshape(D2D_State[2, :Num_One_Node_Input], [1, Num_One_Node_Input])
D3_Initial_State_Edge = np.reshape(D2D_State[2, Num_One_Node_Input:], [1, Num_One_Edge_Input])
D3_Initial_State_Neighbor = np.zeros((1, Num_Neighbor_Input))
D4_Initial_State_Node = np.reshape(D2D_State[3, :Num_One_Node_Input], [1, Num_One_Node_Input])
D4_Initial_State_Edge = np.reshape(D2D_State[3, Num_One_Node_Input:], [1, Num_One_Edge_Input])
D4_Initial_State_Neighbor = np.zeros((1, Num_Neighbor_Input))
A_Matrix = np.reshape(np.kron(Adjacency_Matrix, np.eye(Num_Neighbor_Input)),
[1, self.num_D2D*Num_Neighbor_Input, self.num_D2D*Num_Neighbor_Input])
Initial_States = {'D1_Node_Input': D1_Initial_State_Node, 'D1_Edge_Input': D1_Initial_State_Edge,
'D1_Neighbor_Input': D1_Initial_State_Neighbor,
'D2_Node_Input': D2_Initial_State_Node, 'D2_Edge_Input': D2_Initial_State_Edge,
'D2_Neighbor_Input': D2_Initial_State_Neighbor,
'D3_Node_Input': D3_Initial_State_Node, 'D3_Edge_Input': D3_Initial_State_Edge,
'D3_Neighbor_Input': D3_Initial_State_Neighbor,
'D4_Node_Input': D4_Initial_State_Node, 'D4_Edge_Input': D4_Initial_State_Edge,
'D4_Neighbor_Input': D4_Initial_State_Neighbor,
'Adjacency_Matrix': A_Matrix
}
return Initial_States
def test_run(self, num_episodes, num_test_step, opt_flag):
# define run() to Test the trained RL-DNN
self.num_Episodes = num_episodes
self.num_Test_Step = num_test_step
# weight for the V2V sum rate
v2v_weight = self.v2v_weight
# weight for the V2I sum rate
v2i_weight = self.v2i_weight
# initialize variables to save the results
Expect_Return = np.zeros(self.num_Episodes)
Reward = np.zeros((self.num_Episodes, self.num_Test_Step))
Per_V2V_Rate = np.zeros((self.num_Episodes, self.num_Test_Step, self.num_D2D))
Per_V2I_Rate = np.zeros((self.num_Episodes, self.num_Test_Step, self.num_CH))