# main_DDPGfD.py
import numpy as np
import torch
import gym
import argparse
import os, sys
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import utils
#import TD3
#import OurDDPG
#import DDPG
import DDPGfD
import pdb
from tensorboardX import SummaryWriter
from ounoise import OUNoise
import pickle
import datetime
import csv
import timer
from expert_data import GenerateExpertPID_JointVel, GenerateTestPID_JointVel, check_grasp
from timer import Timer
from pathlib import Path
import pathlib
import copy # For copying over coordinates
import glob # Used for getting saved policy filename
# Import plotting code from other directory
plot_path = os.getcwd() + "/plotting_code"
sys.path.insert(1, plot_path)
from heatmap_plot import generate_heatmaps
from boxplot_plot import generate_reward_boxplots
from heatmap_coords import add_heatmap_coords, filter_heatmap_coords
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device('cpu')
def compare_test():
""" Compare policy performance """
# eval_env = gym.make(env_name)
# eval_env.seed(seed + 100)
#
# print("***Eval In Compare")
# # Generate randomized list of objects to select from
# eval_env.Generate_Latin_Square(eval_episodes,"eval_objects.csv",shape_keys=requested_shapes)
# ep_count = 0
# avg_reward = 0.0
# # step = 0
# for i in range(40):
# if i<23:
# x=(i)*0.005-0.055
# y=0.0
# elif i>=23:
# x=(i-23)*0.005-0.04
# y=0.0
# print('started pos', i)
# cumulative_reward = 0
# #eval_env = gym.make(env_name)
# state, done = eval_env.reset(shape_keys=requested_shapes, with_grasp=args.with_grasp_reward,start_pos=[x,y],env_name="eval_env", hand_orientation=requested_orientation,mode=mode), False
# # Set whether or not to use grasp reward
# eval_env.set_with_grasp_reward(args.with_grasp_reward)
# success=0
# # Sets number of timesteps per episode (counted from each step() call)
# #eval_env._max_episode_steps = 200
# eval_env._max_episode_steps = max_num_timesteps
# while not done:
# action = GenerateTestPID_JointVel(np.array(state),eval_env)
# env.set_with_grasp_reward(args.with_grasp_reward)
# state, reward, done, _ = eval_env.step(action)
# avg_reward += reward
# cumulative_reward += reward
# if reward > 25:
# success=1
# num_success[1]+=success
# print('PID net reward:',cumulative_reward)
# state, done = eval_env.reset(shape_keys=requested_shapes, with_grasp=args.with_grasp_reward,start_pos=[x,y],env_name="eval_env", hand_orientation=requested_orientation,mode=mode), False
# # Set whether or not to use grasp reward
# eval_env.set_with_grasp_reward(args.with_grasp_reward)
# success=0
# cumulative_reward = 0
# # Sets number of timesteps per episode (counted from each step() call)
# #eval_env._max_episode_steps = 200
# eval_env._max_episode_steps = max_num_timesteps
# # Keep track of object coordinates
# obj_coords = eval_env.get_obj_coords()
# ep_count += 1
# print("***Eval episode: ",ep_count)
#
# timestep_count = 0
# while not done:
# timestep_count += 1
# action = policy.select_action(np.array(state[0:82]))
# env.set_with_grasp_reward(args.with_grasp_reward)
# state, reward, done, _ = eval_env.step(action)
# avg_reward += reward
# cumulative_reward += reward
# if reward > 25:
# success=1
# # eval_env.render()
# # print(reward)
# # pdb.set_trace()
# print('Policy net reward:',cumulative_reward)
# num_success[0]+=success
# print("Eval timestep count: ",timestep_count)
#
# # Save initial object coordinate as success/failure
# x_val = (obj_coords[0])
# y_val = (obj_coords[1])
# x_val = np.asarray(x_val).reshape(1)
# y_val = np.asarray(y_val).reshape(1)
#
# if (success):
# seval_obj_posx = np.append(seval_obj_posx, x_val)
# seval_obj_posy = np.append(seval_obj_posy, y_val)
# else:
# feval_obj_posx = np.append(feval_obj_posx, x_val)
# feval_obj_posy = np.append(feval_obj_posy, y_val)
#
# total_evalx = np.append(total_evalx, x_val)
# total_evaly = np.append(total_evaly, y_val)
#
# avg_reward /= eval_episodes
#
# print("---------------------------------------")
# # print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
# print("Evaluation over {} episodes: {}".format(eval_episodes, avg_reward))
# print("---------------------------------------")
# Runs policy for X episodes and returns average reward
def eval_policy(policy, env_name, seed, requested_shapes, requested_orientation, mode, eval_episodes=100, compare=False):
""" Evaluate policy in its given state over eval_episodes amount of grasp trials """
num_success=0
# Heatmap plot success/fail object coordinates
success_coords = {"x": [], "y": [], "orientation": []}
fail_coords = {"x": [], "y": [], "orientation": []}
# hand orientation types: NORMAL, Rotated (45 deg), Top (90 deg)
# match timesteps to expert and pre-training
max_num_timesteps = 30
# Compare policy performance
if compare:
compare_test()
# Make new environment for evaluation
eval_env = gym.make(env_name)
eval_env.seed(seed + 100)
# Generate randomized list of objects to select from
eval_env.Generate_Latin_Square(eval_episodes,"eval_objects.csv",shape_keys=requested_shapes)
avg_reward = 0.0
# Reward data over each evaluation episode for boxplot
all_ep_reward_values = {"total_reward": [], "finger_reward": [], "grasp_reward": [], "lift_reward": []}
for i in range(eval_episodes):
print("***Eval episode: ", i)
success=0
state, done = eval_env.reset(shape_keys=requested_shapes, with_grasp=args.with_grasp_reward,hand_orientation=requested_orientation,mode=args.mode,env_name="eval_env"), False
# Record initial coordinate file path once shapes are generated
all_saving_dirs["eval_init_coord_file_path"] = eval_env.get_coords_filename()
cumulative_reward = 0
# Sets number of timesteps per episode (counted from each step() call)
eval_env._max_episode_steps = max_num_timesteps
# Keep track of object coordinates
obj_coords = eval_env.get_obj_coords()
        # Convert world-frame object coordinates to the local frame via the homogeneous transform Tfw
obj_local = np.append(obj_coords,1)
obj_local = np.matmul(eval_env.Tfw,obj_local)
obj_local_pos = obj_local[0:3]
timestep_count = 0
prev_state_lift_check = None
curr_state_lift_check = state
check_for_lift = True
ready_for_lift = False
skip_num_ts = 6
curr_reward = 0
# Cumulative reward over single episode
ep_total_reward = 0
ep_finger_reward = 0
ep_grasp_reward = 0
ep_lift_reward = 0
# Beginning of episode time steps, done is max timesteps or lift reward achieved
while not done:
timestep_count += 1
if timestep_count < skip_num_ts:
wait_for_check = False
else:
wait_for_check = True
            # Lift-readiness check needs a previous state to compare against; skip it on the first step
if prev_state_lift_check is None:
f_dist_old = None
else:
f_dist_old = prev_state_lift_check[9:17]
f_dist_new = curr_state_lift_check[9:17]
if check_for_lift and wait_for_check:
[ready_for_lift, _] = check_grasp(f_dist_old, f_dist_new)
#####
# Not ready for lift, continue agent grasping following the policy
if not ready_for_lift:
                action = policy.select_action(np.array(state[0:82]))  # Sigmoid output keeps the action within (0, max_action)
eval_env.set_with_grasp_reward(args.with_grasp_reward)
next_state, reward, done, info = eval_env.step(action)
cumulative_reward += reward
avg_reward += reward
curr_reward = reward
# Cumulative reward
ep_total_reward += reward
ep_finger_reward += info["finger_reward"]
ep_grasp_reward += info["grasp_reward"]
ep_lift_reward += info["lift_reward"]
            else:  # Ready to lift: hand the episode over to the fixed-velocity lift controller
next_state, reward, done, info, cumulative_reward = eval_lift_hand(eval_env, cumulative_reward,
curr_reward)
#print("Ready for lift < 0.035, np.max(np.array(obs[41:46])): ", np.max(np.array(state[41:46])))
#print("Ready for lift < 0.015, np.max(np.array(obs[35:40])): ", np.max(np.array(state[35:40])))
if done:
avg_reward += reward
# Cumulative reward
ep_total_reward += reward
ep_finger_reward += info["finger_reward"]
ep_grasp_reward += info["grasp_reward"]
ep_lift_reward += info["lift_reward"]
check_for_lift = False
#####
if reward > 25:
success=1
state = next_state
prev_state_lift_check = curr_state_lift_check
curr_state_lift_check = state
# End of episode, record findings
all_ep_reward_values["total_reward"].append(ep_total_reward)
all_ep_reward_values["finger_reward"].append(ep_finger_reward)
all_ep_reward_values["grasp_reward"].append(ep_grasp_reward)
all_ep_reward_values["lift_reward"].append(ep_lift_reward)
num_success+=success
# Add heatmap coordinates
        orientation = eval_env.get_orientation()
ret = add_heatmap_coords(success_coords, fail_coords, orientation, obj_local_pos, success)
success_coords = copy.deepcopy(ret["success_coords"])
fail_coords = copy.deepcopy(ret["fail_coords"])
avg_reward /= eval_episodes
# Final average reward values over all episodes
avg_rewards = {}
avg_rewards["total_reward"] = np.average(all_ep_reward_values["total_reward"])
avg_rewards["finger_reward"] = np.average(all_ep_reward_values["finger_reward"])
avg_rewards["grasp_reward"] = np.average(all_ep_reward_values["grasp_reward"])
avg_rewards["lift_reward"] = np.average(all_ep_reward_values["lift_reward"])
print("---------------------------------------")
print("Evaluation over {} episodes: {}".format(eval_episodes, avg_reward))
print("---------------------------------------")
ret = {"avg_reward": avg_reward, "avg_rewards": avg_rewards, "all_ep_reward_values": all_ep_reward_values, "num_success": num_success, "success_coords": success_coords, "fail_coords": fail_coords}
return ret
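# A minimal sketch of consuming eval_policy's return dict (keys match the `ret`
# dict built above); argument values here are illustrative:
#   ret = eval_policy(policy, args.env_name, args.seed, requested_shapes,
#                     requested_orientation, mode=args.mode, eval_episodes=10)
#   print(ret["avg_reward"], ret["num_success"])
#   print(ret["avg_rewards"]["lift_reward"])  # per-component average reward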
def lift_hand(env_lift, tot_reward):
""" Lift hand with set action velocities
env_lift: Mujoco environment
tot_reward: Total cumulative reward
"""
# action only used to move hand, not recorded in replay buffer and is NOT used to update policy
action = np.array([wrist_lift_velocity, finger_lift_velocity, finger_lift_velocity,
finger_lift_velocity])
env_lift.with_grasp_reward = args.with_grasp_reward
next_state, reward, done, info = env_lift.step(action)
if done:
        # Replace the last recorded reward with the lift outcome rather than accumulating both
old_reward = replay_buffer.replace(reward, done)
tot_reward = tot_reward - old_reward + reward
return next_state, reward, done, info, tot_reward
def eval_lift_hand(env_lift, tot_reward, curr_reward):
""" Lift hand with set action velocities within evaluation environment
env_lift: Mujoco environment
tot_reward: Total cumulative reward
curr_reward: Current time step reward
"""
# action only used to move hand, not recorded in replay buffer and is NOT used to update policy
action = np.array([wrist_lift_velocity, finger_lift_velocity, finger_lift_velocity,
finger_lift_velocity])
env_lift.with_grasp_reward = args.with_grasp_reward
next_state, reward, done, info = env_lift.step(action)
if done:
tot_reward = tot_reward - curr_reward + reward
return next_state, reward, done, info, tot_reward
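# Note on the arithmetic above: when the lift step produces the reward that ends
# the episode, tot_reward - curr_reward + reward swaps the most recently counted
# step reward for the final lift reward instead of double counting it.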
def write_tensor_plot(writer,episode_num,avg_reward,avg_rewards,actor_loss,critic_loss,critic_L1loss,critic_LNloss):
""" Write important data about policy performance to tensorboard Summary Writer
writer: Tensorboard Summary Writer
episode_num: Current episode number
    avg_reward: Reward average over evaluation grasp trials
avg_rewards: Dictionary of average reward values for Finger, Grasp, and Lift rewards over evaluation grasp trials
    actor_loss, critic_loss, critic_L1loss, critic_LNloss: Most recent policy training loss values
"""
writer.add_scalar("Episode total reward, Avg. " + str(args.eval_freq) + " episodes", avg_reward, episode_num)
writer.add_scalar("Episode finger reward, Avg. " + str(args.eval_freq) + " episodes", avg_rewards["finger_reward"],
episode_num)
writer.add_scalar("Episode grasp reward, Avg. " + str(args.eval_freq) + " episodes",
avg_rewards["grasp_reward"], episode_num)
writer.add_scalar("Episode lift reward, Avg. " + str(args.eval_freq) + " episodes",
avg_rewards["lift_reward"], episode_num)
writer.add_scalar("Actor loss", actor_loss, episode_num)
writer.add_scalar("Critic loss", critic_loss, episode_num)
writer.add_scalar("Critic L1loss", critic_L1loss, episode_num)
writer.add_scalar("Critic LNloss", critic_LNloss, episode_num)
return writer
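# Example call (mirrors the one made from update_policy() below):
#   writer = write_tensor_plot(writer, episode_num, eval_ret["avg_reward"],
#                              eval_ret["avg_rewards"], actor_loss, critic_loss,
#                              critic_L1loss, critic_LNloss)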
def update_policy(evaluations, episode_num, num_episodes, prob,
type_of_training, saving_dir, max_num_timesteps=30):
""" Update policy network based on expert or agent step, evaluate every eval_freq episodes
evaluations: Output average reward list for plotting
episode_num: Current episode
num_episodes: Max number of episodes to update over
writer: Tensorboard writer (avg. reward, loss, etc.)
prob: Probability (proportion) of sampling from expert replay buffer
type_of_training: Based on training mode ("pre-train", "eval", "test", etc.)
max_num_timesteps: Maximum number of time steps within a RL episode
"""
    # Initialize OU noise generators (see the note below on which noise is actually applied)
noise = OUNoise(4)
noise.reset()
expl_noise = OUNoise(4, sigma=0.001)
expl_noise.reset()
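    # Note: only Gaussian noise (np.random.normal scaled by args.expl_noise) is
    # actually added to actions below; the OU noise objects are created and reset
    # each episode, but their samples are never applied to the action.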
# Heatmap initial object coordinates for evaluation plots
eval_success_coords = {"x": [], "y": [], "orientation": []}
eval_fail_coords = {"x": [], "y": [], "orientation": []}
# hand orientation types: NORMAL, Rotated (45 deg), Top (90 deg)
# Number of successful/failed initial object coordinates from evaluation over the total # of grasp trials
eval_num_success = 0
eval_num_fail = 0
eval_num_total = eval_num_success + eval_num_fail
# Stores reward boxplot data, Average reward per evaluation episodes
finger_reward = [[]]
grasp_reward = [[]]
lift_reward = [[]]
total_reward = [[]]
# Setup plotting output directories
if args.mode == "experiment":
heatmap_eval_dir = saving_dir + "/output/heatmap/eval" # Heatmap plot saving directory
boxplot_eval_dir = saving_dir + "/output/boxplot/eval" # Boxplot saving directory
create_paths([heatmap_eval_dir, boxplot_eval_dir]) # Create paths if they do not exist
else:
output_dir = "./output/" + saving_dir # Different output directory than experiments
heatmap_eval_dir = output_dir + "/heatmap/eval"
boxplot_eval_dir = output_dir + "/boxplot/eval"
create_paths([output_dir, heatmap_eval_dir, boxplot_eval_dir])
# Tensorboard writer
writer = SummaryWriter(logdir=all_saving_dirs["tensorboard_dir"])
for _ in range(num_episodes):
env = gym.make(args.env_name)
# Max number of time steps to match the expert replay grasp trials
env._max_episode_steps = max_num_timesteps
# Fill training object list using latin square
if env.check_obj_file_empty("objects.csv"):
env.Generate_Latin_Square(args.max_episode, "objects.csv", shape_keys=requested_shapes)
state, done = env.reset(shape_keys=requested_shapes, with_grasp=args.with_grasp_reward,env_name="env", hand_orientation=requested_orientation,
mode=args.mode), False
# Set whether or not to use grasp reward
env.set_with_grasp_reward(args.with_grasp_reward)
# Record initial coordinate file path once shapes are generated
all_saving_dirs["train_init_coord_file_path"] = env.get_coords_filename()
prev_state_lift_check = None
curr_state_lift_check = state
noise.reset()
expl_noise.reset()
episode_reward = 0
obj_coords = env.get_obj_coords()
        # Convert world-frame object coordinates to the local frame via the homogeneous transform Tfw
obj_local = np.append(obj_coords,1)
obj_local = np.matmul(env.Tfw,obj_local)
obj_local_pos = obj_local[0:3]
replay_buffer.add_episode(1)
        # Record the hand orientation index in the replay buffer
orientation_idx = env.get_orientation_idx()
replay_buffer.add_orientation_idx_to_replay(orientation_idx)
timestep = 0
replay_buffer_recorded_ts = 0
check_for_lift = True
ready_for_lift = False
skip_num_ts = 6
lift_success = False # Set based on lift reward output at end of episode
print(type_of_training, episode_num)
# Beginning of time steps within episode
while not done:
timestep = timestep + 1
if timestep < skip_num_ts:
wait_for_check = False
else:
wait_for_check = True
            # Lift-readiness check needs a previous state to compare against; skip it on the first step
if prev_state_lift_check is None:
f_dist_old = None
else:
f_dist_old = prev_state_lift_check[9:17]
f_dist_new = curr_state_lift_check[9:17]
if check_for_lift and wait_for_check:
[ready_for_lift, _] = check_grasp(f_dist_old, f_dist_new)
# Follow policy until ready for lifting, then switch to set controller
if not ready_for_lift:
action = (
policy.select_action(np.array(state))
+ np.random.normal(0, max_action * args.expl_noise, size=action_dim)
).clip(0, max_action)
                # Perform the action and observe next_state, reward, done, info
env.set_with_grasp_reward(args.with_grasp_reward)
next_state, reward, done, info = env.step(action)
replay_buffer.add(state[0:82], action, next_state[0:82], reward, float(done))
replay_buffer_recorded_ts += 1
episode_reward += reward
            else:  # Ready to lift: hand the episode over to the fixed-velocity lift controller
next_state, reward, done, info, episode_reward = lift_hand(env, episode_reward)
check_for_lift = False
# Determine successful grasp based on lift reward
if info["lift_reward"] > 0:
lift_success = True
state = next_state
prev_state_lift_check = curr_state_lift_check
curr_state_lift_check = state
        replay_buffer.add_episode(0)  # Close out the episode entry (opened with add_episode(1) above)
# Remove any invalid episodes (episodes shorter than n-step length for policy training)
episode_len = replay_buffer_recorded_ts # Number of timesteps within the episode recorded by replay buffer
if episode_len - replay_buffer.n_steps <= 1:
            replay_buffer.remove_episode(-1)  # Episode is too short for n-step training, so remove it
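        # Worked example: with n_steps = 5, an episode must have at least 7 recorded
        # timesteps to be kept (7 - 5 = 2 > 1); episodes of 6 or fewer steps are removed.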
# Train agent after collecting sufficient data:
if episode_num > args.update_after: # Update policy after 100 episodes (have enough experience in agent replay buffer)
#if episode_num % args.update_freq: # Update every 4 episodes
for learning in range(args.update_num): # Number of times to update the policy
if args.batch_size == 0:
# Single episode training using full trajectory
actor_loss, critic_loss, critic_L1loss, critic_LNloss = policy.train(env._max_episode_steps,
expert_replay_buffer,
replay_buffer, prob)
else:
# Batch training using n-steps
actor_loss, critic_loss, critic_L1loss, critic_LNloss = policy.train_batch(env._max_episode_steps,
expert_replay_buffer,
replay_buffer, prob)
# Evaluation and recording data for tensorboard
if episode_num+1 == num_episodes or (episode_num >= args.update_after and (episode_num) % args.eval_freq == 0):
print("EVALUATING EPISODE AT: ",episode_num)
print("Evaluating with "+str(args.eval_num)+" grasping trials")
eval_ret = eval_policy(policy, args.env_name, args.seed, requested_shapes, requested_orientation,
mode=args.mode, eval_episodes=args.eval_num) # , compare=True)
# Heatmap data - object starting coordinates for evaluation
eval_success_coords = copy.deepcopy(eval_ret["success_coords"])
eval_fail_coords = copy.deepcopy(eval_ret["fail_coords"])
# Records the number of successful and failed coordinates from evaluation
eval_num_success = len(eval_success_coords["x"])
            eval_num_fail = len(eval_fail_coords["x"])
eval_num_total = eval_num_success + eval_num_fail
# Cumulative (over timesteps) reward data from each evaluation episode for boxplot
all_ep_reward_values = eval_ret["all_ep_reward_values"]
# Plot tensorboard metrics for learning analysis (average reward, loss, etc.)
writer = write_tensor_plot(writer,episode_num,eval_ret["avg_reward"],eval_ret["avg_rewards"],actor_loss,critic_loss,critic_L1loss,critic_LNloss)
# Insert boxplot code reference
finger_reward[-1].append(all_ep_reward_values["finger_reward"])
grasp_reward[-1].append(all_ep_reward_values["grasp_reward"])
lift_reward[-1].append(all_ep_reward_values["lift_reward"])
total_reward[-1].append(all_ep_reward_values["total_reward"])
# Save coordinates every 1000 episodes
if episode_num >= args.update_after and (episode_num) % args.save_freq == 0:
print("Saving heatmap data at: ", heatmap_eval_dir)
# Filter heatmap coords by success/fail, orientation type, and save to appropriate place
filter_heatmap_coords(eval_success_coords, eval_fail_coords, episode_num, heatmap_eval_dir)
# Reset eval coords for next batch
eval_success_coords = {"x": [], "y": [], "orientation": []}
eval_fail_coords = {"x": [], "y": [], "orientation": []}
print("Saving boxplot data at: ", boxplot_eval_dir)
np.save(boxplot_eval_dir + "/finger_reward_" + str(episode_num),finger_reward)
np.save(boxplot_eval_dir + "/grasp_reward_" + str(episode_num),grasp_reward)
np.save(boxplot_eval_dir + "/lift_reward_" + str(episode_num),lift_reward)
np.save(boxplot_eval_dir + "/total_reward_" + str(episode_num),total_reward)
finger_reward = [[]]
grasp_reward = [[]]
lift_reward = [[]]
total_reward = [[]]
episode_num += 1
return evaluations, episode_num, eval_num_success, eval_num_fail
def create_paths(dir_list):
""" Create directories if they do not exist already, given path """
for new_dir in dir_list:
if new_dir is not None:
new_path = Path(new_dir)
new_path.mkdir(parents=True, exist_ok=True)
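# Example: create_paths(["./output/heatmap/eval", None]) creates the nested
# directory (including parents) and silently skips the None entry.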
def setup_directories(env, saving_dir, replay_filename, expert_replay_file_path, agent_replay_file_path, pretrain_model_save_path, create_dirs=True):
""" Setup directories where information will be saved
env: Pass in current environment to have access to getting environment variables for recording purposes
saving_dir: main name for all related files (ex: train_DDPGfD_CubeS)
expert_replay_file_path: Expert replay buffer file path
agent_replay_file_path: Agent replay buffer file path
pretrain_model_save_path: Pre-train policy file path
"""
# Store all directory names where information is saved
all_saving_dirs = {}
# Experiment output
if args.mode == "experiment":
model_save_path = saving_dir + "/policy/exp_policy"
tensorboard_dir = saving_dir + "/output/tensorboard/"
output_dir = saving_dir + "/output"
heatmap_train_dir = saving_dir + "/output/heatmap/train"
results_saving_dir = saving_dir + "/output/results"
elif args.mode == "combined" or args.mode == "naive" or args.mode == "position-dependent":
output_dir = saving_dir + "/output"
heatmap_train_dir = output_dir + "/heatmap/expert"
model_save_path = "None"
results_saving_dir = "None"
tensorboard_dir = "None"
else:
print("---------- STARTING: ", args.mode, " ---------")
# Original saving directory locations for model and tensorboard
model_save_path = "./policies/" + saving_dir + "/{}_{}".format(args.mode, "DDPGfD_kinovaGrip") + datestr + "/"
tensorboard_dir = "./kinova_gripper_strategy/" + saving_dir + "/{}/".format(args.tensorboardindex)
output_dir = "./output/" + saving_dir
heatmap_train_dir = output_dir + "/heatmap" + "/" + args.mode
results_saving_dir = output_dir + "/results" + "/" + args.mode
# Create directory paths if they do not exist
if create_dirs is True:
create_paths([model_save_path, output_dir, tensorboard_dir, heatmap_train_dir, results_saving_dir])
all_saving_dirs["saving_dir"] = saving_dir
all_saving_dirs["model_save_path"] = model_save_path
all_saving_dirs["output_dir"] = output_dir
all_saving_dirs["tensorboard_dir"] = tensorboard_dir
all_saving_dirs["heatmap_train_dir"] = heatmap_train_dir
all_saving_dirs["results_saving_dir"] = results_saving_dir
all_saving_dirs["replay_buffer"] = replay_filename
all_saving_dirs["expert_replay_file_path"] = expert_replay_file_path
all_saving_dirs["agent_replay_file_path"] = agent_replay_file_path
all_saving_dirs["pretrain_model_save_path"] = pretrain_model_save_path
all_saving_dirs["train_init_coord_file_path"] = env.get_coords_filename()
all_saving_dirs["eval_init_coord_file_path"] = env.get_coords_filename()
all_saving_dirs["controller_init_coord_file_path"] = env.get_coords_filename()
return all_saving_dirs
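# A hedged usage sketch (argument values are illustrative, not project defaults):
#   all_saving_dirs = setup_directories(env, "train_DDPGfD_CubeS", "replay_buffer",
#                                       None, None, None)
#   policy.save(all_saving_dirs["model_save_path"])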
def train_policy(tot_episodes, tr_prob, all_saving_dirs):
""" Train the policy over a number of episodes, sampling from experience
tot_episodes: Total number of episodes to update policy over
tr_prob: Probability of sampling from expert replay buffer within training
"""
evals = []
curr_episode = 0 # Counts number of episodes done within training
    red_expert_prob = 0.1
# Begin training updates
evals, curr_episode, eval_num_success, eval_num_fail = \
update_policy(evals, curr_episode, tot_episodes, tr_prob,"TRAIN", all_saving_dirs["saving_dir"])
tr_prob = tr_prob - red_expert_prob
eval_num_total = eval_num_success + eval_num_fail
# Save policy
print("Saving policy...")
policy.save(all_saving_dirs["model_save_path"])
return all_saving_dirs["model_save_path"], eval_num_success, eval_num_total
def get_experiment_info(exp_num):
""" Get stage and name of current experiment and pre-trained experiment
exp_num: Experiment number
"""
# Experiment #: [pretrain_policy_exp #, stage_policy]
stage0 = "pretrain_policy" # Expert policy with small cube
stage1 = {"1": ["0", "sizes"], "2": ["0", "shapes"], "3": ["0", "orientations"]}
stage2 = {"4": ["1", "sizes_shapes_orientations"], "5": ["2", "shapes_sizes_orientations"], "6": ["3", "orientations_sizes_shapes"]}
#stage2 = {"4": ["1", "sizes_shapes"], "5": ["1", "sizes_orientations"], "6": ["2", "shapes_orientations"],
# "7": ["2", "shapes_sizes"], "8": ["3", "orientations_sizes"], "9": ["3", "orientations_shapes"]}
#stage3 = {"10": ["4", "sizes_shapes_orientations"], "11": ["5", "sizes_orientations_shapes"],
# "12": ["6", "shapes_orientations_sizes"], "13": ["7", "shapes_sizes_orientations"],
# "14": ["8", "orientations_sizes_shapes"], "15": ["9", "orientations_shapes_sizes"]}
print("stage1.keys(): ",stage1.keys())
print("exp_num: ",exp_num)
exp_num = str(exp_num)
if exp_num in stage1.keys():
prev_exp_stage = "0"
exp_stage = "1"
prev_exp_num = "0"
prev_exp_name = stage0
exp_name = stage1[exp_num][1]
elif exp_num in stage2.keys():
prev_exp_stage = "1"
exp_stage = "2"
prev_exp_num = stage2[exp_num][0]
prev_exp_name = stage1[prev_exp_num][1]
exp_name = stage2[exp_num][1]
    # Stage 3 experiments are currently disabled (the stage3 dict is commented out
    # above); referencing it here would raise a NameError.
    #elif exp_num in stage3.keys():
    #    prev_exp_stage = "2"
    #    exp_stage = "3"
    #    prev_exp_num = stage3[exp_num][0]
    #    prev_exp_name = stage2[prev_exp_num][1]
    #    exp_name = stage3[exp_num][1]
    elif exp_num == "16":  # exp_num was converted to a string above
prev_exp_stage = "0"
exp_stage = "kitchen_sink"
prev_exp_num = "0"
prev_exp_name = stage0
exp_name = "kitchen_sink"
else:
print("Invalid experiment option: ", exp_num)
raise ValueError
return prev_exp_stage, prev_exp_num, prev_exp_name, exp_stage, exp_name
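# Worked example (from the stage tables above): get_experiment_info(2) returns
# ("0", "0", "pretrain_policy", "1", "shapes") -- experiment 2 trains on shapes,
# starting from the stage-0 pre-trained policy.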
def get_experiment_file_structure(prev_exp_stage, prev_exp_name, exp_stage, exp_name):
""" Setup experiment file structure with directories for the policy and plot output
prev_exp_stage: Prev exp stage
prev_exp_name: Previous exp name
exp_stage: Current experiment stage
exp_name: Current experiment name
"""
rl_exp_base_dir = "rl_experiments"
grasp_dir = "/no_grasp"
if args.with_grasp_reward is True:
grasp_dir = "/with_grasp"
stage_dir = "/stage" + exp_stage
exp_dir = rl_exp_base_dir + grasp_dir + stage_dir + "/" + exp_name
policy_dir = exp_dir+"/policy"
replay_dir = exp_dir + "/replay_buffer"
output_dir = exp_dir+"/output"
create_paths([exp_dir, policy_dir, replay_dir, output_dir])
expert_replay_dir = "./expert_replay_data" + grasp_dir + "/combined"
if not os.path.isdir(expert_replay_dir):
print("Expert replay buffer experience directory not found!: ", expert_replay_dir)
prev_exp_dir = rl_exp_base_dir + grasp_dir + "/" + "stage" + prev_exp_stage + "/" + prev_exp_name
if not os.path.isdir(prev_exp_dir):
print("Previous experiment directory not found!: ", prev_exp_dir)
pretrain_replay_dir = prev_exp_dir + "/replay_buffer/"
if not os.path.isdir(pretrain_replay_dir):
print("Previous experiment Replay Buffer directory not found!: ", pretrain_replay_dir)
    pretrain_policy_dir = prev_exp_dir + "/policy/"
if not os.path.isdir(pretrain_policy_dir):
print("Previous experiment Policy directory not found!: ", pretrain_policy_dir)
return expert_replay_dir, prev_exp_dir, exp_dir
def get_exp_input(exp_name, shapes, sizes):
""" Return the correct shapes, sizes, and orientations based on requested experiment
exp_name: Experiment name (sizes, shapes, orientations)
shapes: All shape options
sizes: All shape sizes
"""
exp_types = exp_name.split('_')
exp_shapes = []
if exp_name == "kitchen_sink":
exp_types = ["shapes", "sizes", "orientations"]
# All shapes
if "shapes" in exp_types and "sizes" in exp_types:
for size in sizes:
exp_shapes += [shape + size for shape in shapes]
elif "shapes" in exp_types:
exp_shapes += [shape + "S" for shape in shapes]
elif "sizes" in exp_types:
exp_shapes += ["Cube" + size for size in sizes]
else:
exp_shapes += ["CubeS"]
# All orientations
if "orientations" in exp_types:
exp_orientation = "random"
else:
exp_orientation = "normal"
return exp_shapes, exp_orientation
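# Worked example: get_exp_input("sizes", ["Cube", "Cylinder"], ["S", "M", "B"])
# returns (["CubeS", "CubeM", "CubeB"], "normal"); any experiment name containing
# "orientations" switches the returned orientation to "random".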
def generate_output(text,data_dir,orientations_list,saving_dir,num_success, num_total, all_saving_dirs):
""" Generate heatmaps, boxplots, and output info file """
# Produce plots
# Train Heatmap
mode_str = "/heatmap/"+str(args.mode)+"/"
if os.path.isdir(data_dir+mode_str) is True:
print("Generating heatmaps...")
for orientation in orientations_list:
generate_heatmaps(plot_type="train", orientation=str(orientation), data_dir=data_dir+mode_str+orientation+"/",
saving_dir=saving_dir+mode_str+orientation+"/")
else:
print("Heatmap dir NOT found: ", data_dir+mode_str)
if os.path.isdir(data_dir + "/heatmap/eval") is True:
print("Generating evaluation heatmaps...")
# Evaluation Heatmaps
for orientation in orientations_list:
generate_heatmaps(plot_type="eval", orientation=str(orientation), data_dir=data_dir+"heatmap/eval/"+orientation+"/",
saving_dir=saving_dir+"heatmap/eval/"+orientation+"/")
elif args.mode == "eval":
print("Eval Heatmap dir NOT found: ", data_dir + "/heatmap/eval")
if os.path.isdir(data_dir+"/boxplot/eval/") is True:
print("Generating boxplots...")
# Boxplot evaluation reward
for orientation in orientations_list:
generate_reward_boxplots(orientation=str(orientation), data_dir=data_dir + "/boxplot/eval/" + orientation + "/",
saving_dir=saving_dir + "/boxplot/eval/" + orientation + "/")
print("Writing to experiment info file...")
if all_saving_dirs is not None:
create_info_file(num_success, num_total, all_saving_dirs, text)
def rl_experiment(exp_num, exp_name, prev_exp_dir, requested_shapes, requested_orientation_list, all_saving_dirs):
""" Train policy according to RL experiment shape, size, orientation combo + stage """
# Fill object list using latin square method
env.Generate_Latin_Square(args.max_episode, "objects.csv", shape_keys=requested_shapes)
# Get policy name based on what is saved at file
pol_dir = prev_exp_dir + "/policy/"
policy_filename_path = glob.glob(pol_dir+"*_actor_optimizer")
policy_filename = os.path.basename(policy_filename_path[0])
policy_filename = policy_filename.replace('_actor_optimizer', '')
# Load Pre-Trained policy
policy.load(pol_dir+policy_filename)
# *** Train policy ****
train_model_save_path, eval_num_success, eval_num_total = train_policy(args.max_episode, args.expert_prob, all_saving_dirs)
print("Experiment ", exp_num, ", ", exp_name, " policy saved at: ", train_model_save_path)
# Save train agent replay data
replay_filename = exp_dir + "/replay_buffer"
agent_replay_save_path = replay_buffer.save_replay_buffer(replay_filename)
print("Agent replay buffer saved at: ", agent_replay_save_path)
# Produce plots and output info file
output_dir = exp_dir + "/output/"
saving_dir = output_dir
grasp_text = ""
if args.with_grasp_reward is True:
grasp_text = "WITH grasp"
else:
grasp_text = "NO grasp"
exp_text = grasp_text + " Experiment " + str(exp_num) + ": " + exp_name + ", Stage " + str(
exp_stage) + "\nDate: {}".format(datetime.datetime.now().strftime("%m_%d_%y_%H%M"))
previous_text = "Previous experiment: " + prev_exp_name
type_text = "Experiment shapes: " + str(requested_shapes) + "\nExperiment orientation: " + str(
requested_orientation)
success_text = "Final Policy Evaluation:\n# Success: " + str(eval_num_success) + "\n# Failures: " + str(
eval_num_total-eval_num_success) + "\n# Total: " + str(eval_num_total)
output_text = "Output directory: " + str(exp_dir)
text = exp_text + "\n" + previous_text + "\n" + type_text + "\n" + success_text + "\n" + output_text
# Create unique info file
f = open(all_saving_dirs["output_dir"] + "/"+str(args.mode)+"_info.txt", "w")
f.write(text)
f.close()
    # Generate output plots and the experiment info file
generate_output(text=text, data_dir=output_dir, orientations_list=requested_orientation_list, saving_dir=saving_dir, num_success=eval_num_success, num_total=eval_num_total, all_saving_dirs=all_saving_dirs)
print("--------------------------------------------------")
print("Finished Experiment!")
print(output_text)
print(previous_text)
print(type_text)
print(success_text)
print(output_text)
def create_info_file(num_success,num_total,all_saving_dirs,extra_text=""):
""" Create text file containing information about the current training run """
# INFO FILE
# HEADER
# Name: Mode and Policy name
# Date: Month, Day, Year, Time
# Saving directory: Main saving directory name
header_text = "Name: {}\nDate: {}\nSaving Dir: {}".format(all_saving_dirs["saving_dir"], datestr, all_saving_dirs["saving_dir"])
# INPUT:
# Policy Initialization: None (Random init) or pre-train filepath
# Expert Replay Buffer: None or filepath
# Agent Replay Buffer: None or filepath
# Object/hand pose initial coordinate text file path
input_text = "\n\nINPUT:\nPolicy dir: {}\nExpert Replay Buffer: {}\nAgent Replay Buffer: {}\nInitial Object/Hand Pose Coord. File (Controller): {}\nInitial Object/Hand Pose Coord. File (Train): {}\nInitial Object/Hand Pose Coord. File (Evaluation): {}".format(all_saving_dirs["pretrain_model_save_path"], all_saving_dirs["expert_replay_file_path"], all_saving_dirs["agent_replay_file_path"], all_saving_dirs["controller_init_coord_file_path"], all_saving_dirs["train_init_coord_file_path"], all_saving_dirs["eval_init_coord_file_path"])
# OUTPUT:
# Policy: None or model_save_path
# Agent Replay Buffer: None or filepath
# Output (plotting, results): None or output_dir
# Output/ Tensorboard: None or tensorboard_dir
# Output/ Heatmap: None or heatmap_train_dir, heatmap_eval_dir
# Output/ Results: None or results_saving_dir
policy_output_text = "\n\nOUTPUT:\nPolicy: {}\nAgent Replay Buffer: {}".format(all_saving_dirs["model_save_path"],all_saving_dirs["replay_buffer"])
plotting_output_text = "\nOutput dir: {}\nOutput/ Tensorboard: {}\nOutput/ Heatmap: {}\nOutput/ Results: {}".format(all_saving_dirs["output_dir"],all_saving_dirs["tensorboard_dir"],all_saving_dirs["heatmap_train_dir"],all_saving_dirs["results_saving_dir"])
success_text = "\n---- SUCCESS INFO: ----\n# Success: {}\n# Failures: {}\n# Total: {}".format(str(num_success),str(num_total - num_success),str(num_total)) + "\n"
# ADDITIONAL TEXT/INFO (text or "") --> TRAINING PARAMETERS are included within extra_text
print("Writing to info file...")
f = open(all_saving_dirs["output_dir"] + "/"+str(args.mode)+"_info.txt", "w")
text = header_text + input_text + policy_output_text + plotting_output_text + success_text + extra_text
f.write(text)
f.close()
print("\n---------------- DONE RUNNING: "+args.mode+" ---------------------")
print(text)
print("------------------------------------")
def setup_args(args=None):
""" Set important variables based on command line arguments OR passed on argument values
returns: Full set of arguments to be parsed
"""
parser = argparse.ArgumentParser()
parser.add_argument("--policy_name", default="DDPGfD") # Policy name
parser.add_argument("--env_name", default="gym_kinova_gripper:kinovagripper-v0") # OpenAI gym environment name
parser.add_argument("--seed", default=2, type=int) # Sets Gym, PyTorch and Numpy seeds
parser.add_argument("--start_timesteps", default=100, type=int) # How many time steps purely random policy is run for
parser.add_argument("--eval_freq", default=200, type=float) # How often (time steps) we evaluate
parser.add_argument("--eval_num", default=100, type=int) # Number of grasp trials to evaluate over
parser.add_argument("--max_timesteps", default=1e6, type=int) # Max time steps to run environment for
parser.add_argument("--max_episode", default=20000, type=int) # Max time steps to run environment for
parser.add_argument("--save_models", action="store_true") # Whether or not models are saved
parser.add_argument("--expl_noise", default=0.1, type=float) # Std of Gaussian exploration noise
parser.add_argument("--batch_size", default=0, type=int) # Batch size for both actor and critic - Change to be 64 for batch train, 0 for single ep sample
parser.add_argument("--discount", default=0.995, type=float) # Discount factor
parser.add_argument("--tau", default=0.0005, type=float) # Target network update rate
parser.add_argument("--policy_noise", default=0.01, type=float) # Noise added to target policy during critic update
parser.add_argument("--noise_clip", default=0.05, type=float) # Range to clip target policy noise
parser.add_argument("--policy_freq", default=2, type=int) # Frequency of delayed policy updates
parser.add_argument("--tensorboardindex", type=str, default=None) # Tensorboard log name, found in kinova_gripper_strategy/
parser.add_argument("--expert_replay_size", default=20000, type=int) # Number of episode for loading expert trajectories
parser.add_argument("--saving_dir", type=str, default=None) # Directory name to save policy within policies/
parser.add_argument("--shapes", default='CubeS', action='store', type=str) # Requested shapes to use (in format of object keys)
parser.add_argument("--hand_orientation", action='store', type=str) # Requested shapes to use (in format of object keys)
parser.add_argument("--mode", action='store', type=str, default="train") # Mode to run experiments with: (naive, position-dependent, expert, pre-train, train, rand_train, test, experiment)
parser.add_argument("--agent_replay_size", default=10000, type=int) # Maximum size of agent's replay buffer
parser.add_argument("--expert_prob", default=0.3, type=float) # Probability of sampling from expert replay buffer (opposed to agent replay buffer)
parser.add_argument("--with_grasp_reward", type=str, action='store', default="False") # bool, set True to use Grasp Reward from grasp classifier, otherwise grasp reward is 0
parser.add_argument("--save_freq", default=1000, type=int) # Frequency to save data at (Ex: every 1000 episodes, save current success/fail coords numpy array to file)
parser.add_argument("--update_after", default=100, type=int) # Start to update the policy after # episodes have occured
parser.add_argument("--update_freq", default=1, type=int) # Update the policy every # of episodes
parser.add_argument("--update_num", default=100, type=int) # Number of times to update policy per update step
parser.add_argument("--exp_num", default=None, type=int) # RL Paper: experiment number
parser.add_argument("--render_imgs", type=str, action='store', default="False") # Set to True to render video images of simulation (caution: will render each episode by default)
    args = parser.parse_args(args)  # Falls back to sys.argv when args is None
return args
if __name__ == "__main__":
# Set up environment based on command-line arguments or passed in arguments
args = setup_args()
""" Setup the environment, state, and action space """
file_name = "%s_%s_%s" % (args.policy_name, args.env_name, str(args.seed))
print("---------------------------------------")
print("Settings: "+file_name)
print("---------------------------------------")
# Date string to stay consistent over file naming
datestr = "_{}".format(datetime.datetime.now().strftime("%m_%d_%y_%H%M"))
# Make initial environment
env = gym.make(args.env_name)
# Set seeds for randomization
env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)
# Set dimensions for state and action spaces - policy initialization
state_dim = 82 # State dimension dependent on the length of the state space
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
max_action_trained = env.action_space.high # a vector of max actions
n = 5 # n step look ahead for the policy
wrist_lift_velocity = 0.6
min_velocity = 0.5 # Minimum velocity value for fingers or wrist
finger_lift_velocity = min_velocity
''' Set values from command line arguments '''
requested_shapes = args.shapes # Sets list of desired objects for experiment
requested_shapes = requested_shapes.split(',')
requested_orientation = args.hand_orientation # Set the desired hand orientation (normal or random)
    expert_replay_size = args.expert_replay_size  # Number of expert episodes for the expert replay buffer
agent_replay_size = args.agent_replay_size # Maximum number of episodes to be stored in agent replay buffer
max_num_timesteps = 30 # Maximum number of time steps within an episode
# If experiment number is selected, set mode to experiment (in case the mode has been set to train by default)
if args.exp_num is not None:
args.mode = "experiment"
# Set requested_orientation_list for directory creation, plotting and reference
if requested_orientation == "random":
requested_orientation_list = ["normal", "rotated", "top"]
else:
requested_orientation_list = ["normal"]
# Fill pre-training object list using latin square method
env.Generate_Latin_Square(args.max_episode,"objects.csv", shape_keys=requested_shapes)
kwargs = {
"state_dim": state_dim,
"action_dim": action_dim,
"max_action": max_action,
"n": n,
"discount": args.discount,
"tau": args.tau,
"batch_size": args.batch_size
}
''' Initialize policy '''
if args.policy_name == "DDPGfD":
policy = DDPGfD.DDPGfD(**kwargs)
else:
print("No such policy")
raise ValueError
# Set grasp reward based on command line input
if args.with_grasp_reward == "True" or args.with_grasp_reward == "true":
args.with_grasp_reward = True
elif args.with_grasp_reward == "False" or args.with_grasp_reward == "false":
args.with_grasp_reward = False
else:
print("with_grasp_reward must be True or False")
raise ValueError
if args.render_imgs == "True" or args.render_imgs == "true":
args.render_imgs = True
else:
args.render_imgs = False
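    # Example invocation (all flags are defined in setup_args above; values illustrative):
    #   python main_DDPGfD.py --mode train --shapes CubeS,CubeM --hand_orientation normal \
    #       --max_episode 20000 --expert_prob 0.3 --with_grasp_reward False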