Commit
Created 3 training configs #4
drkostas committed Dec 12, 2022
1 parent 0bd15bd commit a4f18f0
Showing 2 changed files with 144 additions and 18 deletions.
87 changes: 72 additions & 15 deletions RL_Agent_Basic.py
@@ -8,6 +8,8 @@
from ray.rllib.models.utils import get_activation_fn, get_filter_config
from RLcraft import MalmoMazeEnv
import numpy as np
import os
import time
#from ray.rllib.agents.ppo import PPO

from ray.rllib.algorithms.ppo import PPO
@@ -41,24 +43,24 @@ def __init__(self, config: Dict):
3),
dtype=np.float32)
self.action_space = gym.spaces.Discrete(len(self.env.action_space))
print(self.action_space)

def reset(self):
x = self.env.reset()
return x

def step(self, action):
print(action)
x = self.env.step(action)

# TODO: Option to use the observations from the info (next 2 lines)
# observations = self.process_obs(x[0], x[3])
# reward = x[1]
info = {
# "obs":x[3].observations,
# "rewards":x[3].rewards,
# "frames":x[3].number_of_video_frames_since_last_state,
"rewards": x[3].rewards[0].getValue()
}

return x[0], x[1], x[2], info
return x[0], x[1], x[2], info  # TODO: Is this structure required by RLlib or can we change it?

@staticmethod
def process_obs(np_obs, info):
@@ -84,6 +86,30 @@ def process_obs(np_obs, info):
obs['hp'] = hp_data # Eg: 20.0 (max)
return obs

# TODO: We should use a variation of this to encode the block observations
# @staticmethod
# def gridProcess(state):
# msg = state.observations[-1].text
# observations = json.loads(msg)
# grid = observations.get(u'floor10x10', 0)
# Xpos = observations.get(u'XPos', 0)
# Zpos = observations.get(u'ZPos', 0)
# obs = np.array(grid)
# obs = np.reshape(obs, [16, 16, 1])
# obs[(int)(5 + Zpos)][ (int)(10 + Xpos)] = "human"

# # for i in range(obs.shape[0]):
# # for j in range(obs.shape[1]):
# # if obs[i,j] ==""
# obs[obs == "carpet"] = 0
# obs[obs == "sea_lantern"] = 1
# obs[obs == "human"] = 3
# obs[obs == "fire"] = 4
# obs[obs == "emerald_block"] = 5
# obs[obs == "beacon"] = 6
# obs[obs == "air"] = 7
# # print("Here is obs", obs)
# return obs
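
A minimal sketch (not part of this commit) of how the commented-out gridProcess idea above could be turned into the block-observation encoder mentioned in the TODO. The function name encode_grid and the BLOCK_IDS table are hypothetical; the 'floor10x10', 'XPos', and 'ZPos' keys and the block-to-integer mapping are taken from the comment itself:

import json
import numpy as np

# Block-name-to-integer mapping, copied from the commented-out gridProcess above.
BLOCK_IDS = {"carpet": 0, "sea_lantern": 1, "human": 3, "fire": 4,
             "emerald_block": 5, "beacon": 6, "air": 7}

def encode_grid(state, grid_key="floor10x10", size=16):
    """Encode the block grid from the latest Malmo observation as a float array."""
    msg = state.observations[-1].text
    observations = json.loads(msg)
    grid = observations.get(grid_key, [])
    x_pos = observations.get("XPos", 0)
    z_pos = observations.get("ZPos", 0)
    obs = np.array(grid, dtype=object).reshape(size, size, 1)
    # Mark the agent's own cell, mirroring the offsets used in the comment above.
    obs[int(5 + z_pos)][int(10 + x_pos)] = "human"
    # Map block names to integers; unlisted blocks get -1.
    encoded = np.vectorize(lambda block: BLOCK_IDS.get(block, -1))(obs)
    return encoded.astype(np.float32)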

def get_args():
parser = argparse.ArgumentParser()
@@ -102,11 +128,16 @@ def get_args():
def get_train_name(name, c):
""" Get the name of the training session. """
e = c['env_config']
actions = []
for a in e['action_space']:
a = str(a).replace(' ', '')
actions.append(a)
actions = '+'.join(actions)
hiddens = [str(h) for h in c['model']['fcnet_hiddens']]
hiddens = '+'.join(hiddens)
name = f"{name}_{e['width']}width_{e['millisec_per_tick']}ticks_"\
f"{e['mission_timeout_ms']}timeout_{e['step_reward']}step_"\
f"{e['win_reward']}win_{e['lose_reward']}lose_{len(e['action_space'])}actions_"\
f"{e['win_reward']}win_{e['lose_reward']}lose_{actions}actions_"\
f"{e['time_wait']}wait_{e['max_loop']}loop_{hiddens}hiddens"
return name
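
As a quick illustration (not part of the commit), here is roughly what the updated name format produces for the "Agent 2" config defined in configs/mazes.yml below; example_config is just a trimmed copy of that config:

example_config = {
    'env_config': {
        'width': 84, 'millisec_per_tick': 50, 'mission_timeout_ms': 300000,
        'step_reward': -1, 'win_reward': 85, 'lose_reward': -100,
        'action_space': ['move 1', 'move -1', 'strafe 1', 'strafe -1'],
        'time_wait': 0.05, 'max_loop': 50,
    },
    'model': {'fcnet_hiddens': [64, 64]},
}
# get_train_name(name='Mazes', c=example_config) returns:
# 'Mazes_84width_50ticks_300000timeout_-1step_85win_-100lose_'
# 'move1+move-1+strafe1+strafe-1actions_0.05wait_50loop_64+64hiddens'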

@@ -119,16 +150,42 @@ def main():
c = Configuration(config_src=args.config_file)
# Load configs from config class
general_config = c.get_config('general')['config']
train_config = c.get_config('train')[0]['config']
env_config = train_config['env_config']
height, width = env_config['height'], env_config['width']
train_config['model']['conv_filters'] = (height, width, 3)
train_name = get_train_name(name=general_config['name'], c=train_config)
print("Training session name: ", train_name)

algo = PPO(env=CustomEnv, config=train_config)
for _ in range(5):
(algo.train())
train_configs = c.get_config('train')

for train_config in train_configs:
print()
print("# ------ New Training ------ #")
train_config = train_config['config']
env_config = train_config['env_config']

# Set the name of the training agent
height, width = env_config['height'], env_config['width']
train_config['model']['conv_filters'] = (height, width, 3)
train_name = get_train_name(name=general_config['name'], c=train_config)
print("Training session name: ", train_name)

# Create checkpoint directory
save_freq = general_config['save_freq']
checkpoint_path = os.path.join(general_config['checkpoint_path'], train_name)
os.makedirs(checkpoint_path, exist_ok=True)

# Create the environment
algo = PPO(env=CustomEnv, config=train_config)

# Train the agent
train_epochs = int(general_config['train_epochs'])
start_time = time.time()
last_eval = 0
for epoch in range(train_epochs):
info = algo.train() # TODO: Is the info the output of the step?

if epoch % save_freq == 0:
print(f"Ran {(time.time()-start_time)/60:0.1f} minutes")
last_eval = time.time()
algo.save_checkpoint(checkpoint_path)
print(f"Checkpoint saved.")
print(f"{(time.time()-start_time)/60:0.1f} minutes elapsed.")
# TODO: Also print the average, min, max reward, (and loss??)
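
A hedged sketch of one way to address the reward TODO above: in RLlib versions from around this time, Algorithm.train() returns a result dict that typically carries episode reward statistics under the keys below (exact keys vary between Ray versions, so treat them as an assumption to verify):

# Inside the `if epoch % save_freq == 0:` block, after the checkpoint is saved:
mean_r = info.get("episode_reward_mean")  # average episode reward this iteration
min_r = info.get("episode_reward_min")    # worst episode reward this iteration
max_r = info.get("episode_reward_max")    # best episode reward this iteration
print(f"Reward mean/min/max: {mean_r}/{min_r}/{max_r}")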


if __name__ == '__main__':
75 changes: 72 additions & 3 deletions configs/mazes.yml
@@ -4,14 +4,17 @@ general:
name: Mazes
log_path: ./logs
checkpoint_path: ./checkpoints
train:
save_freq: 5
train_epochs: 30
train: # Every config is a different training agent
# --- Agent 1 --- #
- config:
env_config:
xml: missions/mazes/maze*.xml
width: 84
height: 84
millisec_per_tick: 50
maze_seed: 1 # will be replaced by the agent
maze_seed: 1 # will be replaced by the code
mission_timeout_ms: 300000 # This is a good timeout for the maze
step_reward: -1
win_reward: 85
@@ -32,7 +35,73 @@ train:
fcnet_hiddens:
- 64
- 64
conv_filters: null # will be replaced by the agent
conv_filters: null # will be replaced by the code
fcnet_activation: relu
grayscale: True
evaluation_num_workers: 1
disable_env_checking: True
evaluation_config:
render_env: False
# --- Agent 2 --- # [No Turning left or right]
- config:
env_config:
xml: missions/mazes/maze*.xml
width: 84
height: 84
millisec_per_tick: 50
maze_seed: 1 # will be replaced by the code
mission_timeout_ms: 300000 # This is a good timeout for the maze
step_reward: -1
win_reward: 85
lose_reward: -100
action_space: # The order of the actions is retained in the action space
- move 1 # move forward
- move -1 # move backward
- strafe 1 # move right
- strafe -1 # move left
client_port: 10000 # malmo port
time_wait: 0.05 # time to wait for retrieving world state (when MsPerTick=20)
max_loop: 50 # wait till TIME_WAIT * MAX_LOOP seconds for each action
num_workers: 1
framework: tf
model:
fcnet_hiddens:
- 64
- 64
conv_filters: null # will be replaced by the code
fcnet_activation: relu
grayscale: True
evaluation_num_workers: 1
disable_env_checking: True
evaluation_config:
render_env: False
# --- Agent 3 --- # [No sideways movement (should turn instead)]
- config:
env_config:
xml: missions/mazes/maze*.xml
width: 84
height: 84
millisec_per_tick: 50
maze_seed: 1 # will be replaced by the code
mission_timeout_ms: 300000 # This is a good timeout for the maze
step_reward: -1
win_reward: 85
lose_reward: -100
action_space: # The order of the actions is retained in the action space
- move 1 # move forward
- move -1 # move backward
- turn 1 # turn right
- turn -1 # turn left
client_port: 10000 # malmo port
time_wait: 0.05 # time to wait for retrieving world state (when MsPerTick=20)
max_loop: 50 # wait till TIME_WAIT * MAX_LOOP seconds for each action
num_workers: 1
framework: tf
model:
fcnet_hiddens:
- 64
- 64
conv_filters: null # will be replaced by the code
fcnet_activation: relu
grayscale: True
evaluation_num_workers: 1
