Commit
Created 3 training configs #4
drkostas committed Dec 12, 2022
1 parent 0bd15bd commit a4f18f0
Showing 2 changed files with 144 additions and 18 deletions.
87 changes: 72 additions & 15 deletions RL_Agent_Basic.py
@@ -8,6 +8,8 @@
from ray.rllib.models.utils import get_activation_fn, get_filter_config
from RLcraft import MalmoMazeEnv
import numpy as np
import os
import time
#from ray.rllib.agents.ppo import PPO

from ray.rllib.algorithms.ppo import PPO
@@ -41,24 +43,24 @@ def __init__(self, config: Dict):
3),
dtype=np.float32)
self.action_space = gym.spaces.Discrete(len(self.env.action_space))
print(self.action_space)

def reset(self):
x = self.env.reset()
return x

def step(self, action):
print(action)
x = self.env.step(action)

# TODO: Option to use the observations from the info (next 2 lines)
# observations = self.process_obs(x[0], x[3])
# reward = x[1]
info = {
# "obs":x[3].observations,
# "rewards":x[3].rewards,
# "frames":x[3].number_of_video_frames_since_last_state,
"rewards": x[3].rewards[0].getValue()
}

return x[0], x[1], x[2], info
return x[0], x[1], x[2], info  # TODO: Is this structure required by RLlib or can we change it?

@staticmethod
def process_obs(np_obs, info):
@@ -84,6 +86,30 @@ def process_obs(np_obs, info):
obs['hp'] = hp_data # Eg: 20.0 (max)
return obs

# TODO: We should use a variation of this to encode the block observations
# @staticmethod
# def gridProcess(state):
# msg = state.observations[-1].text
# observations = json.loads(msg)
# grid = observations.get(u'floor10x10', 0)
# Xpos = observations.get(u'XPos', 0)
# Zpos = observations.get(u'ZPos', 0)
# obs = np.array(grid)
# obs = np.reshape(obs, [16, 16, 1])
# obs[(int)(5 + Zpos)][ (int)(10 + Xpos)] = "human"

# # for i in range(obs.shape[0]):
# # for j in range(obs.shape[1]):
# # if obs[i,j] ==""
# obs[obs == "carpet"] = 0
# obs[obs == "sea_lantern"] = 1
# obs[obs == "human"] = 3
# obs[obs == "fire"] = 4
# obs[obs == "emerald_block"] = 5
# obs[obs == "beacon"] = 6
# obs[obs == "air"] = 7
# # print("Here is obs", obs)
# return obs
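
A minimal sketch (not part of this commit) of how the commented-out gridProcess idea above could be turned into the block-observation encoder mentioned in the TODO. The function name encode_grid and the BLOCK_IDS table are hypothetical; the 'floor10x10', 'XPos', and 'ZPos' keys and the block-to-integer mapping are taken from the comment itself:

import json
import numpy as np

# Block-name-to-integer mapping, copied from the commented-out gridProcess above.
BLOCK_IDS = {"carpet": 0, "sea_lantern": 1, "human": 3, "fire": 4,
             "emerald_block": 5, "beacon": 6, "air": 7}

def encode_grid(state, grid_key="floor10x10", size=16):
    """Encode the block grid from the latest Malmo observation as a float array."""
    msg = state.observations[-1].text
    observations = json.loads(msg)
    grid = observations.get(grid_key, [])
    x_pos = observations.get("XPos", 0)
    z_pos = observations.get("ZPos", 0)
    obs = np.array(grid, dtype=object).reshape(size, size, 1)
    # Mark the agent's own cell, mirroring the offsets used in the comment above.
    obs[int(5 + z_pos)][int(10 + x_pos)] = "human"
    # Map block names to integers; unlisted blocks get -1.
    encoded = np.vectorize(lambda block: BLOCK_IDS.get(block, -1))(obs)
    return encoded.astype(np.float32)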

def get_args():
parser = argparse.ArgumentParser()
@@ -102,11 +128,16 @@ def get_args():
def get_train_name(name, c):
""" Get the name of the training session. """
e = c['env_config']
actions = []
for a in e['action_space']:
a = str(a).replace(' ', '')
actions.append(a)
actions = '+'.join(actions)
hiddens = [str(h) for h in c['model']['fcnet_hiddens']]
hiddens = '+'.join(hiddens)
name = f"{name}_{e['width']}width_{e['millisec_per_tick']}ticks_"\
f"{e['mission_timeout_ms']}timeout_{e['step_reward']}step_"\
f"{e['win_reward']}win_{e['lose_reward']}lose_{len(e['action_space'])}actions_"\
f"{e['win_reward']}win_{e['lose_reward']}lose_{actions}actions_"\
f"{e['time_wait']}wait_{e['max_loop']}loop_{hiddens}hiddens"
return name
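
As a quick illustration (not part of the commit), here is roughly what the updated name format produces for the "Agent 2" config defined in configs/mazes.yml below; example_config is just a trimmed copy of that config:

example_config = {
    'env_config': {
        'width': 84, 'millisec_per_tick': 50, 'mission_timeout_ms': 300000,
        'step_reward': -1, 'win_reward': 85, 'lose_reward': -100,
        'action_space': ['move 1', 'move -1', 'strafe 1', 'strafe -1'],
        'time_wait': 0.05, 'max_loop': 50,
    },
    'model': {'fcnet_hiddens': [64, 64]},
}
# get_train_name(name='Mazes', c=example_config) returns:
# 'Mazes_84width_50ticks_300000timeout_-1step_85win_-100lose_'
# 'move1+move-1+strafe1+strafe-1actions_0.05wait_50loop_64+64hiddens'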

@@ -119,16 +150,42 @@ def main():
c = Configuration(config_src=args.config_file)
# Load configs from config class
general_config = c.get_config('general')['config']
train_config = c.get_config('train')[0]['config']
env_config = train_config['env_config']
height, width = env_config['height'], env_config['width']
train_config['model']['conv_filters'] = (height, width, 3)
train_name = get_train_name(name=general_config['name'], c=train_config)
print("Training session name: ", train_name)

algo = PPO(env=CustomEnv, config=train_config)
for _ in range(5):
(algo.train())
train_configs = c.get_config('train')

for train_config in train_configs:
print()
print("# ------ New Training ------ #")
train_config = train_config['config']
env_config = train_config['env_config']

# Set the name of the training agent
height, width = env_config['height'], env_config['width']
train_config['model']['conv_filters'] = (height, width, 3)
train_name = get_train_name(name=general_config['name'], c=train_config)
print("Training session name: ", train_name)

# Create checkpoint directory
save_freq = general_config['save_freq']
checkpoint_path = os.path.join(general_config['checkpoint_path'], train_name)
os.makedirs(checkpoint_path, exist_ok=True)

# Create the environment
algo = PPO(env=CustomEnv, config=train_config)

# Train the agent
train_epochs = int(general_config['train_epochs'])
start_time = time.time()
last_eval = 0
for epoch in range(train_epochs):
info = algo.train() # TODO: Is the info the output of the step?

if epoch % save_freq == 0:
print(f"Ran {(time.time()-start_time)/60:0.1f} minutes")
last_eval = time.time()
algo.save_checkpoint(checkpoint_path)
print(f"Checkpoint saved.")
print(f"{(time.time()-start_time)/60:0.1f} minutes elapsed.")
# TODO: Also print the average, min, max reward, (and loss??)
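
A hedged sketch of one way to address the reward TODO above: in RLlib versions from around this time, Algorithm.train() returns a result dict that typically carries episode reward statistics under the keys below (exact keys vary between Ray versions, so treat them as an assumption to verify):

# Inside the `if epoch % save_freq == 0:` block, after the checkpoint is saved:
mean_r = info.get("episode_reward_mean")  # average episode reward this iteration
min_r = info.get("episode_reward_min")    # worst episode reward this iteration
max_r = info.get("episode_reward_max")    # best episode reward this iteration
print(f"Reward mean/min/max: {mean_r}/{min_r}/{max_r}")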


if __name__ == '__main__':
75 changes: 72 additions & 3 deletions configs/mazes.yml
@@ -4,14 +4,17 @@ general:
name: Mazes
log_path: ./logs
checkpoint_path: ./checkpoints
train:
save_freq: 5
train_epochs: 30
train: # Every config is a different training agent
# --- Agent 1 --- #
- config:
env_config:
xml: missions/mazes/maze*.xml
width: 84
height: 84
millisec_per_tick: 50
maze_seed: 1 # will be replaced by the agent
maze_seed: 1 # will be replaced by the code
mission_timeout_ms: 300000 # This is a good timeout for the maze
step_reward: -1
win_reward: 85
@@ -32,7 +35,73 @@ train:
fcnet_hiddens:
- 64
- 64
conv_filters: null # will be replaced by the agent
conv_filters: null # will be replaced by the code
fcnet_activation: relu
grayscale: True
evaluation_num_workers: 1
disable_env_checking: True
evaluation_config:
render_env: False
# --- Agent 2 --- # [No Turning left or right]
- config:
env_config:
xml: missions/mazes/maze*.xml
width: 84
height: 84
millisec_per_tick: 50
maze_seed: 1 # will be replaced by the code
mission_timeout_ms: 300000 # This is a good timeout for the maze
step_reward: -1
win_reward: 85
lose_reward: -100
action_space: # The order of the actions is retained in the action space
- move 1 # move forward
- move -1 # move backward
- strafe 1 # move right
- strafe -1 # move left
client_port: 10000 # malmo port
time_wait: 0.05 # time to wait for retrieving world state (when MsPerTick=20)
max_loop: 50 # wait till TIME_WAIT * MAX_LOOP seconds for each action
num_workers: 1
framework: tf
model:
fcnet_hiddens:
- 64
- 64
conv_filters: null # will be replaced by the code
fcnet_activation: relu
grayscale: True
evaluation_num_workers: 1
disable_env_checking: True
evaluation_config:
render_env: False
# --- Agent 3 --- # [No sideways movement (should turn instead)]
- config:
env_config:
xml: missions/mazes/maze*.xml
width: 84
height: 84
millisec_per_tick: 50
maze_seed: 1 # will be replaced by the code
mission_timeout_ms: 300000 # This is a good timeout for the maze
step_reward: -1
win_reward: 85
lose_reward: -100
action_space: # The order of the actions is retained in the action space
- move 1 # move forward
- move -1 # move backward
- turn 1 # turn right
- turn -1 # turn left
client_port: 10000 # malmo port
time_wait: 0.05 # time to wait for retrieving world state (when MsPerTick=20)
max_loop: 50 # wait till TIME_WAIT * MAX_LOOP seconds for each action
num_workers: 1
framework: tf
model:
fcnet_hiddens:
- 64
- 64
conv_filters: null # will be replaced by the code
fcnet_activation: relu
grayscale: True
evaluation_num_workers: 1
