#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 5 13:19:30 2023
@author: lukeshao
"""
import torch
# Code for registering the environment and running the pygame window of grid_world
import gym
from gym import spaces
import pygame
import numpy as np
import random
from gym.envs.registration import register
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tensor = torch.tensor([1.0, 2.0]).to(device)
print(torch.__version__)
print(torch.cuda.is_available())
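# The torch lines above only confirm that PyTorch is installed and whether a GPU
# is available; the Q-learning below is tabular and runs entirely on NumPy.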
# Maximum number of steps per episode, used by both register() and the training loop
max_steps = 150
register(
    id='gym_examples/GridWorld-v0',
    entry_point='gym_examples.envs:GridWorldEnv',
    max_episode_steps=max_steps,
)
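# Registering with max_episode_steps wraps the environment in gym's TimeLimit,
# so step() should report truncated = True once max_steps steps have elapsed.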
env = gym.make("gym_examples/GridWorld-v0", render_mode="human", size=10)
'''
x1 = int(input("Set first obstacle location: first integer value in ordered pair: "))
y1 = int(input("Set first obstacle location: second integer value in ordered pair: "))
x2 = int(input("Set second obstacle location: first integer value in ordered pair: "))
y2 = int(input("Set second obstacle location: second integer value in ordered pair: "))
env.set_obstacles(x1, y1, x2, y2)
'''
env.set_obstacles_hospital()
observation, info = env.reset()
state_size = env.get_state_size()
#print(state_size)
action_size = env.action_space.n
#print(action_size)
size = env.get_size()
#print(size)
states = env.get_states()
#print(states)
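# states is assumed to map each (agent position, picked-up flag) pair to an
# integer index; this is how observations are converted to Q-table rows below.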
alpha = 0.9 # learning rate
gamma = 0.99 # discount rate
epsilon = 1.0 # probability that our agent will explore
decay_rate = 0.001 # exponential decay rate applied to epsilon each episode
q = np.zeros([state_size, action_size])
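# The Q-table has one row per (agent position, picked-up flag) state and one
# column per action; each entry estimates the expected discounted return of
# taking that action in that state.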
# training variables
num_episodes = 1000
max_steps_training = max_steps # per episode
for episode in range(num_episodes):
    # reset the environment
    observation, info = env.reset()
    pairTuple = (tuple(observation["agent"]), env.get_is_picked_up())
    state = states[pairTuple]
    terminated = False
    truncated = False
    #print(f"Current State {state}")
    for s in range(max_steps_training):
        # exploration-exploitation tradeoff
        if random.uniform(0, 1) < epsilon:
            # explore
            action = env.action_space.sample()
        else:
            # exploit
            action = np.argmax(q[state, :])
        # epsilon decreases exponentially --> our agent will explore less and less
        epsilon = np.exp(-decay_rate * episode)
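        # With decay_rate = 0.001 and num_episodes = 1000, epsilon falls from 1.0 to
        # roughly exp(-1) ≈ 0.37 by the final episode, so the agent still explores
        # about a third of the time late in training.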
#print(f"Epsilon: {epsilon}")
# take action and observe reward
#print(f"Action: {action}")
observation, reward, terminated, truncated, info = env.step(action)
#print(observation)
pairTuple = (tuple(observation["agent"]), env.get_is_picked_up())
#print(pairTuple)
new_state = states[pairTuple]
#print(f"New State {new_state}")
#print(f"Reward: {reward}")
# Q-learning algorithm
q[state,action] = q[state,action] + alpha * (reward + gamma * np.max(q[new_state,:])-q[state,action])
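        # Tabular Q-learning update: Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)),
        # i.e. the current estimate is nudged toward the one-step TD target.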
        # Update to our new state
        state = new_state
        if terminated or truncated:
            break
#print(q)
env.trained()
observation, info = env.reset()
pairTuple = (tuple(observation["agent"]), env.get_is_picked_up())
state = states[pairTuple]
rewards = 0
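# Greedy rollout of the learned policy: actions are taken as argmax over the
# Q-table with no exploration, so this run shows what the trained agent does.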
for _ in range(max_steps):
    action = np.argmax(q[state, :])
    observation, reward, terminated, truncated, info = env.step(action)
    pairTuple = (tuple(observation["agent"]), env.get_is_picked_up())
    new_state = states[pairTuple]
    print(f"Current State {state}")
    print(f"Action: {action}")
    print(observation)
    print(pairTuple)
    #print(f"New State {new_state}")
    #print(f"Reward: {reward}")
    rewards += reward
    print(f"score: {rewards}")
    state = new_state
    if terminated or truncated:
        print("Steps Taken: " + str(_ + 1))
        break
env.close()