GoalEnv example with Panda #52

Draft: wants to merge 4 commits into base: dev
@@ -5,24 +5,27 @@

from robot_supervisor_manager import EPISODE_LIMIT, STEPS_PER_EPISODE, SAVE_MODELS_PERIOD


def run(load_path):
# Initialize supervisor object
env = PandaRobotSupervisor()

# The agent used here is trained with the DDPG algorithm (https://arxiv.org/abs/1509.02971).
# We pass (10, ) as numberOfInputs and (7, ) as numberOfOutputs, taken from the gym spaces
agent = DDPGAgent(alpha=0.000025, beta=0.00025, input_dims=[env.observation_space.shape[0]], tau=0.001, batch_size=64, layer1_size=400, layer2_size=400, n_actions=env.action_space.shape[0], load_path=load_path)

episodeCount = 0
agent = DDPGAgent(alpha=0.000025, beta=0.00025, input_dims=[
10], tau=0.001, batch_size=64, layer1_size=400, layer2_size=400, n_actions=env.action_space.shape[0], load_path=load_path)

episodeCount = 0
solved = False # Whether the solved requirement is met

# Run outer loop until the episodes limit is reached or the task is solved
while not solved and episodeCount < EPISODE_LIMIT:
state = env.reset() # Reset robot and get starting observation
env.episodeScore = 0

print("===episodeCount:", episodeCount,"===")
env.target = env.getFromDef("TARGET%s"%(random.randint(1, 10, 1)[0])) # Select one of the targets
print("===episodeCount:", episodeCount, "===")
env.target = env.getFromDef("TARGET%s" % (
random.randint(1, 10, 1)[0])) # Select one of the targets
# Inner loop is the episode loop
for step in range(STEPS_PER_EPISODE):
# In training mode the agent returns the action plus OU noise for exploration
@@ -32,54 +35,58 @@ def run(load_path):
# the done condition
newState, reward, done, info = env.step(act*0.032)
# process of negotiation
while(newState==["StillMoving"]):
while(newState["observation"] == ["StillMoving"]):
newState, reward, done, info = env.step([-1])

# Save the current state transition in agent's memory
agent.remember(state, act, reward, newState, int(done))
agent.remember(state, act, reward,
newState["observation"], int(done))

env.episodeScore += reward # Accumulate episode reward
# Perform a learning step
if done or step==STEPS_PER_EPISODE-1:
if done or step == STEPS_PER_EPISODE-1:
# Save the episode's score
env.episodeScoreList.append(env.episodeScore)
agent.learn()
if episodeCount%SAVE_MODELS_PERIOD==0:
if episodeCount % SAVE_MODELS_PERIOD == 0:
agent.save_models()
solved = env.solved() # Check whether the task is solved
break

state = newState # state for next step is current step's newState
# state for next step is current step's newState
state = newState["observation"]

print("Episode #", episodeCount, "score:", env.episodeScore)
fp = open("./exports/Episode-score.txt","a")
fp = open("./exports/Episode-score.txt", "a")
fp.write(str(env.episodeScore)+'\n')
fp.close()
episodeCount += 1 # Increment episode counter

agent.save_models()
if not solved:
print("Reached episode limit and task was not solved, deploying agent for testing...")
print(
"Reached episode limit and task was not solved, deploying agent for testing...")
else:
print("Task is solved, deploying agent for testing...")

state = env.reset()
env.episodeScore = 0
step = 0
env.target = env.getFromDef("TARGET%s"%(random.randint(1, 10, 1)[0])) # Select one of the targets
env.target = env.getFromDef("TARGET%s" % (
random.randint(1, 10, 1)[0])) # Select one of the targets
while True:
act = agent.choose_action_test(state)
state, reward, done, _ = env.step(act*0.032)
# process of negotiation
while(state==["StillMoving"]):
while(state == ["StillMoving"]):
state, reward, done, info = env.step([-1])

env.episodeScore += reward # Accumulate episode reward
step = step + 1
if done or step==STEPS_PER_EPISODE-1:
if done or step == STEPS_PER_EPISODE-1:
print("Reward accumulated =", env.episodeScore)
env.episodeScore = 0
state = env.reset()
step = 0
env.target = env.getFromDef("TARGET%s"%(random.randint(1, 10, 1)[0]))

env.target = env.getFromDef(
"TARGET%s" % (random.randint(1, 10, 1)[0]))
@@ -89,7 +89,7 @@ def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, name,
T.nn.init.uniform_(self.q.bias.data, -f3, f3)

self.optimizer = optim.Adam(self.parameters(), lr=beta)
self.device = T.device('cuda:0' if T.cuda.is_available() else 'cuda:1')
self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

self.to(self.device)

@@ -142,7 +142,7 @@ def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, n_actions, name,
T.nn.init.uniform_(self.mu.bias.data, -f3, f3)

self.optimizer = optim.Adam(self.parameters(), lr=alpha)
self.device = T.device('cuda:0' if T.cuda.is_available() else 'cuda:1')
self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

self.to(self.device)

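The device change above is a fix rather than a refactor: the old fallback branch selected 'cuda:1', which only ever runs when CUDA is unavailable and therefore makes the subsequent self.to(self.device) fail on CPU-only machines. Falling back to the CPU is the usual pattern:

import torch as T

# Use the first GPU when CUDA is available, otherwise stay on the CPU.
device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')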
@@ -1,14 +1,34 @@
from deepbots.supervisor.controllers.robot_supervisor import RobotSupervisor
from gym.spaces import Box, Discrete
from robot_supervisor_manager import STEPS_PER_EPISODE, MOTOR_VELOCITY
from deepbots.supervisor.controllers.robot_supervisor import RobotGoalSupervisor
from gym import spaces
import numpy as np
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..')) #
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from ArmUtil import Func, ToArmCoord
from robot_supervisor_manager import STEPS_PER_EPISODE, MOTOR_VELOCITY

class PandaRobotSupervisor(RobotSupervisor):

class PandaRobotSupervisor(RobotGoalSupervisor):
"""
Observation:
desired_goal : the goal that the agent should attempt to achieve

Type: Box(3)
Num Observation Min Max
0 Target x -Inf Inf
1 Target y -Inf Inf
2 Target z -Inf Inf

achieved_goal : the goal currently achieved (the end-effector position)

Type: Box(3)
Num Observation Min Max
0 End effector x -Inf Inf
1 End effector y -Inf Inf
2 End effector z -Inf Inf

observation : the actual observations of the environment as per usual

Type: Box(10)
Num Observation Min(rad) Max(rad)
0 Target x -Inf Inf
@@ -21,7 +41,7 @@ class PandaRobotSupervisor(RobotSupervisor):
7 Position Sensor on A5 -2.8972 2.8972
8 Position Sensor on A6 -0.0175 3.7525
9 Position Sensor on A7 -2.8972 2.8972

Actions:
Type: Continuous
Num Min Max Desc
@@ -45,32 +65,61 @@ def __init__(self):
"""

super().__init__()
self.obsshape = {
"achieved_goal": 3,
"observation": 7
}

# Set up gym spaces
self.observation_space = Box(low=np.array([-np.inf, -np.inf, -np.inf, -2.8972, -1.7628, -2.8972, -3.0718, -2.8972, -0.0175, -2.8972]),
high=np.array([np.inf, np.inf, np.inf, 2.8972, 1.7628, 2.8972, -0.0698, 2.8972, 3.7525, 2.8972]),
dtype=np.float64)
self.action_space = Box(low=np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]), high=np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]), dtype=np.float64)
self.observation_space = spaces.Dict(
dict(
desired_goal=spaces.Box(
low=np.array([-np.inf, -np.inf, -np.inf]),
high=np.array([np.inf, np.inf, np.inf]),
dtype=np.float64
),
achieved_goal=spaces.Box(
low=np.array([-np.inf, -np.inf, -np.inf]),
high=np.array([np.inf, np.inf, np.inf]),
dtype=np.float64
),
observation=spaces.Box(
low=np.array([-np.inf, -np.inf, -np.inf, -2.8972, -
1.7628, -2.8972, -3.0718, -2.8972, -0.0175, -2.8972]),
high=np.array([np.inf, np.inf, np.inf, 2.8972, 1.7628,
2.8972, -0.0698, 2.8972, 3.7525, 2.8972]),
dtype=np.float64
),
)
)

self.action_space = spaces.Box(
low=np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]),
high=np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]),
dtype=np.float64)
# Set up various robot components
self.robot = self.getSelf() # Grab the robot reference from the supervisor to access various robot methods
self.positionSensorList = Func.get_All_positionSensors(self, self.timestep)
# Grab the robot reference from the supervisor to access various robot methods
self.robot = self.getSelf()
self.positionSensorList = Func.get_All_positionSensors(
self, self.timestep)
self.endEffector = self.getFromDef("endEffector")

# Select one of the targets
self.target = self.getFromDef("TARGET%s"%(np.random.randint(1, 10, 1)[0]))
self.target = self.getFromDef(
"TARGET%s" % (np.random.randint(1, 10, 1)[0]))

self.setup_motors()

# Set up misc
self.episodeScore = 0 # Score accumulated during an episode
self.episodeScoreList = [] # A list to save all the episode scores, used to check if task is solved

# A list to save all the episode scores, used to check if task is solved
self.episodeScoreList = []

# Set these to ensure that the robot stops moving
self.motorPositionArr = np.zeros(7)
self.motorPositionArr_target = np.zeros(7)
self.distance = float("inf")

# handshaking limit
self.cnt_handshaking = 0
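
A small, self-contained sketch (not part of the PR) of the goal-style observation space defined above, with the joint limits dropped for brevity; sampling it shows the structure that get_observations() below is expected to return:

import numpy as np
from gym import spaces

observation_space = spaces.Dict(dict(
    desired_goal=spaces.Box(low=-np.inf, high=np.inf, shape=(3,), dtype=np.float64),   # target x, y, z
    achieved_goal=spaces.Box(low=-np.inf, high=np.inf, shape=(3,), dtype=np.float64),  # end-effector x, y, z
    observation=spaces.Box(low=-np.inf, high=np.inf, shape=(10,), dtype=np.float64),   # target x, y, z + 7 joint positions
))

sample = observation_space.sample()
print(sample["observation"].shape, sample["achieved_goal"].shape, sample["desired_goal"].shape)
# (10,) (3,) (3,)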

@@ -84,20 +133,33 @@ def get_observations(self):
"""
# process of negotiation
prec = 0.0001
err = np.absolute(np.array(self.motorPositionArr)-np.array(self.motorPositionArr_target)) < prec
if not np.all(err) and self.cnt_handshaking<20:
err = np.absolute(np.array(self.motorPositionArr) -
np.array(self.motorPositionArr_target)) < prec
message = None
if not np.all(err) and self.cnt_handshaking < 20:
self.cnt_handshaking = self.cnt_handshaking + 1
return ["StillMoving"]
message = ["StillMoving"]
else:
self.cnt_handshaking = 0
# ----------------------


# observation and desired_goal
targetPosition = ToArmCoord.convert(self.target.getPosition())
message = [i for i in targetPosition]
message.extend([i for i in self.motorPositionArr])
return message

def get_reward(self, action):
if message == None:
message = [i for i in targetPosition]
message.extend([i for i in self.motorPositionArr])

# achieved_goal
endEffectorPosition = ToArmCoord.convert(
self.endEffector.getPosition())

return {
"observation": message,
"achieved_goal": endEffectorPosition,
"desired_goal": targetPosition,
}

def compute_reward(self, achieved_goal, desired_goal, info):
"""
Reward is - 2-norm for every step taken (extra points for getting close enough to the target)

@@ -106,15 +168,10 @@ def get_reward(self, action):
:return: - 2-norm (+ extra points)
:rtype: float
"""
targetPosition = self.target.getPosition()
targetPosition = ToArmCoord.convert(targetPosition)

endEffectorPosition = self.endEffector.getPosition()
endEffectorPosition = ToArmCoord.convert(endEffectorPosition)
self.distance = np.linalg.norm(
[desired_goal[0]-achieved_goal[0], desired_goal[1]-achieved_goal[1], desired_goal[2]-achieved_goal[2]])
reward = -self.distance # - 2-norm

self.distance = np.linalg.norm([targetPosition[0]-endEffectorPosition[0],targetPosition[1]-endEffectorPosition[1],targetPosition[2]-endEffectorPosition[2]])
reward = -self.distance # - 2-norm

# Extra points
if self.distance < 0.01:
reward = reward + 1.5
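
A worked example (a sketch, not part of the PR) of the reward computed above: the negative Euclidean distance between achieved_goal and desired_goal, plus the +1.5 bonus shown in the diff once the end effector is within 0.01 m of the target:

import numpy as np

achieved_goal = np.array([0.30, 0.10, 0.400])  # hypothetical end-effector position
desired_goal = np.array([0.30, 0.10, 0.405])   # hypothetical target position

distance = np.linalg.norm(desired_goal - achieved_goal)  # 0.005
reward = -distance                                       # -0.005
if distance < 0.01:
    reward += 1.5                                        # within 1 cm of the target: add the bonus
print(round(reward, 3))                                  # 1.495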
@@ -145,7 +202,8 @@ def solved(self):
:rtype: bool
"""
if len(self.episodeScoreList) > 500: # Over 500 trials thus far
if np.mean(self.episodeScoreList[-500:]) > 120.0: # Last 500 episode scores average value
# Last 500 episode scores average value
if np.mean(self.episodeScoreList[-500:]) > 120.0:
return True
return False

@@ -156,24 +214,24 @@ def get_default_observation(self):
:return: Starting observation zero vector
:rtype: list
"""
Obs = [0.0 for _ in range(self.observation_space.shape[0])]
Obs = [0.0 for _ in range(10)]
Obs[3] = -0.0698
return Obs

def motorToRange(self, motorPosition, i):
if(i==0):
if(i == 0):
motorPosition = np.clip(motorPosition, -2.8972, 2.8972)
elif(i==1):
elif(i == 1):
motorPosition = np.clip(motorPosition, -1.7628, 1.7628)
elif(i==2):
elif(i == 2):
motorPosition = np.clip(motorPosition, -2.8972, 2.8972)
elif(i==3):
elif(i == 3):
motorPosition = np.clip(motorPosition, -3.0718, -0.0698)
elif(i==4):
elif(i == 4):
motorPosition = np.clip(motorPosition, -2.8972, 2.8972)
elif(i==5):
elif(i == 5):
motorPosition = np.clip(motorPosition, -0.0175, 3.7525)
elif(i==6):
elif(i == 6):
motorPosition = np.clip(motorPosition, -2.8972, 2.8972)
else:
pass
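
motorToRange above clips one joint at a time through an if/elif chain; an equivalent, more compact form (a sketch only, not a suggested change to the PR) clips all seven commanded positions against the same limits in a single call:

import numpy as np

# Panda joint limits (rad), in the same order as the if/elif chain above.
JOINT_LOW = np.array([-2.8972, -1.7628, -2.8972, -3.0718, -2.8972, -0.0175, -2.8972])
JOINT_HIGH = np.array([2.8972, 1.7628, 2.8972, -0.0698, 2.8972, 3.7525, 2.8972])

def motor_to_range(motor_positions):
    return np.clip(motor_positions, JOINT_LOW, JOINT_HIGH)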
@@ -188,20 +246,23 @@ def apply_action(self, action):
:type action: list of float
"""
# ignore this action and keep moving
if action[0]==-1 and len(action)==1:
if action[0] == -1 and len(action) == 1:
for i in range(7):
self.motorPositionArr[i] = self.positionSensorList[i].getValue()
self.motorPositionArr[i] = self.positionSensorList[i].getValue(
)
self.motorList[i].setVelocity(MOTOR_VELOCITY)
self.motorList[i].setPosition(self.motorPositionArr_target[i])
return

self.motorPositionArr = np.array(Func.getValue(self.positionSensorList))

self.motorPositionArr = np.array(
Func.getValue(self.positionSensorList))
for i in range(7):
motorPosition = self.motorPositionArr[i] + action[i]
motorPosition = self.motorToRange(motorPosition, i)
self.motorList[i].setVelocity(MOTOR_VELOCITY)
self.motorList[i].setPosition(motorPosition)
self.motorPositionArr_target[i]=motorPosition # Update motorPositionArr_target
# Update motorPositionArr_target
self.motorPositionArr_target[i] = motorPosition

def setup_motors(self):
"""