GoalEnv example with Panda #52

Draft: wants to merge 4 commits into base: dev
@@ -5,24 +5,27 @@

from robot_supervisor_manager import EPISODE_LIMIT, STEPS_PER_EPISODE, SAVE_MODELS_PERIOD


def run(load_path):
# Initialize supervisor object
env = PandaRobotSupervisor()

# The agent used here is trained with the DDPG algorithm (https://arxiv.org/abs/1509.02971).
# We pass (10, ) as numberOfInputs and (7, ) as numberOfOutputs, taken from the gym spaces
agent = DDPGAgent(alpha=0.000025, beta=0.00025, input_dims=[env.observation_space.shape[0]], tau=0.001, batch_size=64, layer1_size=400, layer2_size=400, n_actions=env.action_space.shape[0], load_path=load_path)

episodeCount = 0
agent = DDPGAgent(alpha=0.000025, beta=0.00025, input_dims=[
10], tau=0.001, batch_size=64, layer1_size=400, layer2_size=400, n_actions=env.action_space.shape[0], load_path=load_path)

episodeCount = 0
solved = False # Whether the solved requirement is met

# Run outer loop until the episodes limit is reached or the task is solved
while not solved and episodeCount < EPISODE_LIMIT:
state = env.reset() # Reset robot and get starting observation
env.episodeScore = 0

print("===episodeCount:", episodeCount,"===")
env.target = env.getFromDef("TARGET%s"%(random.randint(1, 10, 1)[0])) # Select one of the targets
print("===episodeCount:", episodeCount, "===")
env.target = env.getFromDef("TARGET%s" % (
random.randint(1, 10, 1)[0])) # Select one of the targets
# Inner loop is the episode loop
for step in range(STEPS_PER_EPISODE):
# In training mode the agent returns the action plus OU noise for exploration
@@ -32,54 +35,58 @@ def run(load_path):
# the done condition
newState, reward, done, info = env.step(act*0.032)
# process of negotiation
while(newState==["StillMoving"]):
while(newState["observation"] == ["StillMoving"]):
newState, reward, done, info = env.step([-1])

# Save the current state transition in agent's memory
agent.remember(state, act, reward, newState, int(done))
agent.remember(state, act, reward,
newState["observation"], int(done))

env.episodeScore += reward # Accumulate episode reward
# Perform a learning step
if done or step==STEPS_PER_EPISODE-1:
if done or step == STEPS_PER_EPISODE-1:
# Save the episode's score
env.episodeScoreList.append(env.episodeScore)
agent.learn()
if episodeCount%SAVE_MODELS_PERIOD==0:
if episodeCount % SAVE_MODELS_PERIOD == 0:
agent.save_models()
solved = env.solved() # Check whether the task is solved
break

state = newState # state for next step is current step's newState
# state for next step is current step's newState
state = newState["observation"]

print("Episode #", episodeCount, "score:", env.episodeScore)
fp = open("./exports/Episode-score.txt","a")
fp = open("./exports/Episode-score.txt", "a")
fp.write(str(env.episodeScore)+'\n')
fp.close()
episodeCount += 1 # Increment episode counter

agent.save_models()
if not solved:
print("Reached episode limit and task was not solved, deploying agent for testing...")
print(
"Reached episode limit and task was not solved, deploying agent for testing...")
else:
print("Task is solved, deploying agent for testing...")

state = env.reset()
env.episodeScore = 0
step = 0
env.target = env.getFromDef("TARGET%s"%(random.randint(1, 10, 1)[0])) # Select one of the targets
env.target = env.getFromDef("TARGET%s" % (
random.randint(1, 10, 1)[0])) # Select one of the targets
while True:
act = agent.choose_action_test(state)
state, reward, done, _ = env.step(act*0.032)
# process of negotiation
while(state==["StillMoving"]):
while(state == ["StillMoving"]):
state, reward, done, info = env.step([-1])

env.episodeScore += reward # Accumulate episode reward
step = step + 1
if done or step==STEPS_PER_EPISODE-1:
if done or step == STEPS_PER_EPISODE-1:
print("Reward accumulated =", env.episodeScore)
env.episodeScore = 0
state = env.reset()
step = 0
env.target = env.getFromDef("TARGET%s"%(random.randint(1, 10, 1)[0]))

env.target = env.getFromDef(
"TARGET%s" % (random.randint(1, 10, 1)[0]))
@@ -89,7 +89,7 @@ def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, name,
T.nn.init.uniform_(self.q.bias.data, -f3, f3)

self.optimizer = optim.Adam(self.parameters(), lr=beta)
self.device = T.device('cuda:0' if T.cuda.is_available() else 'cuda:1')
self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

self.to(self.device)

@@ -142,7 +142,7 @@ def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, n_actions, name,
T.nn.init.uniform_(self.mu.bias.data, -f3, f3)

self.optimizer = optim.Adam(self.parameters(), lr=alpha)
self.device = T.device('cuda:0' if T.cuda.is_available() else 'cuda:1')
self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

self.to(self.device)

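The device change above is a fix rather than a refactor: the old fallback branch selected 'cuda:1', which only ever runs when CUDA is unavailable and therefore makes the subsequent self.to(self.device) fail on CPU-only machines. Falling back to the CPU is the usual pattern:

import torch as T

# Use the first GPU when CUDA is available, otherwise stay on the CPU.
device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')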
@@ -1,14 +1,34 @@
from deepbots.supervisor.controllers.robot_supervisor import RobotSupervisor
from gym.spaces import Box, Discrete
from robot_supervisor_manager import STEPS_PER_EPISODE, MOTOR_VELOCITY
from deepbots.supervisor.controllers.robot_supervisor import RobotGoalSupervisor
from gym import spaces
import numpy as np
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..')) #
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from ArmUtil import Func, ToArmCoord
from robot_supervisor_manager import STEPS_PER_EPISODE, MOTOR_VELOCITY

class PandaRobotSupervisor(RobotSupervisor):

class PandaRobotSupervisor(RobotGoalSupervisor):
"""
Observation:
desired_goal : the goal that the agent should attempt to achieve

Type: Box(3)
Num Observation Min Max
0 Target x -Inf Inf
1 Target y -Inf Inf
2 Target z -Inf Inf

achieved_goal : the goal currently achieved (the end-effector position)

Type: Box(3)
Num Observation Min Max
0 End effector x -Inf Inf
1 End effector y -Inf Inf
2 End effector z -Inf Inf

observation : the actual observations of the environment as per usual

Type: Box(10)
Num Observation Min(rad) Max(rad)
0 Target x -Inf Inf
@@ -21,7 +41,7 @@ class PandaRobotSupervisor(RobotSupervisor):
7 Position Sensor on A5 -2.8972 2.8972
8 Position Sensor on A6 -0.0175 3.7525
9 Position Sensor on A7 -2.8972 2.8972

Actions:
Type: Continuous
Num Min Max Desc
@@ -45,32 +65,61 @@ def __init__(self):
"""

super().__init__()
self.obsshape = {
"achieved_goal": 3,
"observation": 7
}

# Set up gym spaces
self.observation_space = Box(low=np.array([-np.inf, -np.inf, -np.inf, -2.8972, -1.7628, -2.8972, -3.0718, -2.8972, -0.0175, -2.8972]),
high=np.array([np.inf, np.inf, np.inf, 2.8972, 1.7628, 2.8972, -0.0698, 2.8972, 3.7525, 2.8972]),
dtype=np.float64)
self.action_space = Box(low=np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]), high=np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]), dtype=np.float64)
self.observation_space = spaces.Dict(
dict(
desired_goal=spaces.Box(
low=np.array([-np.inf, -np.inf, -np.inf]),
high=np.array([np.inf, np.inf, np.inf]),
dtype=np.float64
),
achieved_goal=spaces.Box(
low=np.array([-np.inf, -np.inf, -np.inf]),
high=np.array([np.inf, np.inf, np.inf]),
dtype=np.float64
),
observation=spaces.Box(
low=np.array([-np.inf, -np.inf, -np.inf, -2.8972, -
1.7628, -2.8972, -3.0718, -2.8972, -0.0175, -2.8972]),
high=np.array([np.inf, np.inf, np.inf, 2.8972, 1.7628,
2.8972, -0.0698, 2.8972, 3.7525, 2.8972]),
dtype=np.float64
),
)
)

self.action_space = spaces.Box(
low=np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]),
high=np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]),
dtype=np.float64)
# Set up various robot components
self.robot = self.getSelf() # Grab the robot reference from the supervisor to access various robot methods
self.positionSensorList = Func.get_All_positionSensors(self, self.timestep)
# Grab the robot reference from the supervisor to access various robot methods
self.robot = self.getSelf()
self.positionSensorList = Func.get_All_positionSensors(
self, self.timestep)
self.endEffector = self.getFromDef("endEffector")

# Select one of the targets
self.target = self.getFromDef("TARGET%s"%(np.random.randint(1, 10, 1)[0]))
self.target = self.getFromDef(
"TARGET%s" % (np.random.randint(1, 10, 1)[0]))

self.setup_motors()

# Set up misc
self.episodeScore = 0 # Score accumulated during an episode
self.episodeScoreList = [] # A list to save all the episode scores, used to check if task is solved

# A list to save all the episode scores, used to check if task is solved
self.episodeScoreList = []

# Set these to ensure that the robot stops moving
self.motorPositionArr = np.zeros(7)
self.motorPositionArr_target = np.zeros(7)
self.distance = float("inf")

# handshaking limit
self.cnt_handshaking = 0
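
A small, self-contained sketch (not part of the PR) of the goal-style observation space defined above, with the joint limits dropped for brevity; sampling it shows the structure that get_observations() below is expected to return:

import numpy as np
from gym import spaces

observation_space = spaces.Dict(dict(
    desired_goal=spaces.Box(low=-np.inf, high=np.inf, shape=(3,), dtype=np.float64),   # target x, y, z
    achieved_goal=spaces.Box(low=-np.inf, high=np.inf, shape=(3,), dtype=np.float64),  # end-effector x, y, z
    observation=spaces.Box(low=-np.inf, high=np.inf, shape=(10,), dtype=np.float64),   # target x, y, z + 7 joint positions
))

sample = observation_space.sample()
print(sample["observation"].shape, sample["achieved_goal"].shape, sample["desired_goal"].shape)
# (10,) (3,) (3,)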

@@ -84,20 +133,33 @@ def get_observations(self):
"""
# process of negotiation
prec = 0.0001
err = np.absolute(np.array(self.motorPositionArr)-np.array(self.motorPositionArr_target)) < prec
if not np.all(err) and self.cnt_handshaking<20:
err = np.absolute(np.array(self.motorPositionArr) -
np.array(self.motorPositionArr_target)) < prec
message = None
if not np.all(err) and self.cnt_handshaking < 20:
self.cnt_handshaking = self.cnt_handshaking + 1
return ["StillMoving"]
message = ["StillMoving"]
else:
self.cnt_handshaking = 0
# ----------------------


# observation and desired_goal
targetPosition = ToArmCoord.convert(self.target.getPosition())
message = [i for i in targetPosition]
message.extend([i for i in self.motorPositionArr])
return message

def get_reward(self, action):
if message == None:
message = [i for i in targetPosition]
message.extend([i for i in self.motorPositionArr])

# achieved_goal
endEffectorPosition = ToArmCoord.convert(
self.endEffector.getPosition())

return {
"observation": message,
"achieved_goal": endEffectorPosition,
"desired_goal": targetPosition,
}

def compute_reward(self, achieved_goal, desired_goal, info):
"""
Reward is - 2-norm for every step taken (extra points for getting close enough to the target)

@@ -106,15 +168,10 @@ def get_reward(self, action):
:return: - 2-norm (+ extra points)
:rtype: float
"""
targetPosition = self.target.getPosition()
targetPosition = ToArmCoord.convert(targetPosition)

endEffectorPosition = self.endEffector.getPosition()
endEffectorPosition = ToArmCoord.convert(endEffectorPosition)
self.distance = np.linalg.norm(
[desired_goal[0]-achieved_goal[0], desired_goal[1]-achieved_goal[1], desired_goal[2]-achieved_goal[2]])
reward = -self.distance # - 2-norm

self.distance = np.linalg.norm([targetPosition[0]-endEffectorPosition[0],targetPosition[1]-endEffectorPosition[1],targetPosition[2]-endEffectorPosition[2]])
reward = -self.distance # - 2-norm

# Extra points
if self.distance < 0.01:
reward = reward + 1.5
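
A worked example (a sketch, not part of the PR) of the reward computed above: the negative Euclidean distance between achieved_goal and desired_goal, plus the +1.5 bonus shown in the diff once the end effector is within 0.01 m of the target:

import numpy as np

achieved_goal = np.array([0.30, 0.10, 0.400])  # hypothetical end-effector position
desired_goal = np.array([0.30, 0.10, 0.405])   # hypothetical target position

distance = np.linalg.norm(desired_goal - achieved_goal)  # 0.005
reward = -distance                                       # -0.005
if distance < 0.01:
    reward += 1.5                                        # within 1 cm of the target: add the bonus
print(round(reward, 3))                                  # 1.495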
@@ -145,7 +202,8 @@ def solved(self):
:rtype: bool
"""
if len(self.episodeScoreList) > 500: # Over 500 trials thus far
if np.mean(self.episodeScoreList[-500:]) > 120.0: # Last 500 episode scores average value
# Last 500 episode scores average value
if np.mean(self.episodeScoreList[-500:]) > 120.0:
return True
return False

@@ -156,24 +214,24 @@ def get_default_observation(self):
:return: Starting observation zero vector
:rtype: list
"""
Obs = [0.0 for _ in range(self.observation_space.shape[0])]
Obs = [0.0 for _ in range(10)]
Obs[3] = -0.0698
return Obs

def motorToRange(self, motorPosition, i):
if(i==0):
if(i == 0):
motorPosition = np.clip(motorPosition, -2.8972, 2.8972)
elif(i==1):
elif(i == 1):
motorPosition = np.clip(motorPosition, -1.7628, 1.7628)
elif(i==2):
elif(i == 2):
motorPosition = np.clip(motorPosition, -2.8972, 2.8972)
elif(i==3):
elif(i == 3):
motorPosition = np.clip(motorPosition, -3.0718, -0.0698)
elif(i==4):
elif(i == 4):
motorPosition = np.clip(motorPosition, -2.8972, 2.8972)
elif(i==5):
elif(i == 5):
motorPosition = np.clip(motorPosition, -0.0175, 3.7525)
elif(i==6):
elif(i == 6):
motorPosition = np.clip(motorPosition, -2.8972, 2.8972)
else:
pass
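
motorToRange above clips one joint at a time through an if/elif chain; an equivalent, more compact form (a sketch only, not a suggested change to the PR) clips all seven commanded positions against the same limits in a single call:

import numpy as np

# Panda joint limits (rad), in the same order as the if/elif chain above.
JOINT_LOW = np.array([-2.8972, -1.7628, -2.8972, -3.0718, -2.8972, -0.0175, -2.8972])
JOINT_HIGH = np.array([2.8972, 1.7628, 2.8972, -0.0698, 2.8972, 3.7525, 2.8972])

def motor_to_range(motor_positions):
    return np.clip(motor_positions, JOINT_LOW, JOINT_HIGH)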
@@ -188,20 +246,23 @@ def apply_action(self, action):
:type action: list of float
"""
# ignore this action and keep moving
if action[0]==-1 and len(action)==1:
if action[0] == -1 and len(action) == 1:
for i in range(7):
self.motorPositionArr[i] = self.positionSensorList[i].getValue()
self.motorPositionArr[i] = self.positionSensorList[i].getValue(
)
self.motorList[i].setVelocity(MOTOR_VELOCITY)
self.motorList[i].setPosition(self.motorPositionArr_target[i])
return

self.motorPositionArr = np.array(Func.getValue(self.positionSensorList))

self.motorPositionArr = np.array(
Func.getValue(self.positionSensorList))
for i in range(7):
motorPosition = self.motorPositionArr[i] + action[i]
motorPosition = self.motorToRange(motorPosition, i)
self.motorList[i].setVelocity(MOTOR_VELOCITY)
self.motorList[i].setPosition(motorPosition)
self.motorPositionArr_target[i]=motorPosition # Update motorPositionArr_target
# Update motorPositionArr_target
self.motorPositionArr_target[i] = motorPosition

def setup_motors(self):
"""