[MLA-1768] retrain Match3 scene (#4943)
* improved settings and move to default_settings

* update models
Chris Elion authored Feb 17, 2021
1 parent f52f19b commit a06b1da
Showing 8 changed files with 49 additions and 69 deletions.
@@ -89,7 +89,7 @@ MonoBehaviour:
VectorActionDescriptions: []
VectorActionSpaceType: 0
hasUpgradedBrainParametersWithActionSpec: 1
-  m_Model: {fileID: 11400000, guid: 48d14da88fea74d0693c691c6e3f2e34, type: 3}
+  m_Model: {fileID: 11400000, guid: 28ccdfd7cb3d941ce8af0ab89e06130a, type: 3}
m_InferenceDevice: 2
m_BehaviorType: 0
m_BehaviorName: Match3VisualObs
Binary file not shown.
Binary file not shown.

This file was deleted.

Binary file not shown.

Some generated files are not rendered by default.

86 changes: 31 additions & 55 deletions config/ppo/Match3.yaml
@@ -1,72 +1,48 @@
+default_settings:
+  trainer_type: ppo
+  hyperparameters:
+    batch_size: 16
+    buffer_size: 120
+    learning_rate: 0.0003
+    beta: 0.005
+    epsilon: 0.2
+    lambd: 0.99
+    num_epoch: 3
+    learning_rate_schedule: constant
+  network_settings:
+    normalize: true
+    hidden_units: 256
+    num_layers: 4
+    vis_encode_type: match3
+  reward_signals:
+    extrinsic:
+      gamma: 0.99
+      strength: 1.0
+  keep_checkpoints: 5
+  max_steps: 5000000
+  time_horizon: 128
+  summary_freq: 10000
+  threaded: true
+
behaviors:
  Match3VectorObs:
-    trainer_type: ppo
-    hyperparameters:
-      batch_size: 64
-      buffer_size: 12000
-      learning_rate: 0.0003
-      beta: 0.001
-      epsilon: 0.2
-      lambd: 0.99
-      num_epoch: 3
-      learning_rate_schedule: constant
-    network_settings:
-      normalize: true
-      hidden_units: 128
-      num_layers: 2
-      vis_encode_type: match3
-    reward_signals:
-      extrinsic:
-        gamma: 0.99
-        strength: 1.0
-    keep_checkpoints: 5
-    max_steps: 5000000
-    time_horizon: 1000
-    summary_freq: 10000
-    threaded: true
  Match3VisualObs:
-    trainer_type: ppo
-    hyperparameters:
-      batch_size: 64
-      buffer_size: 12000
-      learning_rate: 0.0003
-      beta: 0.001
-      epsilon: 0.2
-      lambd: 0.99
-      num_epoch: 3
-      learning_rate_schedule: constant
-    network_settings:
-      normalize: true
-      hidden_units: 128
-      num_layers: 2
-      vis_encode_type: match3
-    reward_signals:
-      extrinsic:
-        gamma: 0.99
-        strength: 1.0
-    keep_checkpoints: 5
-    max_steps: 5000000
-    time_horizon: 1000
-    summary_freq: 10000
-    threaded: true
  Match3SimpleHeuristic:
    # Settings can be very simple since we don't care about actually training the model
-    trainer_type: ppo
    hyperparameters:
-      batch_size: 64
-      buffer_size: 128
+      batch_size: 16
+      buffer_size: 120
    network_settings:
      hidden_units: 4
      num_layers: 1
    max_steps: 5000000
    summary_freq: 10000
    threaded: true
-  Match3GreedyHeuristic:
+  Match3SmartHeuristic:
    # Settings can be very simple since we don't care about actually training the model
-    trainer_type: ppo
    hyperparameters:
-      batch_size: 64
-      buffer_size: 128
+      batch_size: 16
+      buffer_size: 120
    network_settings:
      hidden_units: 4
      num_layers: 1
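For context on why the per-behavior blocks shrink so much: the ML-Agents trainer treats default_settings as the baseline configuration for every entry under behaviors:, and an individual behavior only restates the fields it overrides. A minimal sketch of that merge, using a hypothetical behavior name that is not part of this commit:

# Hypothetical illustration of default_settings merging; not from this commit.
default_settings:
  trainer_type: ppo
  hyperparameters:
    batch_size: 16      # used by every behavior unless overridden
    buffer_size: 120
  max_steps: 5000000

behaviors:
  MyBehavior:           # hypothetical behavior name
    hyperparameters:
      batch_size: 64    # overrides the default of 16; all other defaults still apply

In the file above, the two heuristic behaviors override only the batch settings and a tiny network (hidden_units: 4, num_layers: 1) because, as their comments note, their policies are never expected to be used.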
4 changes: 2 additions & 2 deletions docs/Learning-Environment-Examples.md
@@ -551,7 +551,7 @@ drop down. New pieces are spawned randomly at the top, with a chance of being
- Observations and actions are defined with a sensor and actuator respectively.
- Float Properties: None
- Benchmark Mean Reward:
-  - 37.2 for visual observations
-  - 37.6 for vector observations
+  - 39.5 for visual observations
+  - 38.5 for vector observations
- 34.2 for simple heuristic (pick a random valid move)
- 37.0 for greedy heuristic (pick the highest-scoring valid move)
