Support accelerate multi-GPU training #558

Draft
wants to merge 29 commits into base: user/rcadene/2024_10_07_vla
Changes from all commits
29 commits
26f97cf
Enable CI for robot devices with mocked versions (#398)
Cadene Oct 3, 2024
1a343c3
Add support for Stretch (hello-robot) (#409)
aliberts Oct 4, 2024
d5b6696
Fix nightly by updating .cache in dockerignore (#464)
Cadene Oct 7, 2024
c29e70e
Fix issue with wrong using index instead of camera_index in opencv (#…
helper2424 Oct 9, 2024
97b1feb
Add policy/act_aloha_real.yaml + env/act_real.yaml (#429)
Cadene Oct 10, 2024
77478d5
Refactor `record` with `add_frame` (#468)
Cadene Oct 16, 2024
cd0fc26
Make `say(blocking=True)` work for Linux (#460)
alexander-soare Oct 17, 2024
c351e1f
Fix gymnasium version as pre-1.0.0 (#471)
zimka Oct 18, 2024
2efee45
Update 9_use_aloha.md, missing comma (#479)
bastiankrohg Oct 23, 2024
114870d
Fix link (#482)
ohharsen Oct 23, 2024
07e8716
Add FeetechMotorsBus, SO-100, Moss-v1 (#419)
Cadene Oct 25, 2024
55e4ff6
Fix autocalib moss (#486)
Cadene Oct 26, 2024
172809a
[Fix] Move back to manual calibration (#488)
Cadene Oct 26, 2024
538455a
feat: enable to use multiple rgb encoders per camera in diffusion pol…
HiroIshida Oct 30, 2024
e0df56d
Fix config file (#495)
ohharsen Oct 31, 2024
963738d
fix: broken images and a few minor typos in README (#499)
ivelin Nov 5, 2024
8af6935
Add support for Windows (#494)
brainwavecoder9 Nov 22, 2024
20f4667
bug causes error uploading to huggingface, unicode issue on windows. …
resolver101757 Nov 22, 2024
975c1c2
Add distinction between two unallowed cases in name check "eval_" (#489)
J4nn1K Nov 22, 2024
96c7052
Rename deprecated argument (temporal_ensemble_momentum) (#490)
KasparSLT Nov 25, 2024
32eb0ce
Dataset v2.0 (#461)
aliberts Nov 29, 2024
93e6c3b
add changes from accelerate branch
Nov 30, 2024
ec66c36
training with accelerate utils
Nov 30, 2024
9f11f8a
disable rendering
Nov 30, 2024
bcd902b
remove disable rendering
Nov 30, 2024
6dbe067
log accelerate to wandb and fix symlink
Dec 1, 2024
d3cbb77
fix loading to wandb
Dec 1, 2024
c70e17a
fix eval on aloha
Dec 6, 2024
c3339f4
precommit
Dec 8, 2024
6 changes: 5 additions & 1 deletion .dockerignore
@@ -65,14 +65,18 @@ htmlcov/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Ignore .cache except calibration
.cache/*
!.cache/calibration/
!.cache/calibration/**

# Translations
*.mo
*.pot
2 changes: 1 addition & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
@@ -21,7 +21,7 @@ Provide a simple way for the reviewer to try out your changes.

Examples:
```bash
DATA_DIR=tests/data pytest -sx tests/test_stuff.py::test_something
pytest -sx tests/test_stuff.py::test_something
```
```bash
python lerobot/scripts/train.py --some.option=true
8 changes: 1 addition & 7 deletions .github/workflows/nightly-tests.yml
@@ -7,10 +7,8 @@ on:
schedule:
- cron: "0 2 * * *"

env:
DATA_DIR: tests/data
# env:
# SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}

jobs:
run_all_tests_cpu:
name: CPU
@@ -30,13 +28,9 @@ jobs:
working-directory: /lerobot
steps:
- name: Tests
env:
DATA_DIR: tests/data
run: pytest -v --cov=./lerobot --disable-warnings tests

- name: Tests end-to-end
env:
DATA_DIR: tests/data
run: make test-end-to-end


20 changes: 13 additions & 7 deletions .github/workflows/test.yml
@@ -11,6 +11,7 @@ on:
- ".github/**"
- "poetry.lock"
- "Makefile"
- ".cache/**"
push:
branches:
- main
@@ -21,27 +22,31 @@ on:
- ".github/**"
- "poetry.lock"
- "Makefile"
- ".cache/**"

jobs:
pytest:
name: Pytest
runs-on: ubuntu-latest
env:
DATA_DIR: tests/data
MUJOCO_GL: egl
steps:
- uses: actions/checkout@v4
with:
lfs: true # Ensure LFS files are pulled

- name: Install apt dependencies
run: sudo apt-get update && sudo apt-get install -y libegl1-mesa-dev ffmpeg
# portaudio19-dev is needed to install pyaudio
run: |
sudo apt-get update && \
sudo apt-get install -y libegl1-mesa-dev ffmpeg portaudio19-dev

- name: Install poetry
run: |
pipx install poetry && poetry config virtualenvs.in-project true
echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH

# TODO(rcadene, aliberts): python 3.12 seems to be used in the tests, not python 3.10
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
@@ -60,12 +65,10 @@ jobs:
-W ignore::UserWarning:gymnasium.utils.env_checker:247 \
&& rm -rf tests/outputs outputs


pytest-minimal:
name: Pytest (minimal install)
runs-on: ubuntu-latest
env:
DATA_DIR: tests/data
MUJOCO_GL: egl
steps:
- uses: actions/checkout@v4
@@ -80,6 +83,7 @@
pipx install poetry && poetry config virtualenvs.in-project true
echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH

# TODO(rcadene, aliberts): python 3.12 seems to be used in the tests, not python 3.10
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
@@ -97,20 +101,22 @@
-W ignore::UserWarning:gymnasium.utils.env_checker:247 \
&& rm -rf tests/outputs outputs


# TODO(aliberts, rcadene): redesign after v2 migration / removing hydra
end-to-end:
name: End-to-end
runs-on: ubuntu-latest
env:
DATA_DIR: tests/data
MUJOCO_GL: egl
steps:
- uses: actions/checkout@v4
with:
lfs: true # Ensure LFS files are pulled

- name: Install apt dependencies
run: sudo apt-get update && sudo apt-get install -y libegl1-mesa-dev
# portaudio19-dev is needed to install pyaudio
run: |
sudo apt-get update && \
sudo apt-get install -y libegl1-mesa-dev portaudio19-dev

- name: Install poetry
run: |
3 changes: 3 additions & 0 deletions .gitignore
@@ -153,3 +153,6 @@ dmypy.json

# Cython debug symbols
cython_debug/

# slurm scripts
slurm/
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -267,7 +267,7 @@ We use `pytest` in order to run the tests. From the root of the
repository, here's how to run tests with `pytest` for the library:

```bash
DATA_DIR="tests/data" python -m pytest -sv ./tests
python -m pytest -sv ./tests
```


30 changes: 30 additions & 0 deletions Makefile
@@ -30,6 +30,8 @@ test-end-to-end:
${MAKE} DEVICE=$(DEVICE) test-tdmpc-ete-eval
${MAKE} DEVICE=$(DEVICE) test-default-ete-eval
${MAKE} DEVICE=$(DEVICE) test-act-pusht-tutorial
${MAKE} DEVICE=$(DEVICE) test-act-ete-train-accelerate-amp
${MAKE} DEVICE=$(DEVICE) test-act-ete-eval-accelerate-amp

test-act-ete-train:
python lerobot/scripts/train.py \
@@ -188,3 +190,31 @@ test-act-pusht-tutorial:
training.image_transforms.enable=true \
hydra.run.dir=tests/outputs/act_pusht/
rm lerobot/configs/policy/created_by_Makefile.yaml


test-act-ete-train-accelerate-amp:
python -m accelerate.commands.launch --cpu --mixed-precision=fp16 lerobot/scripts/train.py \
policy=act \
policy.dim_model=64 \
env=aloha \
wandb.enable=False \
training.offline_steps=2 \
training.online_steps=0 \
eval.n_episodes=1 \
eval.batch_size=1 \
device=$(DEVICE) \
training.save_checkpoint=true \
training.save_freq=2 \
policy.n_action_steps=20 \
policy.chunk_size=20 \
training.batch_size=2 \
hydra.run.dir=tests/outputs/act_amp/ \
training.image_transforms.enable=true

test-act-ete-eval-accelerate-amp:
python -m accelerate.commands.launch --cpu --mixed-precision=fp16 lerobot/scripts/eval.py \
-p tests/outputs/act_amp/checkpoints/000002/pretrained_model \
eval.n_episodes=1 \
eval.batch_size=1 \
env.episode_length=8 \
device=$(DEVICE)
61 changes: 45 additions & 16 deletions README.md
@@ -23,15 +23,15 @@
</div>

<h2 align="center">
<p><a href="https://github.com/huggingface/lerobot/blob/main/examples/7_get_started_with_real_robot.md">Hot new tutorial: Getting started with real-world robots</a></p>
<p><a href="https://github.com/huggingface/lerobot/blob/main/examples/10_use_so100.md">New robot in town: SO-100</a></p>
</h2>

<div align="center">
<img src="media/tutorial/koch_v1_1_leader_follower.webp?raw=true" alt="Koch v1.1 leader and follower arms" title="Koch v1.1 leader and follower arms" width="50%">
<p>We just dropped an in-depth tutorial on how to build your own robot!</p>
<img src="media/so100/leader_follower.webp?raw=true" alt="SO-100 leader and follower arms" title="SO-100 leader and follower arms" width="50%">
<p>We just added a new tutorial on how to build a more affordable robot, at the price of $110 per arm!</p>
<p>Teach it new skills by showing it a few moves with just a laptop.</p>
<p>Then watch your homemade robot act autonomously 🤯</p>
<p>For more info, see <a href="https://x.com/RemiCadene/status/1825455895561859185">our thread on X</a> or <a href="https://github.com/huggingface/lerobot/blob/main/examples/7_get_started_with_real_robot.md">our tutorial page</a>.</p>
<p>Follow the link to the <a href="https://github.com/huggingface/lerobot/blob/main/examples/10_use_so100.md">full tutorial for SO-100</a>.</p>
</div>

<br/>
@@ -55,9 +55,9 @@

<table>
<tr>
<td><img src="http://remicadene.com/assets/gif/aloha_act.gif" width="100%" alt="ACT policy on ALOHA env"/></td>
<td><img src="http://remicadene.com/assets/gif/simxarm_tdmpc.gif" width="100%" alt="TDMPC policy on SimXArm env"/></td>
<td><img src="http://remicadene.com/assets/gif/pusht_diffusion.gif" width="100%" alt="Diffusion policy on PushT env"/></td>
<td><img src="media/gym/aloha_act.gif" width="100%" alt="ACT policy on ALOHA env"/></td>
<td><img src="media/gym/simxarm_tdmpc.gif" width="100%" alt="TDMPC policy on SimXArm env"/></td>
<td><img src="media/gym/pusht_diffusion.gif" width="100%" alt="Diffusion policy on PushT env"/></td>
</tr>
<tr>
<td align="center">ACT policy on ALOHA env</td>
@@ -66,6 +66,11 @@
</tr>
</table>

### News

* **1-11-2024**: We now support the `accelerate` library for distributed training and evaluation on multiple GPUs.


### Acknowledgment

- Thanks to Tony Zaho, Zipeng Fu and colleagues for open sourcing ACT policy, ALOHA environments and datasets. Ours are adapted from [ALOHA](https://tonyzhaozh.github.io/aloha) and [Mobile ALOHA](https://mobile-aloha.github.io).
@@ -144,7 +149,7 @@ wandb login

### Visualize datasets

Check out [example 1](./examples/1_load_lerobot_dataset.py) that illustrates how to use our dataset class which automatically download data from the Hugging Face hub.
Check out [example 1](./examples/1_load_lerobot_dataset.py) that illustrates how to use our dataset class which automatically downloads data from the Hugging Face hub.

You can also locally visualize episodes from a dataset on the hub by executing our script from the command line:
```bash
@@ -153,10 +158,12 @@ python lerobot/scripts/visualize_dataset.py \
--episode-index 0
```

or from a dataset in a local folder with the root `DATA_DIR` environment variable (in the following case the dataset will be searched for in `./my_local_data_dir/lerobot/pusht`)
or from a dataset in a local folder with the `--root` option and the `--local-files-only` flag (in the following case the dataset will be searched for in `./my_local_data_dir/lerobot/pusht`):
```bash
DATA_DIR='./my_local_data_dir' python lerobot/scripts/visualize_dataset.py \
python lerobot/scripts/visualize_dataset.py \
--repo-id lerobot/pusht \
--root ./my_local_data_dir \
--local-files-only 1 \
--episode-index 0
```

@@ -208,12 +215,10 @@ dataset attributes:

A `LeRobotDataset` is serialised using several widespread file formats for each of its parts, namely:
- hf_dataset stored using Hugging Face datasets library serialization to parquet
- videos are stored in mp4 format to save space or png files
- episode_data_index saved using `safetensor` tensor serialization format
- stats saved using `safetensor` tensor serialization format
- info are saved using JSON
- videos are stored in mp4 format to save space
- metadata are stored in plain json/jsonl files

Dataset can be uploaded/downloaded from the HuggingFace hub seamlessly. To work on a local dataset, you can set the `DATA_DIR` environment variable to your root dataset folder as illustrated in the above section on dataset visualization.
Datasets can be uploaded/downloaded from the Hugging Face hub seamlessly. To work on a local dataset, you can use the `local_files_only` argument and specify its location with the `root` argument if it's not in the default `~/.cache/huggingface/lerobot` location.

### Evaluate a pretrained policy

@@ -280,12 +285,36 @@ To use wandb for logging training and evaluation curves, make sure you've run `w
wandb.enable=true
```

A link to the wandb logs for the run will also show up in yellow in your terminal. Here is an example of what they look like in your browser. Please also check [here](https://github.com/huggingface/lerobot/blob/main/examples/4_train_policy_with_script.md#typical-logs-and-metrics) for the explaination of some commonly used metrics in logs.
A link to the wandb logs for the run will also show up in yellow in your terminal. Here is an example of what they look like in your browser. Please also check [here](https://github.com/huggingface/lerobot/blob/main/examples/4_train_policy_with_script.md#typical-logs-and-metrics) for the explanation of some commonly used metrics in logs.

![](media/wandb.png)

Note: For efficiency, during training every checkpoint is evaluated on a low number of episodes. You may use `eval.n_episodes=500` to evaluate on more episodes than the default. Or, after training, you may want to re-evaluate your best checkpoints on more episodes or change the evaluation settings. See `python lerobot/scripts/eval.py --help` for more instructions.
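
For example, here is a minimal sketch of such a re-evaluation (the checkpoint path and values are illustrative and should be adapted to your own training output):

```bash
# Re-evaluate a saved checkpoint on more episodes than the during-training eval.
# The path follows the checkpoints/<step>/pretrained_model layout used in this repo;
# replace it with your actual run directory.
python lerobot/scripts/eval.py \
    -p outputs/train/my_run/checkpoints/000002/pretrained_model \
    eval.n_episodes=500 \
    eval.batch_size=50
```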

### Distributed training and evaluation on multiple GPUs/nodes

We use the [accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch#using-accelerate-launch) library to handle training and evaluation on multiple GPUs/nodes.

To perform distributed training, use the `accelerate launch` command (or equivalently `python -m accelerate.commands.launch`). Here’s an example of launching a training script across 2 GPUs:

```bash
accelerate launch --num_processes=2 lerobot/scripts/train.py \
policy=act \
env=aloha \
env.task=AlohaTransferCube-v0 \
dataset_repo_id=lerobot/aloha_sim_transfer_cube_human
```
Check out [example 12](./examples/12_train_policy_accelerate.py) for a full training example using `accelerate`.

(Note: make sure `accelerate` is installed; otherwise run `pip install accelerate`.)
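
For multi-node training, `accelerate launch` also takes machine-level flags. Below is a sketch of what a 2-node launch (2 GPUs per node) could look like, run once per node with its own `--machine_rank`; the IP, port and process counts are placeholders for your cluster:

```bash
# Run this command on every node, setting --machine_rank to 0 on the main node and 1 on the other.
accelerate launch \
  --multi_gpu \
  --num_machines=2 \
  --num_processes=4 \
  --machine_rank=0 \
  --main_process_ip=192.168.1.10 \
  --main_process_port=29500 \
  lerobot/scripts/train.py \
  policy=act \
  env=aloha \
  env.task=AlohaTransferCube-v0 \
  dataset_repo_id=lerobot/aloha_sim_transfer_cube_human
```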

To evaluate a policy, you can use the following:
```bash
accelerate launch --num_processes=1 --mixed_precision=fp16 lerobot/scripts/eval.py -p lerobot/diffusion_pusht
```

Note that to reproduce the same results across different GPU configurations, you should take several hyperparameters into account (explained in more detail [here](https://huggingface.co/docs/accelerate/v1.1.0/en/concept_guides/performance)). In particular, the effective batch size is multiplied by the number of GPUs, so you should either divide the per-process batch size or the number of training steps by the number of GPUs (and be careful if you are using a learning-rate scheduler).
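
As an illustration of that trade-off (the step and batch-size values below are made up for the example), going from 1 GPU to 2 GPUs you could either halve the per-process batch size or halve the number of offline steps:

```bash
# Single-GPU reference run.
python lerobot/scripts/train.py policy=act env=aloha \
    training.batch_size=8 training.offline_steps=100000

# 2 GPUs, same effective batch size: halve the per-process batch size.
accelerate launch --num_processes=2 lerobot/scripts/train.py policy=act env=aloha \
    training.batch_size=4 training.offline_steps=100000

# 2 GPUs, same per-process batch size: halve the number of steps instead
# (and rescale any lr-scheduler horizon accordingly).
accelerate launch --num_processes=2 lerobot/scripts/train.py policy=act env=aloha \
    training.batch_size=8 training.offline_steps=50000
```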

#### Reproduce state-of-the-art (SOTA)

We have organized our configuration files (found under [`lerobot/configs`](./lerobot/configs)) such that they reproduce SOTA results from a given model variant in their respective original works. Simply running: