diff --git a/.circleci/config.yml b/.circleci/config.yml
index ed20728f85..bf0c0d2ea7 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -400,13 +400,18 @@ jobs:
       - run: &download_test_data
           name: Download test data
           command: |
+              sudo apt install git-lfs
+              git --version
+              git-lfs --version
               export PATH=$HOME/miniconda/bin:/usr/local/cuda/bin:$PATH
               . activate habitat;
+              git lfs install
+              conda install -y gitpython
               if [ ! -d ./data/scene_datasets/habitat-test-scenes/van-gogh-room.glb ]
               then
                 cd habitat-sim
-                python src_python/habitat_sim/utils/datasets_download.py --uids ci_test_assets --replace --data-path data/
+                python src_python/habitat_sim/utils/datasets_download.py --uids ci_test_assets --replace --data-path data/ --no-prune
               fi
       - run:
diff --git a/DATASETS.md b/DATASETS.md
index d0d5d6d4a1..20fdacf426 100644
--- a/DATASETS.md
+++ b/DATASETS.md
@@ -1,4 +1,6 @@
-# Datasets commonly used with Habitat-Sim
+# How to Use Common Supported Datasets with Habitat-Sim
+
+🤗 View the open-source collection of Habitat-ready datasets and test assets on Hugging Face at https://huggingface.co/ai-habitat!
 
 ## Table of contents
 1. [Habitat test scenes](#habitat-test-scenes)
@@ -13,7 +15,10 @@
 1. [YCB Benchmarks - Object and Model Set](#ycb-benchmarks---object-and-model-set)
 1. [Previewing dataset assets using Habitat-Sim's viewers](#previewing-dataset-assets-using-habitat-sims-viewers)
 
+___
+
 ## Habitat test scenes
+[🤗 Browse on Hugging Face 🤗](https://huggingface.co/datasets/ai-habitat/habitat_test_scenes)
 
 We provide 3 example scenes for performing unit tests in habitat-sim. These can be programmatically downloaded via Habitat's data download utility.
@@ -27,6 +32,8 @@ We also provide PointNav episodes sampled from these scenes for performing unit
 python -m habitat_sim.utils.datasets_download --uids habitat_test_pointnav_dataset --data-path data/
 ```
 
+___
+
 ## Habitat-Matterport 3D Research Dataset (HM3D)
 
 Details: [https://aihabitat.org/datasets/hm3d/](https://aihabitat.org/datasets/hm3d/).
@@ -127,18 +134,24 @@ To load semantic annotations in habitat-lab:
 
 Note that if you are using the RL environment from habitat-lab, `SIMULATOR.SCENE_DATASET` is overridden by the episode dataset (see [here](https://github.com/facebookresearch/habitat-lab/blob/e934b15c35233457cc3cb9c90ba0e207610dbd19/habitat/core/env.py#L94-L96)). Each episode in the episode dataset must point to the annotation config file (as done in the HM3D ObjectNav dataset [here](https://github.com/facebookresearch/habitat-lab)).
 
+___
+
 ## Habitat Synthetic Scene Dataset (HSSD)
 
 Details: [https://3dlg-hcvc.github.io/hssd/](https://3dlg-hcvc.github.io/hssd/).
 
 You can download the HSSD dataset from instructions provided in the above website.
 
+___
+
 ## AI2-THOR (Habitat)
 
 Details: [https://3dlg-hcvc.github.io/hssd/](https://3dlg-hcvc.github.io/hssd/).
 
 You can download Habitat-compatible versions of the iTHOR, RoboTHOR, and ProcTHOR scene datasets from instructions provided in the above website.
 
+___
+
 ## Matterport3D (MP3D) dataset
@@ -158,11 +171,13 @@ You only need the habitat zip archive and not the entire Matterport3D dataset.
 
 Once you have the habitat zip archive, you should download [this SceneDatasetConfig file](http://dl.fbaipublicfiles.com/habitat/mp3d/config_v1/mp3d.scene_dataset_config.json) and place it in the root directory for the Matterport3D dataset (e.g. Habitat-Sim/data/scene_datasets/mp3d/). This file should then be specified as [the scene dataset config in the SimulatorConfiguration structure](/examples/tutorials/nb_python/ReplicaCAD_quickstart.py#L145) like this example for the ReplicaCAD dataset.
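A minimal sketch of that wiring, assuming the public `habitat_sim` Python API as used in the linked ReplicaCAD quickstart (the scene id below is a placeholder, not a value from this patch):

```python
import habitat_sim

# Point the simulator at the downloaded MP3D scene dataset config.
backend_cfg = habitat_sim.SimulatorConfiguration()
backend_cfg.scene_dataset_config_file = (
    "data/scene_datasets/mp3d/mp3d.scene_dataset_config.json"
)
backend_cfg.scene_id = "<mp3d-scene-id>"  # any scene from the dataset

agent_cfg = habitat_sim.agent.AgentConfiguration()
sim = habitat_sim.Simulator(habitat_sim.Configuration(backend_cfg, [agent_cfg]))
```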
 
+___
+
 ## Gibson and 3DSceneGraph datasets
 
-- The Gibson dataset for use with Habitat can be downloaded by agreeing to the terms of use in the [Gibson](https://github.com/StanfordVL/GibsonEnv#database) repository.
+The Gibson dataset for use with Habitat can be downloaded by agreeing to the terms of use in the [Gibson](https://github.com/StanfordVL/GibsonEnv#database) repository.
 
-- Semantic information for Gibson is available from the [3DSceneGraph](https://3dscenegraph.stanford.edu/) dataset. The semantic data will need to be converted before it can be used within Habitat:
+Semantic information for Gibson is available from the [3DSceneGraph](https://3dscenegraph.stanford.edu/) dataset. The semantic data will need to be converted before it can be used within Habitat:
 ```bash
 tools/gen_gibson_semantics.sh /path/to/3DSceneGraph_medium/automated_graph /path/to/GibsonDataset /path/to/output
 ```
@@ -170,13 +185,20 @@ Once you have downloaded the Gibson dataset and converted the semantic data, you
 Once you have downloaded the Gibson dataset and converted the semantic data, you should download [this SceneDatasetConfig file](http://dl.fbaipublicfiles.com/habitat/gibson/config_v1/gibson_semantic.scene_dataset_config.json) and place it in the root directory for the Gibson dataset (e.g. Habitat-Sim/data/scene_datasets/gibson/). This file should then be specified as [the scene dataset config in the SimulatorConfiguration structure](/examples/tutorials/nb_python/ReplicaCAD_quickstart.py#L145) like this example for the ReplicaCAD dataset.
 
+___
+
 ## Replica Dataset
 
 Details and download instructions: [https://github.com/facebookresearch/Replica-Dataset](https://github.com/facebookresearch/Replica-Dataset).
 
+___
+
 ## ReplicaCAD
+🤗 Browse on Hugging Face ([interactive](https://huggingface.co/datasets/ai-habitat/ReplicaCAD_dataset), [baked_lighting](https://huggingface.co/datasets/ai-habitat/ReplicaCAD_baked_lighting)) 🤗
 
-Details and download instructions: [https://aihabitat.org/datasets/replica_cad/](https://aihabitat.org/datasets/replica_cad/).
+Details and instructions: [https://aihabitat.org/datasets/replica_cad/](https://aihabitat.org/datasets/replica_cad/).
+
+___
 
 ## ScanNet
 
@@ -190,13 +212,15 @@ The exported `*.glb` files can directly be used with Habitat-Sim versions >= 2.0
 
 Note: Depending on the configured radius and height of the agent, certain scans may have no navigable locations on the navmesh (~200). These scenes can be filtered out by checking if `sim.pathfinder.is_loaded` is False.
 
+___
+
 ## YCB Benchmarks - Object and Model Set
 
+[🤗 Browse Habitat-ready asset dataset on Hugging Face 🤗](https://huggingface.co/datasets/ai-habitat/ycb)
+
 Details: [https://www.ycbbenchmarks.com/](https://www.ycbbenchmarks.com/).
 
 > YCB Object and Model Set is designed for facilitating benchmarking in robotic manipulation... The set is associated with a [model database](http://www.ycbbenchmarks.com/object-models/) which provides mesh models and high-resolution RGB-D scans of the objects for easy incorporation into manipulation and planning software platforms.
 
-Pre-processed, [Habitat-ready assets](https://dl.fbaipublicfiles.com/habitat/ycb/hab_ycb_v1.2.zip).
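Once downloaded (the quick-start command follows below), the YCB assets can be loaded at runtime. A hypothetical sketch, assuming the standard `habitat_sim` object-template and rigid-object managers and the default `--data-path data/` layout:

```python
import habitat_sim

# Spawn one YCB object in an empty, physics-enabled scene.
backend_cfg = habitat_sim.SimulatorConfiguration()
backend_cfg.scene_id = "NONE"  # empty scene
backend_cfg.enable_physics = True
agent_cfg = habitat_sim.agent.AgentConfiguration()
sim = habitat_sim.Simulator(habitat_sim.Configuration(backend_cfg, [agent_cfg]))

# Register the YCB object configs, then instance one by handle substring.
templates = sim.get_object_template_manager()
templates.load_configs("data/objects/ycb/configs/")
obj_mgr = sim.get_rigid_object_manager()
banana = obj_mgr.add_object_by_template_handle(
    templates.get_template_handles("banana")[0]
)
print(banana.translation)
sim.close()
```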
-
 Quick-start with the dataset_downloader utility:
 ```
@@ -227,6 +251,8 @@ habitat-viewer --use-default-lighting --enable-physics --object-dir "" --datase
 ```
 Then press `'o'` key to add random objects from the dataset.
 
+___
+
 # Previewing dataset assets using Habitat-Sim's viewers
 
 For datasets with scene dataset configuration support (such as HM3D, ReplicaCAD, MP3D, Gibson, etc) you can preview the assets using one of Habitat's command-line driven viewers, either in c++ or python. When launching the viewer, you should specify not only the desired scene to load, but also the scene dataset configuration file, to guarantee the assets load and display correctly. This has the added benefit of providing quick access to other scenes in the same dataset, without requiring a reload of the entire simulation environment from the command line.
diff --git a/README.md b/README.md
index 1675ed6f73..985fe0824b 100644
--- a/README.md
+++ b/README.md
@@ -239,7 +239,7 @@ Can't find the answer to your question? Try asking the developers and community
 
 ## Datasets
 
-[Common datasets used with Habitat](DATASETS.md).
+[How to use common supported datasets with Habitat-Sim](DATASETS.md).
 
 ## External Contributions
diff --git a/src_python/habitat_sim/utils/datasets_download.py b/src_python/habitat_sim/utils/datasets_download.py
index bd48b30f01..0eba868dfa 100755
--- a/src_python/habitat_sim/utils/datasets_download.py
+++ b/src_python/habitat_sim/utils/datasets_download.py
@@ -17,8 +17,11 @@
 import tarfile
 import traceback
 import zipfile
+from shutil import which
 from typing import List, Optional
 
+from git import Repo
+
 data_sources = {}
 data_groups = {}
 
@@ -65,10 +68,9 @@ def initialize_test_data_sources(data_path):
     #     "version": data version tag
     # }
     "habitat_test_scenes": {
-        "source": "http://dl.fbaipublicfiles.com/habitat/habitat-test-scenes_v1.0.zip",
-        "package_name": "habitat-test-scenes_v1.0.zip",
+        "source": "https://huggingface.co/datasets/ai-habitat/habitat_test_scenes.git",
         "link": data_path + "scene_datasets/habitat-test-scenes",
-        "version": "1.0",
+        "version": "v1.0",
     },
     "habitat_test_pointnav_dataset": {
         "source": "http://dl.fbaipublicfiles.com/habitat/habitat-test-pointnav-dataset_v1.0.zip",
@@ -109,22 +111,19 @@ def initialize_test_data_sources(data_path):
         "version": "1.0",
     },
     "replica_cad_dataset": {
-        "source": "https://dl.fbaipublicfiles.com/habitat/ReplicaCAD/ReplicaCAD_dataset_v1.6.zip",
-        "package_name": "ReplicaCAD_dataset_v1.6.zip",
+        "source": "https://huggingface.co/datasets/ai-habitat/ReplicaCAD_dataset.git",
         "link": data_path + "replica_cad",
-        "version": "1.6",
+        "version": "v1.6",
     },
     "replica_cad_baked_lighting": {
-        "source": "https://dl.fbaipublicfiles.com/habitat/ReplicaCAD/ReplicaCAD_baked_lighting_v1.6.zip",
-        "package_name": "ReplicaCAD_baked_lighting_v1.6.zip",
+        "source": "https://huggingface.co/datasets/ai-habitat/ReplicaCAD_baked_lighting.git",
         "link": data_path + "replica_cad_baked_lighting",
-        "version": "1.6",
+        "version": "v1.6",
     },
     "ycb": {
-        "source": "https://dl.fbaipublicfiles.com/habitat/ycb/hab_ycb_v1.2.zip",
-        "package_name": "hab_ycb_v1.2.zip",
+        "source": "https://huggingface.co/datasets/ai-habitat/ycb.git",
         "link": data_path + "objects/ycb",
-        "version": "1.2",
+        "version": "v1.2",
     },
     "franka_panda": {
         "source": "https://dl.fbaipublicfiles.com/polymetis/franka_panda.zip",
@@ -133,22 +132,19 @@ def initialize_test_data_sources(data_path):
         "version": "1.0",
     },
     "hab_spot_arm": {
-        "source": "http://dl.fbaipublicfiles.com/habitat/robots/hab_spot_arm_v2.0.zip",
"http://dl.fbaipublicfiles.com/habitat/robots/hab_spot_arm_v2.0.zip", - "package_name": "hab_spot_arm_v2.0.zip", + "source": "https://huggingface.co/datasets/ai-habitat/hab_spot_arm.git", "link": data_path + "robots/hab_spot_arm", - "version": "2.0", + "version": "v2.0", }, "hab_stretch": { - "source": "http://dl.fbaipublicfiles.com/habitat/robots/hab_stretch_v1.0.zip", - "package_name": "hab_stretch_v1.0.zip", + "source": "https://huggingface.co/datasets/ai-habitat/hab_stretch.git", "link": data_path + "robots/hab_stretch", - "version": "1.0", + "version": "v1.0", }, "hab_fetch": { - "source": "http://dl.fbaipublicfiles.com/habitat/hab_fetch_v2.0.zip", - "package_name": "hab_fetch_v2.0.zip", + "source": "https://huggingface.co/datasets/ai-habitat/hab_fetch.git", "link": data_path + "robots/hab_fetch", - "version": "2.0", + "version": "v2.0", }, "humanoid_data": { "source": "http://dl.fbaipublicfiles.com/habitat/humanoids/humanoid_data_v0.1.zip", @@ -428,9 +424,15 @@ def prompt_yes_no(message): print("Invalid answer...") -def get_version_dir(uid, data_path): +def get_version_dir(uid, data_path, is_repo=False): + """ + Constructs to the versioned_data directory path for the data source. + """ version_tag = data_sources[uid]["version"] - if "version_dir" in data_sources[uid]: + if is_repo: + # this is a git repo, so don't include version in the directory name + version_dir = os.path.join(data_path, "versioned_data/" + uid) + elif "version_dir" in data_sources[uid]: version_dir = os.path.join( data_path, "versioned_data", @@ -443,7 +445,10 @@ def get_version_dir(uid, data_path): return version_dir -def get_downloaded_file_list(uid, data_path): +def get_downloaded_file_list(uid: str, data_path: str) -> Optional[str]: + """ + Get the downloaded file list path configured for the data source. + """ version_tag = data_sources[uid]["version"] downloaded_file_list = None if "downloaded_file_list" in data_sources[uid]: @@ -461,15 +466,21 @@ def clean_data(uid, data_path): print(f"Data clean failed, no datasource named {uid}") return link_path = os.path.join(data_path, data_sources[uid]["link"]) - version_dir = get_version_dir(uid, data_path) + is_repo = data_sources[uid]["source"].endswith(".git") + version_dir = get_version_dir(uid, data_path, is_repo) + if not os.path.exists(version_dir) and not os.path.islink(link_path): + print(f"Found nothing to clean for datasource ({uid}).") + return downloaded_file_list = get_downloaded_file_list(uid, data_path) print( f"Cleaning datasource ({uid}). Directory: '{version_dir}'. Symlink: '{link_path}'." ) if downloaded_file_list is None: try: - shutil.rmtree(version_dir) - os.unlink(link_path) + if os.path.exists(version_dir): + shutil.rmtree(version_dir) + if os.path.islink(link_path): + os.unlink(link_path) except OSError: print("Removal error:") traceback.print_exc(file=sys.stdout) @@ -499,63 +510,76 @@ def clean_data(uid, data_path): os.rmdir(meta_dir) -def download_and_place( - uid, - data_path, +def clone_repo_source( + uid: str, + version_dir: str, + requires_auth: bool, username: Optional[str] = None, password: Optional[str] = None, - replace: Optional[bool] = None, + prune_lfs: bool = True, ): - r"""Data-source download function. 
-    if not data_sources.get(uid):
-        print(f"Data download failed, no datasource named {uid}")
-        return
-
-    # link_path = os.path.join(data_path, data_sources[uid]["link"])
-    link_path = pathlib.Path(data_sources[uid]["link"])
+    """
+    Clones and processes a datasource hosted on a git repo (e.g. a HuggingFace Dataset).
+    Handles authentication for gated sources.
+    Automatically prunes the resulting repo to reduce local disk usage.
+    """
     version_tag = data_sources[uid]["version"]
-    version_dir = get_version_dir(uid, data_path)
-    downloaded_file_list = get_downloaded_file_list(uid, data_path)
+    clone_command = f" git clone --depth 1 --branch {version_tag} "
+    if requires_auth:
+        adjusted_password = password.replace(" ", "%20")
+        url_split = data_sources[uid]["source"].split("https://")[-1]
+        # NOTE: The username and password are stored in .git/config. Should we post-process this out?
+        clone_command += f'"https://{username}:{adjusted_password}@{url_split}"'
+    else:
+        clone_command += f"\"{data_sources[uid]['source']}\""
 
-    # check for current version
-    if os.path.exists(version_dir) and (
-        downloaded_file_list is None or os.path.exists(downloaded_file_list)
-    ):
-        print(
-            f"Existing data source ({uid}) version ({version_tag}) is current. Data located: '{version_dir}'. Symblink: '{link_path}'."
-        )
-        replace_existing = (
-            replace if replace is not None else prompt_yes_no("Replace versioned data?")
-        )
+    # place the output in the specified directory
+    clone_command += f" {version_dir}"
 
-        if replace_existing:
-            clean_data(uid, data_path)
-        else:
-            print("=======================================================")
-            print(
-                f"Not replacing data, generating symlink ({link_path}) and aborting download."
-            )
-            print("=======================================================")
+    print(clone_command)
+    subprocess.check_call(shlex.split(clone_command))
 
-            if link_path.exists():
-                os.unlink(link_path)
-            elif not link_path.parent.exists():
-                link_path.parent.mkdir(parents=True, exist_ok=True)
-            os.symlink(src=version_dir, dst=link_path, target_is_directory=True)
-            assert link_path.exists(), "Failed, no symlink generated."
+    if prune_lfs:
+        # NOTE: we make this optional because older git versions don't support "-f --recent"
+        assert (
+            which("git-lfs") is not None
+        ), "`git-lfs` is not installed, cannot prune and won't get lfs files, only links. Install and try again or re-run with `--no-prune`."
 
-            return
+        # prune the repo to reduce wasted disk space
+        prune_command = "git lfs prune -f --recent"
+        subprocess.check_call(shlex.split(prune_command), cwd=version_dir)
 
-    # download new version
+
+def checkout_repo_tag(repo: Repo, version_dir: str, tag: str):
+    """
+    Checkout the specified tag for an existing repo with the GitPython API.
+    """
+    print(f" checking out {tag} and pulling changes from repo.")
+    repo.remote().fetch(f"{tag}")
+    # using the git command-line wrapper, so the installed lfs should be used to pull
+    repo.git.checkout(f"{tag}")
+    if which("git-lfs") is None:
+        print(" WARNING: git-lfs is not installed, cannot pull lfs files from links.")
+    else:
+        # NOTE: repo.remote().pull() was not correctly using lfs, so call `git lfs pull` directly in case lfs was installed between runs.
+        subprocess.check_call(shlex.split("git lfs pull"), cwd=version_dir)
+
+
+def get_and_place_compressed_package(
+    uid: str,
+    data_path: str,
+    version_dir: str,
+    downloaded_file_list: Optional[str],
+    requires_auth: bool,
+    username: Optional[str] = None,
+    password: Optional[str] = None,
+):
+    """
+    Downloads and unpacks a datasource hosted as a compressed package at a URL.
+    Handles authentication for gated sources.
+    """
     download_pre_args = data_sources[uid].get("download_pre_args", "")
     download_post_args = data_sources[uid].get("download_post_args", "")
 
-    requires_auth = data_sources[uid].get("requires_auth", False)
-    if requires_auth:
-        assert username is not None, "Usename required, please enter with --username"
-        assert (
-            password is not None
-        ), "Password is required, please enter with --password"
-
     use_curl = data_sources[uid].get("use_curl", False)
     if use_curl:
         if requires_auth:
@@ -620,6 +644,95 @@ def download_and_place(
     with gzip.open(downloaded_file_list, "wt") as f:
         json.dump([extract_dir] + package_files, f)
 
+    # clean-up
+    os.remove(data_path + package_name)
+
+
+def download_and_place(
+    uid: str,
+    data_path: str,
+    username: Optional[str] = None,
+    password: Optional[str] = None,
+    replace: Optional[bool] = None,
+    prune_lfs: bool = True,
+):
+    r"""Data-source download function. Validates uid, handles existing data version, downloads data, unpacks, writes version, cleans up."""
+    if not data_sources.get(uid):
+        print(f"Data download failed, no datasource named {uid}")
+        return
+
+    is_repo = data_sources[uid]["source"].endswith(".git")
+    if is_repo and which("git-lfs") is None:
+        print(
+            "-\nWARNING: repo datasource detected and git-lfs is not installed. Download will be limited to lfs link files instead of full assets.\nTo address this, abort and clean datasources. Then install git lfs.\nOn Linux:\n  `sudo apt install git-lfs`\n  `git lfs install`\n-"
+        )
+    link_path = pathlib.Path(data_sources[uid]["link"])
+    version_tag = data_sources[uid]["version"]
+    version_dir = get_version_dir(uid, data_path, is_repo)
+    downloaded_file_list = get_downloaded_file_list(uid, data_path)
+
+    # check for current version
+    if os.path.exists(version_dir) and (
+        downloaded_file_list is None or os.path.exists(downloaded_file_list)
+    ):
+        # for existing repos, checkout the specified version
+        if is_repo:
+            print(f"Found the existing repo for ({uid}): {version_dir}")
+            repo = Repo(version_dir)
+            assert not repo.bare
+            checkout_repo_tag(repo, version_dir, version_tag)
+        else:
+            print(
+                f"Existing data source ({uid}) version ({version_tag}) is current. Data located: '{version_dir}'. Symlink: '{link_path}'."
+            )
+
+        replace_existing = (
+            replace if replace is not None else prompt_yes_no("Replace versioned data?")
+        )
+
+        if replace_existing and not is_repo:
+            clean_data(uid, data_path)
+        else:
+            print("=======================================================")
+            if not replace_existing:
+                print(f"Not replacing data, generating symlink ({link_path}).")
+            else:
+                print(f"Generating symlink ({link_path}).")
+            print("=======================================================")
+
+            if link_path.exists():
+                os.unlink(link_path)
+            elif not link_path.parent.exists():
+                link_path.parent.mkdir(parents=True, exist_ok=True)
+            os.symlink(src=version_dir, dst=link_path, target_is_directory=True)
+            assert link_path.exists(), "Failed, no symlink generated."
+
+            return
+
+    # download new version
+    requires_auth = data_sources[uid].get("requires_auth", False)
+    if requires_auth:
+        assert username is not None, "Username required, please enter with --username"
+        assert (
+            password is not None
+        ), "Password is required, please enter with --password"
+
+    if data_sources[uid]["source"].endswith(".git"):
+        # git dataset, clone it
+        clone_repo_source(
+            uid, version_dir, requires_auth, username, password, prune_lfs
+        )
+    else:
+        # compressed package dataset (e.g. .zip or .tar), download and unpack it
+        get_and_place_compressed_package(
+            uid,
+            data_path,
+            version_dir,
+            downloaded_file_list,
+            requires_auth,
+            username,
+            password,
+        )
+
     # create a symlink to the new versioned data
     if link_path.exists():
         os.unlink(link_path)
@@ -629,9 +742,6 @@ def download_and_place(
 
     assert link_path.exists(), "Unpacking failed, no symlink generated."
 
-    # clean-up
-    os.remove(data_path + package_name)
-
     print("=======================================================")
     print(f"Dataset ({uid}) successfully downloaded.")
     print(f"Source: '{version_dir}'")
@@ -699,6 +809,11 @@ def main(args):
         default=None,
         help="Password to use for downloads that require authentication",
     )
+    parser.add_argument(
+        "--no-prune",
+        action="store_true",
+        help="Disable pruning for git-lfs repo datasources. Use this if your system git version does not support forced pruning (e.g. Ubuntu 20.x).",
+    )
     args = parser.parse_args(args)
 
     replace = args.replace
@@ -764,7 +879,12 @@ def main(args):
                 clean_data(uid, data_path)
             else:
                 download_and_place(
-                    uid, data_path, args.username, args.password, replace
+                    uid,
+                    data_path,
+                    args.username,
+                    args.password,
+                    replace,
+                    prune_lfs=(not args.no_prune),
                 )
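For reference, the new git-backed download path can be exercised programmatically as well as from the CLI, since `main()` takes an argument list (as shown in the hunks above). A sketch under that assumption; the expected on-disk layout follows from `get_version_dir(..., is_repo=True)` and the `habitat_test_scenes` entry:

```python
from pathlib import Path

from habitat_sim.utils import datasets_download

# Download a git-backed datasource without LFS pruning (for older git),
# replacing any existing copy. Expected layout afterwards:
#   data/versioned_data/habitat_test_scenes   <- shallow clone at tag v1.0
#   data/scene_datasets/habitat-test-scenes   <- symlink into versioned_data
datasets_download.main(
    ["--uids", "habitat_test_scenes", "--data-path", "data/", "--replace", "--no-prune"]
)
assert Path("data/scene_datasets/habitat-test-scenes").exists()
```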