diff --git a/zod/cli/download.py b/zod/cli/download.py
index 56be1c9..56d591b 100644
--- a/zod/cli/download.py
+++ b/zod/cli/download.py
@@ -1,328 +1,25 @@
 """This script is to be used to download the Zenseact Open Dataset."""
-import contextlib
 import os
-import os.path as osp
-import subprocess
-import tarfile
-import threading
-from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
-
-import requests
-from tqdm import tqdm
+from enum import Enum
 
+from zod.cli.dropbox_download import DownloadSettings, FilterSettings
+from zod.cli.dropbox_download import download_dataset as dropbox_download
+from zod.cli.torrent_download import download_dataset as torrent_download
 from zod.cli.utils import SubDataset, Version
 
 try:
-    import click.exceptions
-    import dropbox
-    import dropbox.exceptions
-    import dropbox.files
-    import dropbox.sharing
     import typer
 except ImportError:
     print('zod is installed without the CLI dependencies: pip install "zod[cli]"')
     exit(1)
 
-APP_KEY = "kcvokw7c7votepo"
-REFRESH_TOKEN = "z2h_5rjphJEAAAAAAAAAAUWtQlltCYlMbZ2WqMgtymELhiSuKIksxAYeArubKnZV"
-CHUNK_SIZE = 8 * 1024 * 1024  # 8 MB
-TIMEOUT = 8 * 60 * 60  # 8 hours
 
 app = typer.Typer(help="Zenseact Open Dataset Downloader", no_args_is_help=True)
 
 
-@dataclass
-class DownloadExtractInfo:
-    """Information about a file to extract."""
-
-    url: str
-    file_path: str
-    extract_dir: str
-    dl_dir: str
-    rm: bool
-    dry_run: bool
-    size: int
-    content_hash: str
-    extract: bool
-    extract_already_downloaded: bool
-
-
-@dataclass
-class FilterSettings:
-    """Filter settings."""
-
-    version: Version
-    annotations: bool
-    images: bool
-    blur: bool
-    dnat: bool
-    lidar: bool
-    oxts: bool
-    infos: bool
-    vehicle_data: bool
-    num_scans_before: int
-    num_scans_after: int
-
-    def __str__(self):
-        return "\n".join([f" {key}: {value}" for key, value in self.__dict__.items()])
-
-
-@dataclass
-class DownloadSettings:
-    """Download settings."""
-
-    url: str
-    output_dir: str
-    rm: bool
-    dry_run: bool
-    extract: bool
-    extract_already_downloaded: bool
-    parallel: bool
-    max_workers: int = 8
-
-    def __str__(self):
-        return "\n".join([f" {key}: {value}" for key, value in self.__dict__.items()])
-
-
-class ResumableDropbox(dropbox.Dropbox):
-    """A patched dropbox client that allows to resume downloads."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._headers = {} if self._headers is None else self._headers
-        self._lock = threading.Lock()
-
-    def sharing_get_shared_link_file(self, url, path=None, link_password=None, start=0):
-        """
-        Download the shared link's file from a user's Dropbox.
-        """
-        # Need to lock here because the headers are shared between all requests
-        with self._lock:
-            self._headers["Range"] = f"bytes={start}-"
-            res = super().sharing_get_shared_link_file(url, path=path, link_password=link_password)
-            del self._headers["Range"]
-        return res
-
-
-def _print_final_msg(pbar: tqdm, basename: str):
-    """Print a final message for each downloaded file, with stats and nice padding."""
-    msg = "Downloaded " + basename + " " * (45 - len(basename))
-    total_time = pbar.format_dict["elapsed"]
-    size_in_mb = pbar.n / 1024 / 1024
-    speed_in_mb = size_in_mb / total_time
-    for name, val, unit in [
-        ("time", total_time, "s"),
-        ("size", size_in_mb, "MB"),
-        ("speed", speed_in_mb, "MB/s"),
-    ]:
-        val = f"{val:.2f}{unit}"
-        msg += f"{name}: {val}" + " " * (14 - len(val))
-    tqdm.write(msg)
-
-
-def _download(download_path: str, dbx: ResumableDropbox, info: DownloadExtractInfo):
-    current_size = 0
-    if osp.exists(download_path):
-        current_size = osp.getsize(download_path)
-    basename = osp.basename(info.file_path)
-    pbar = tqdm(
-        total=info.size,
-        unit="iB",
-        unit_scale=True,
-        desc=f"Downloading {basename}...",
-        leave=False,
-        initial=current_size,
-    )
-    if pbar.n > info.size:
-        tqdm.write(
-            f"Error! File {download_path} already exists and is larger than expected. "
-            "Please delete and try again."
-        )
-    if pbar.n > 0:
-        # this means we are retrying or resuming a previously interrupted download
-        tqdm.write(f"Resuming download of {download_path} from {current_size} bytes.")
-    # Retry download if partial file exists (e.g. due to network error)
-    while pbar.n < info.size:
-        try:
-            _, response = dbx.sharing_get_shared_link_file(
-                url=info.url, path=info.file_path, start=pbar.n
-            )
-            with open(download_path, "ab") as f:
-                with contextlib.closing(response):
-                    for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
-                        if chunk:  # filter out keep-alive new chunks
-                            size = f.write(chunk)
-                            pbar.update(size)
-        except requests.exceptions.ChunkedEncodingError:
-            continue  # This can happen when dropbox unexpectly closes the connection
-    # Final checks and printouts
-    assert osp.getsize(download_path) == info.size
-    _print_final_msg(pbar, basename)
-
-
-def _extract(tar_path: str, output_dir: str):
-    """Extract a tar file to a directory."""
-    pbar = tqdm(desc=f"Extracting {osp.basename(tar_path)}...", leave=False)
-    # Check if system has tar installed (pipe stderr to None to avoid printing)
-    if os.system("tar --version > /dev/null 2>&1") == 0:
-        with subprocess.Popen(
-            ["tar", "-xvf", tar_path, "-C", output_dir],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-        ) as tar:
-            for _ in tar.stdout:
-                pbar.update(1)
-        # check if tar exited with error
-        if tar.returncode != 0:
-            tqdm.write(f"Error extracting {tar_path} with tar.")
-    else:
-        # Fallback to python tarfile
-        with tarfile.open(name=tar_path) as tar:
-            # pbar.total = len(tar.getmembers())
-            for member in tar.getmembers():
-                tar.extract(member=member, path=output_dir)
-                pbar.update(1)
-    tqdm.write(f"Extracted {pbar.n} files from {osp.basename(tar_path)}.")
-
-
-def _already_downloaded(download_path: str, target_size: int):
-    """Check if the file is already downloaded."""
-    old_format_path = download_path.rsplit("_", 1)[0]  # /path/to/file_abc123 -> /path/to/file
-    if osp.exists(download_path) and osp.getsize(download_path) == target_size:
-        return True
-    elif osp.exists(old_format_path) and osp.getsize(old_format_path) == target_size:
-        return True
-    return False
-
-
-def _download_and_extract(dbx: ResumableDropbox, info: DownloadExtractInfo):
-    """Download a file from a Dropbox share link to a local path."""
-    should_extract = info.extract
-    file_name = osp.basename(info.file_path)
-    data_name = file_name.split(".")[0]
-    download_path = osp.join(info.dl_dir, f"{file_name}_{info.content_hash[:8]}")
-    if _already_downloaded(download_path, info.size):
-        if not info.extract_already_downloaded:
-            tqdm.write(f"{data_name} already exists. Skipping download and extraction.")
-            should_extract = False
-        else:
-            tqdm.write(f"{data_name} already exists. Skipping download.")
-    elif info.dry_run:
-        msg = "download" if not should_extract else "download and extract"
-        typer.echo(f"Would {msg} {info.file_path} to {download_path}")
-        return
-    else:
-        try:
-            _download(download_path, dbx, info)
-        except Exception as e:
-            print(f"Error downloading {data_name}: {e}. Please retry")
-            return
-
-    if should_extract:
-        try:
-            _extract(download_path, info.extract_dir)
-        except Exception as e:
-            print(f"Error extracting {data_name}: {e}. Please retry")
-            return
-
-    if info.rm and not info.dry_run:
-        os.remove(download_path)
-
-
-def _filter_entry(entry: dropbox.files.Metadata, settings: FilterSettings) -> bool:
-    """Filter the entry based on the flags."""
-    if settings.version == Version.MINI:
-        return "mini" in entry.name
-    elif "mini" in entry.name:
-        return False
-
-    if not settings.annotations and "annotations" in entry.name:
-        return False
-    if "images" in entry.name:
-        if not settings.images:
-            return False
-        if not settings.blur and "blur" in entry.name:
-            return False
-        if not settings.dnat and "dnat" in entry.name:
-            return False
-    if not settings.oxts and "oxts" in entry.name:
-        return False
-    if not settings.infos and "infos" in entry.name:
-        return False
-    if not settings.vehicle_data and "vehicle_data" in entry.name:
-        return False
-    if "lidar" in entry.name:
-        if not settings.lidar:
-            return False
-        if "after" in entry.name or "before" in entry.name:
-            # this is only the case for frames, where we have surrounding frames
-            distance = int(entry.name.split("_")[2][:2])
-            tgt = settings.num_scans_before if "before" in entry.name else settings.num_scans_after
-            if tgt != -1 and distance > tgt:
-                return False
-    return True
-
-
-def _download_dataset(dl_settings: DownloadSettings, filter_settings: FilterSettings, dirname: str):
-    dbx = ResumableDropbox(app_key=APP_KEY, oauth2_refresh_token=REFRESH_TOKEN, timeout=TIMEOUT)
-    entries = _list_folder(dl_settings.url, dbx, dirname)
-    if not entries:
-        typer.echo(
-            "Warning! No files found. Check the url, but this could be a known dropbox error.\n"
-            "We are working on a fix, so please try again layer. Sorry for the inconvenience."
-        )
-        return
-    dl_dir = osp.join(dl_settings.output_dir, "downloads", dirname)
-    files_to_download = [
-        DownloadExtractInfo(
-            url=dl_settings.url,
-            file_path=f"/{dirname}/" + entry.name,
-            dl_dir=dl_dir,
-            extract_dir=dl_settings.output_dir,
-            size=entry.size,
-            content_hash=entry.content_hash,
-            rm=dl_settings.rm,
-            dry_run=dl_settings.dry_run,
-            extract=dl_settings.extract,
-            extract_already_downloaded=dl_settings.extract_already_downloaded,
-        )
-        for entry in entries
-        if _filter_entry(entry, filter_settings)
-    ]
-    if not files_to_download:
-        typer.echo("No files to download. Perhaps you specified too many filters?")
-        return
-
-    if not dl_settings.dry_run:
-        os.makedirs(dl_settings.output_dir, exist_ok=True)
-        os.makedirs(dl_dir, exist_ok=True)
-    if dl_settings.parallel:
-        with ThreadPoolExecutor(max_workers=dl_settings.max_workers) as pool:
-            pool.map(lambda info: _download_and_extract(dbx, info), files_to_download)
-    else:
-        for info in files_to_download:
-            _download_and_extract(dbx, info)
-
-
-def _list_folder(url, dbx: ResumableDropbox, path: str):
-    shared_link = dropbox.files.SharedLink(url=url)
-    try:
-        res = dbx.files_list_folder(path=f"/{path}", shared_link=shared_link)
-    except dropbox.exceptions.ApiError as err:
-        raise click.exceptions.ClickException(
-            f"Dropbox raised the following error:\n\t{err}\nThis could be due to:"
-            '\n\ta) bad url. Please try it in a browser and specify with quotes (--url="").'
-            "\n\tb) zod bandwidth limit. Sorry about this, and please try again the next day."
-            "\n\tc) other error (bad internet connection, dropbox outage, etc.)."
-        )
-
-    entries = res.entries
-    while res.has_more:
-        res = dbx.files_list_folder_continue(res.cursor)
-        entries.extend(res.entries)
-    return entries
+class DownloadSource(str, Enum):
+    DROPBOX = "dropbox"
+    TORRENT = "torrent"
 
 
 def _print_summary(download_settings, filter_settings, subset):
@@ -356,7 +53,7 @@ def download(
     url: str = typer.Option(
         ...,
         help="The dropbox shared folder url",
-        prompt="What is the dropbox url to the dataset (you can get it from zod.zenseact.com)?",
+        prompt="What is the dropbox url to the dataset (you can get it from zod.zenseact.com)? Enter anything if you are downloading via torrent.",  # noqa
         prompt_required=False,
         rich_help_panel=REQ,
     ),
@@ -381,6 +78,15 @@ def download(
         prompt_required=False,
         rich_help_panel=REQ,
     ),
+    source: DownloadSource = typer.Option(
+        ...,
+        "--source",
+        "-s",
+        help="The source of the dataset",
+        prompt="Where do you want to download the dataset from? (torrent or dropbox)",
+        prompt_required=False,
+        rich_help_panel=REQ,
+    ),
     # General download settings
     rm: bool = typer.Option(False, help="Remove the downloaded archives", rich_help_panel=GEN),
     dry_run: bool = typer.Option(False, help="Print what would be downloaded", rich_help_panel=GEN),
@@ -447,7 +153,11 @@ def download(
         f"Download with the above settings?",
         abort=True,
     )
-    _download_dataset(download_settings, filter_settings, subset.folder)
+
+    if source == DownloadSource.DROPBOX:
+        dropbox_download(download_settings, filter_settings, subset.folder)
+    elif source == DownloadSource.TORRENT:
+        torrent_download(download_settings, filter_settings, subset)
 
 
 if __name__ == "__main__":
diff --git a/zod/cli/dropbox_download.py b/zod/cli/dropbox_download.py
new file mode 100644
index 0000000..3866cb1
--- /dev/null
+++ b/zod/cli/dropbox_download.py
@@ -0,0 +1,323 @@
+"""This script is to be used to download the Zenseact Open Dataset."""
+import contextlib
+import os
+import os.path as osp
+import subprocess
+import tarfile
+import threading
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+
+import requests
+from tqdm import tqdm
+
+from zod.cli.utils import Version
+
+try:
+    import click.exceptions
+    import dropbox
+    import dropbox.exceptions
+    import dropbox.files
+    import dropbox.sharing
+    import typer
+except ImportError:
+    print('zod is installed without the CLI dependencies: pip install "zod[cli]"')
+    exit(1)
+
+
+APP_KEY = "kcvokw7c7votepo"
+REFRESH_TOKEN = "z2h_5rjphJEAAAAAAAAAAUWtQlltCYlMbZ2WqMgtymELhiSuKIksxAYeArubKnZV"
+CHUNK_SIZE = 8 * 1024 * 1024  # 8 MB
+TIMEOUT = 8 * 60 * 60  # 8 hours
+
+
+@dataclass
+class DownloadExtractInfo:
+    """Information about a file to extract."""
+
+    url: str
+    file_path: str
+    extract_dir: str
+    dl_dir: str
+    rm: bool
+    dry_run: bool
+    size: int
+    content_hash: str
+    extract: bool
+    extract_already_downloaded: bool
+
+
+@dataclass
+class FilterSettings:
+    """Filter settings."""
+
+    version: Version
+    annotations: bool
+    images: bool
+    blur: bool
+    dnat: bool
+    lidar: bool
+    oxts: bool
+    infos: bool
+    vehicle_data: bool
+    num_scans_before: int
+    num_scans_after: int
+
+    def __str__(self):
+        return "\n".join([f" {key}: {value}" for key, value in self.__dict__.items()])
+
+
+@dataclass
+class DownloadSettings:
+    """Download settings."""
+
+    url: str
+    output_dir: str
+    rm: bool
+    dry_run: bool
+    extract: bool
+    extract_already_downloaded: bool
+    parallel: bool
+    max_workers: int = 8
+
+    def __str__(self):
+        return "\n".join([f" {key}: {value}" for key, value in self.__dict__.items()])
+
+
+class ResumableDropbox(dropbox.Dropbox):
+    """A patched dropbox client that allows resuming downloads."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._headers = {} if self._headers is None else self._headers
+        self._lock = threading.Lock()
+
+    def sharing_get_shared_link_file(self, url, path=None, link_password=None, start=0):
+        """
+        Download the shared link's file from a user's Dropbox.
+        """
+        # Need to lock here because the headers are shared between all requests
+        with self._lock:
+            self._headers["Range"] = f"bytes={start}-"
+            res = super().sharing_get_shared_link_file(url, path=path, link_password=link_password)
+            del self._headers["Range"]
+        return res
+
+
+def _print_final_msg(pbar: tqdm, basename: str):
+    """Print a final message for each downloaded file, with stats and nice padding."""
+    msg = "Downloaded " + basename + " " * (45 - len(basename))
+    total_time = pbar.format_dict["elapsed"]
+    size_in_mb = pbar.n / 1024 / 1024
+    speed_in_mb = size_in_mb / total_time
+    for name, val, unit in [
+        ("time", total_time, "s"),
+        ("size", size_in_mb, "MB"),
+        ("speed", speed_in_mb, "MB/s"),
+    ]:
+        val = f"{val:.2f}{unit}"
+        msg += f"{name}: {val}" + " " * (14 - len(val))
+    tqdm.write(msg)
+
+
+def _download(download_path: str, dbx: ResumableDropbox, info: DownloadExtractInfo):
+    current_size = 0
+    if osp.exists(download_path):
+        current_size = osp.getsize(download_path)
+    basename = osp.basename(info.file_path)
+    pbar = tqdm(
+        total=info.size,
+        unit="iB",
+        unit_scale=True,
+        desc=f"Downloading {basename}...",
+        leave=False,
+        initial=current_size,
+    )
+    if pbar.n > info.size:
+        tqdm.write(
+            f"Error! File {download_path} already exists and is larger than expected. "
+            "Please delete and try again."
+        )
+    if pbar.n > 0:
+        # this means we are retrying or resuming a previously interrupted download
+        tqdm.write(f"Resuming download of {download_path} from {current_size} bytes.")
+    # Retry download if partial file exists (e.g. due to network error)
+    while pbar.n < info.size:
+        try:
+            _, response = dbx.sharing_get_shared_link_file(
+                url=info.url, path=info.file_path, start=pbar.n
+            )
+            with open(download_path, "ab") as f:
+                with contextlib.closing(response):
+                    for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
+                        if chunk:  # filter out keep-alive new chunks
+                            size = f.write(chunk)
+                            pbar.update(size)
+        except requests.exceptions.ChunkedEncodingError:
+            continue  # This can happen when dropbox unexpectedly closes the connection
+    # Final checks and printouts
+    assert osp.getsize(download_path) == info.size
+    _print_final_msg(pbar, basename)
+
+
+def _extract(tar_path: str, output_dir: str):
+    """Extract a tar file to a directory."""
+    pbar = tqdm(desc=f"Extracting {osp.basename(tar_path)}...", leave=False)
+    # Check if system has tar installed (pipe stderr to None to avoid printing)
+    if os.system("tar --version > /dev/null 2>&1") == 0:
+        with subprocess.Popen(
+            ["tar", "-xvf", tar_path, "-C", output_dir],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+        ) as tar:
+            for _ in tar.stdout:
+                pbar.update(1)
+        # check if tar exited with error
+        if tar.returncode != 0:
+            tqdm.write(f"Error extracting {tar_path} with tar.")
+    else:
+        # Fallback to python tarfile
+        with tarfile.open(name=tar_path) as tar:
+            # pbar.total = len(tar.getmembers())
+            for member in tar.getmembers():
+                tar.extract(member=member, path=output_dir)
+                pbar.update(1)
+    tqdm.write(f"Extracted {pbar.n} files from {osp.basename(tar_path)}.")
+
+
+def _already_downloaded(download_path: str, target_size: int):
+    """Check if the file is already downloaded."""
+    old_format_path = download_path.rsplit("_", 1)[0]  # /path/to/file_abc123 -> /path/to/file
+    if osp.exists(download_path) and osp.getsize(download_path) == target_size:
+        return True
+    elif osp.exists(old_format_path) and osp.getsize(old_format_path) == target_size:
+        return True
+    return False
+
+
+def _download_and_extract(dbx: ResumableDropbox, info: DownloadExtractInfo):
+    """Download a file from a Dropbox share link to a local path."""
+    should_extract = info.extract
+    file_name = osp.basename(info.file_path)
+    data_name = file_name.split(".")[0]
+    download_path = osp.join(info.dl_dir, f"{file_name}_{info.content_hash[:8]}")
+    if _already_downloaded(download_path, info.size):
+        if not info.extract_already_downloaded:
+            tqdm.write(f"{data_name} already exists. Skipping download and extraction.")
+            should_extract = False
+        else:
+            tqdm.write(f"{data_name} already exists. Skipping download.")
+    elif info.dry_run:
+        msg = "download" if not should_extract else "download and extract"
+        typer.echo(f"Would {msg} {info.file_path} to {download_path}")
+        return
+    else:
+        try:
+            _download(download_path, dbx, info)
+        except Exception as e:
+            print(f"Error downloading {data_name}: {e}. Please retry")
+            return
+
+    if should_extract:
+        try:
+            _extract(download_path, info.extract_dir)
+        except Exception as e:
+            print(f"Error extracting {data_name}: {e}. Please retry")
+            return
+
+    if info.rm and not info.dry_run:
+        os.remove(download_path)
+
+
+def _filter_entry(entry: dropbox.files.Metadata, settings: FilterSettings) -> bool:
+    """Filter the entry based on the flags."""
+    if settings.version == Version.MINI:
+        return "mini" in entry.name
+    elif "mini" in entry.name:
+        return False
+
+    if not settings.annotations and "annotations" in entry.name:
+        return False
+    if "images" in entry.name:
+        if not settings.images:
+            return False
+        if not settings.blur and "blur" in entry.name:
+            return False
+        if not settings.dnat and "dnat" in entry.name:
+            return False
+    if not settings.oxts and "oxts" in entry.name:
+        return False
+    if not settings.infos and "infos" in entry.name:
+        return False
+    if not settings.vehicle_data and "vehicle_data" in entry.name:
+        return False
+    if "lidar" in entry.name:
+        if not settings.lidar:
+            return False
+        if "after" in entry.name or "before" in entry.name:
+            # this is only the case for frames, where we have surrounding frames
+            distance = int(entry.name.split("_")[2][:2])
+            tgt = settings.num_scans_before if "before" in entry.name else settings.num_scans_after
+            if tgt != -1 and distance > tgt:
+                return False
+    return True
+
+
+def download_dataset(dl_settings: DownloadSettings, filter_settings: FilterSettings, dirname: str):
+    dbx = ResumableDropbox(app_key=APP_KEY, oauth2_refresh_token=REFRESH_TOKEN, timeout=TIMEOUT)
+    entries = _list_folder(dl_settings.url, dbx, dirname)
+    if not entries:
+        typer.echo(
+            "Warning! No files found. Check the url, but this could be a known dropbox error.\n"
+            "We are working on a fix, so please try again later. Sorry for the inconvenience."
+        )
+        return
+    dl_dir = osp.join(dl_settings.output_dir, "downloads", dirname)
+    files_to_download = [
+        DownloadExtractInfo(
+            url=dl_settings.url,
+            file_path=f"/{dirname}/" + entry.name,
+            dl_dir=dl_dir,
+            extract_dir=dl_settings.output_dir,
+            size=entry.size,
+            content_hash=entry.content_hash,
+            rm=dl_settings.rm,
+            dry_run=dl_settings.dry_run,
+            extract=dl_settings.extract,
+            extract_already_downloaded=dl_settings.extract_already_downloaded,
+        )
+        for entry in entries
+        if _filter_entry(entry, filter_settings)
+    ]
+    if not files_to_download:
+        typer.echo("No files to download. Perhaps you specified too many filters?")
+        return
+
+    if not dl_settings.dry_run:
+        os.makedirs(dl_settings.output_dir, exist_ok=True)
+        os.makedirs(dl_dir, exist_ok=True)
+    if dl_settings.parallel:
+        with ThreadPoolExecutor(max_workers=dl_settings.max_workers) as pool:
+            pool.map(lambda info: _download_and_extract(dbx, info), files_to_download)
+    else:
+        for info in files_to_download:
+            _download_and_extract(dbx, info)
+
+
+def _list_folder(url, dbx: ResumableDropbox, path: str):
+    shared_link = dropbox.files.SharedLink(url=url)
+    try:
+        res = dbx.files_list_folder(path=f"/{path}", shared_link=shared_link)
+    except dropbox.exceptions.ApiError as err:
+        raise click.exceptions.ClickException(
+            f"Dropbox raised the following error:\n\t{err}\nThis could be due to:"
+            '\n\ta) bad url. Please try it in a browser and specify with quotes (--url="").'
+            "\n\tb) zod bandwidth limit. Sorry about this, and please try again the next day."
+            "\n\tc) other error (bad internet connection, dropbox outage, etc.)."
+        )
+
+    entries = res.entries
+    while res.has_more:
+        res = dbx.files_list_folder_continue(res.cursor)
+        entries.extend(res.entries)
+    return entries
diff --git a/zod/cli/torrent_download.py b/zod/cli/torrent_download.py
new file mode 100644
index 0000000..4887d3c
--- /dev/null
+++ b/zod/cli/torrent_download.py
@@ -0,0 +1,255 @@
+"""Helper script to download the dataset using academic torrents."""
+
+import os
+import shutil
+import subprocess
+import sys
+import tarfile
+import urllib.request
+from pathlib import Path
+from typing import List, Tuple
+
+from tqdm import tqdm
+
+from zod.cli.dropbox_download import DownloadSettings, FilterSettings
+from zod.cli.utils import SubDataset, Version
+from zod.constants import MINI
+
+TORRENT_WEBSITE = "https://academictorrents.com/"
+
+ZOD_FRAMES_TORRENT = "329ae74426e067ef06b82241b5906400ca3caf03"
+ZOD_SEQUENCES_TORRENT = "95ece9c22470c4f7df51a82bcc3407cc038419be"
+ZOD_DRIVES_TORRENT = "a05ab2e524dc38c1498fcc9fb621329cc97e3837"
+
+TORRENT_FILEMAP = {
+    SubDataset.FRAMES: f"{TORRENT_WEBSITE}/download/{ZOD_FRAMES_TORRENT}.torrent",
+    SubDataset.SEQUENCES: f"{TORRENT_WEBSITE}/download/{ZOD_SEQUENCES_TORRENT}.torrent",
+    SubDataset.DRIVES: f"{TORRENT_WEBSITE}/download/{ZOD_DRIVES_TORRENT}.torrent",
+}
+
+TORRENT_WEBSITEMAP = {
+    SubDataset.FRAMES: f"{TORRENT_WEBSITE}/details/{ZOD_FRAMES_TORRENT}",
+    SubDataset.SEQUENCES: f"{TORRENT_WEBSITE}/details/{ZOD_SEQUENCES_TORRENT}",
+    SubDataset.DRIVES: f"{TORRENT_WEBSITE}/details/{ZOD_DRIVES_TORRENT}",
+}
+
+SIZE_FILEMAP = {
+    SubDataset.FRAMES: 9096553,
+    SubDataset.SEQUENCES: 1280237,
+    SubDataset.DRIVES: 290441,
+}
+
+DRIVES_FILES_TO_IDX = {
+    "annotations": 1,
+    "drives_mini": 2,
+    "images_front_blur": 3,
+    "infos": 4,
+    "lidar_velodyne": 5,
+    "oxts": 6,
+    "vehicle_data": 7,
+}
+
+SEQUENCES_FILES_TO_IDX = {
+    "annotations": 1,
+    "images_front_blur": 2,
+    "infos": 3,
+    "lidar_velodyne": 4,
+    "oxts": 5,
+    "sequences_mini": 6,
+    "vehicle_data": 7,
+}
+
+FRAMES_FILES_TO_IDX = {
+    "annotations": 1,
+    "frames_mini": 2,
+    "images_front_blur": 3,
+    "images_front_dnat": 4,
+    "infos": 5,
+    "lidar_velodyne_01after": 6,
+    "lidar_velodyne_01before": 7,
+    "lidar_velodyne_02after": 8,
+    "lidar_velodyne_02before": 9,
+    "lidar_velodyne_03after": 10,
+    "lidar_velodyne_03before": 11,
+    "lidar_velodyne_04after": 12,
+    "lidar_velodyne_04before": 13,
+    "lidar_velodyne_05after": 14,
+    "lidar_velodyne_05before": 15,
+    "lidar_velodyne_06after": 16,
+    "lidar_velodyne_06before": 17,
+    "lidar_velodyne_07after": 18,
+    "lidar_velodyne_07before": 19,
+    "lidar_velodyne_08after": 20,
+    "lidar_velodyne_08before": 21,
+    "lidar_velodyne_09after": 22,
+    "lidar_velodyne_09before": 23,
+    "lidar_velodyne_10after": 24,
+    "lidar_velodyne_10before": 25,
+    "lidar_velodyne": 26,
+    "oxts": 27,
+}
+
+
+def check_aria_install(subset: SubDataset):
+    # assert that aria2c is installed
+    try:
+        subprocess.check_call(["aria2c", "--version"])
+    except FileNotFoundError:
+        print(
+            "aria2c is not installed. Please install it to download the dataset, e.g. using: `apt install aria2`"  # noqa
+        )
+        print("Alternatively, you can download the dataset manually from:")
+        print(f"\t {TORRENT_WEBSITEMAP[subset]}")
+        sys.exit(1)
+
+
+def get_files_to_download(subset: SubDataset, filter_settings: FilterSettings) -> Tuple[List[str], List[int]]:
+    if subset == SubDataset.DRIVES:
+        files_to_idx = DRIVES_FILES_TO_IDX
+    elif subset == SubDataset.SEQUENCES:
+        files_to_idx = SEQUENCES_FILES_TO_IDX
+    elif subset == SubDataset.FRAMES:
+        files_to_idx = FRAMES_FILES_TO_IDX
+    else:
+        raise ValueError(f"Unknown subset: {subset}")
+
+    # mini dataset
+    if filter_settings.version == Version.MINI:
+        return [f"{subset.name.lower()}_mini.tar.gz"], [files_to_idx[f"{subset.name.lower()}_mini"]]
+
+    files = []
+    # annotations
+    if filter_settings.annotations:
+        files.append("annotations")
+
+    # camera data
+    if filter_settings.images:
+        if filter_settings.blur:
+            files.append("images_front_blur")
+        # we only have dnat images for frames
+        if filter_settings.dnat and subset == SubDataset.FRAMES:
+            files.append("images_front_dnat")
+
+    # gnss data
+    if filter_settings.oxts:
+        files.append("oxts")
+
+    # lidar_velodyne data
+    if filter_settings.lidar:
+        files.append("lidar_velodyne")
+        if subset == SubDataset.FRAMES:
+            if filter_settings.num_scans_before != 0:
+                n_scans = (
+                    10
+                    if filter_settings.num_scans_before == -1
+                    else filter_settings.num_scans_before
+                )
+                for i in range(1, n_scans + 1):
+                    files.append(f"lidar_velodyne_{i:02d}before")
+
+            if filter_settings.num_scans_after != 0:
+                n_scans = (
+                    10 if filter_settings.num_scans_after == -1 else filter_settings.num_scans_after
+                )
+                for i in range(1, n_scans + 1):
+                    files.append(f"lidar_velodyne_{i:02d}after")
+
+    # vehicle data
+    if filter_settings.vehicle_data:
+        if subset != SubDataset.FRAMES:
+            files.append("vehicle_data")
+
+    # frame infos
+    if filter_settings.infos:
+        files.append("infos")
+
+    file_indices = [files_to_idx[f] for f in files]
+
+    files = [f"{file}.tar.gz" for file in files]
+
+    return files, file_indices
+
+
+def download_torrent_file(subset: SubDataset, output_dir: Path):
+    """Download the torrent file for the given subset."""
+    url = TORRENT_FILEMAP[subset]
+    filename = f"{subset.name.lower()}.torrent"
+    print("Downloading torrent file: {}".format(filename))
+    try:
+        urllib.request.urlretrieve(url, filename)
+        # move the torrent file to the output directory
+        os.rename(filename, output_dir / filename)
+
+        assert os.path.getsize(output_dir / filename) == SIZE_FILEMAP[subset], (
+            "Downloaded torrent file is not the correct size. "
+            f"Expected: {SIZE_FILEMAP[subset]}, "
+            f"Actual: {os.path.getsize(output_dir / filename)}"
+        )
+        return output_dir / filename
+    except Exception as e:
+        print(f"Failed to download torrent file: {e}")
+        print("Please ensure that you have the latest stable version of the development kit.")
+        print("If the problem persists, please start a new issue on the GitHub repository.")
+        exit(1)
+
+
+def download_dataset(
+    dl_settings: DownloadSettings, filter_settings: FilterSettings, subset: SubDataset
+):
+    # we have to make sure aria2c is installed
+    if not dl_settings.dry_run:
+        check_aria_install(subset)
+
+    dl_dir = Path(dl_settings.output_dir) / "downloads"
+    dl_dir.mkdir(parents=True, exist_ok=True)
+
+    torrent_filepath = download_torrent_file(subset, dl_dir)
+
+    files, file_indices = get_files_to_download(subset, filter_settings)
+    # create the aria2c command
+    # seed-time sets the number of seconds to seed the torrent after downloading
+    # continue will continue the download if the torrent has already been partially downloaded
+    cmd = [
+        "aria2c",
+        f"--select-file={','.join([str(i) for i in file_indices])}",
+        f"--torrent-file={torrent_filepath}",
+        f"--dir={str(dl_dir)}",
+        "--seed-time=0",
+        "--continue=true",
+    ]
+
+    if dl_settings.dry_run:
+        print(f"Would download: {files}")
+        print("Would run: {}".format(" ".join(cmd)))
+        return
+
+    try:
+        subprocess.check_call(cmd)
+    except Exception as e:
+        print("Failed to download the dataset. Error: {}".format(e))
+        print("Please ensure that you have the latest stable version of the development kit.")
+        print("If the problem persists, please start a new issue on the GitHub repository.")
+        exit(1)
+
+    # aria2c will download adjacent files in the torrent, so we need to remove those files
+    # that we don't want
+    dl_dir = dl_dir / subset.name.lower()
+    for f in dl_dir.iterdir():
+        if f.name not in files:
+            f.unlink()
+
+    # if we are extracting, extract the tar gz files
+    if dl_settings.extract:
+        for f in dl_dir.iterdir():
+            if f.name.endswith(".tar.gz"):
+                with tarfile.open(f, "r:gz") as tar:
+                    for member in tqdm(
+                        iterable=tar.getmembers(),
+                        total=len(tar.getmembers()),
+                        desc=f"Extracting {f.name}",
+                    ):
+                        tar.extract(member=member, path=dl_settings.output_dir)
+
+    # if we are removing the archives, remove them
+    if dl_settings.rm:
+        shutil.rmtree(dl_dir)