From 9d36c74e1350c1aceaa743c1a1d1ae0b620f8020 Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Wed, 17 Jul 2024 16:36:14 +0200 Subject: [PATCH 1/6] do not crash when torchdistributed is not being used --- video2dataset/dataloader/custom_wds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/video2dataset/dataloader/custom_wds.py b/video2dataset/dataloader/custom_wds.py index 4f720524..50cc91eb 100644 --- a/video2dataset/dataloader/custom_wds.py +++ b/video2dataset/dataloader/custom_wds.py @@ -507,7 +507,7 @@ def __init__( main_datapipe.apply_sharding(world_size, global_rank) # synchronize data across processes to prevent hanging if sharding is uneven (which is likely) main_datapipe = main_datapipe.fullsync() - except RuntimeError: + except Exception as e: print("torch distributed not used, not applying sharding in dataloader") pass # start shuffling accross shards for the first time to mix different datasets From e54e60b8f30bd47bcb00b8fba9878f871934aed7 Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Thu, 18 Jul 2024 11:24:35 +0000 Subject: [PATCH 2/6] allowing yt_dlp to run through a proxy --- video2dataset/data_reader.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/video2dataset/data_reader.py b/video2dataset/data_reader.py index 4dda4734..fb2c7609 100644 --- a/video2dataset/data_reader.py +++ b/video2dataset/data_reader.py @@ -165,6 +165,8 @@ class YtDlpDownloader: download_size: preferred height of video to download. Will try to download smallest video >=download_size download_audio_rate: same as size but with audio yt_metadata_args: see get_yt_metadata function docstring + proxy: proxy service + proxy-check-certificate: boolean. 
If False it does not check proxy's certificate """ # TODO: maybe we just include height and width in the metadata_args @@ -172,6 +174,8 @@ def __init__(self, yt_args, tmp_dir, encode_formats): self.metadata_args = yt_args.get("yt_metadata_args", {}) self.video_size = yt_args.get("download_size", 360) self.audio_rate = yt_args.get("download_audio_rate", 44100) + self.proxy = yt_args.get("proxy",None) + self.nocheckcertificate = not yt_args.get("proxy-check-certificate",True) self.tmp_dir = tmp_dir self.encode_formats = encode_formats @@ -198,6 +202,9 @@ def __call__(self, url): "format": audio_fmt_string, "quiet": True, } + if self.proxy: + ydl_opts['proxy'] = self.proxy + ydl_opts['nocheckcertificate'] = self.nocheckcertificate err = None try: @@ -222,6 +229,9 @@ def __call__(self, url): "quiet": True, "no_warnings": True, } + if self.proxy: + ydl_opts['proxy'] = self.proxy + ydl_opts['nocheckcertificate'] = self.nocheckcertificate err = None try: From 87ad8b5312ac019eca136416affc315d6abf04ac Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Thu, 25 Jul 2024 12:08:23 +0000 Subject: [PATCH 3/6] narrowing exception --- video2dataset/dataloader/custom_wds.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/video2dataset/dataloader/custom_wds.py b/video2dataset/dataloader/custom_wds.py index 50cc91eb..fb8f7578 100644 --- a/video2dataset/dataloader/custom_wds.py +++ b/video2dataset/dataloader/custom_wds.py @@ -507,9 +507,12 @@ def __init__( main_datapipe.apply_sharding(world_size, global_rank) # synchronize data across processes to prevent hanging if sharding is uneven (which is likely) main_datapipe = main_datapipe.fullsync() - except Exception as e: - print("torch distributed not used, not applying sharding in dataloader") - pass + except ValueError as e: + if str(e) == "Default process group has not been initialized, please make sure to call init_process_group.": + print("torch distributed not used, not applying sharding in dataloader") + 
else: + raise # re-raise if it's a different ValueError + # start shuffling accross shards for the first time to mix different datasets # (can be the same for all workers, just as an additional shuffled initialization) if shardshuffle > 1 and not resample_prefixes: From 79b311f8a43236f44adecd46c30a125138c59f58 Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Mon, 29 Jul 2024 10:04:11 +0000 Subject: [PATCH 4/6] adding documentation + catching exception independently of torch version --- README.md | 2 +- examples/yt_metadata.md | 13 +++++++++++++ video2dataset/dataloader/custom_wds.py | 3 ++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2e55c9ff..c4a061c8 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,7 @@ These arguments give coarse control over input/output "shape" of the dataset. Fo ## Downloading YouTube Metadata -If we want to download a large amount of YouTube videos with video2dataset we can specify some parameters and also extract useful metadata as well. For directions on how to do so please see this [example](https://github.com/iejMac/video2dataset/blob/main/examples/yt_metadata.md). +If we want to download a large amount of YouTube videos with video2dataset we can specify some parameters - including a proxy to distribute requests - and also extract useful metadata as well. For directions on how to do so please see this [example](https://github.com/iejMac/video2dataset/blob/main/examples/yt_metadata.md). ## Incremental mode diff --git a/examples/yt_metadata.md b/examples/yt_metadata.md index 0d06faf4..c02023d4 100644 --- a/examples/yt_metadata.md +++ b/examples/yt_metadata.md @@ -1,3 +1,16 @@ +### Setting up yt-dlp proxy: +#### Usage + +yt-dlp allows you to set up a proxy to send requests to YouTube. We surface this feature in our config file through the `proxy` option and the `proxy-check-certificate` flag. If `proxy-check-certificate` is set to False, it suppresses HTTPS certificate validation. 
+ +```yaml +yt_args: + download_size: 360 + download_audio_rate: 44100 + proxy: "url:port" + proxy-check-certificate: True / False +``` + ### Download YouTube metadata & subtitles: #### Usage diff --git a/video2dataset/dataloader/custom_wds.py b/video2dataset/dataloader/custom_wds.py index fb8f7578..61c6c73c 100644 --- a/video2dataset/dataloader/custom_wds.py +++ b/video2dataset/dataloader/custom_wds.py @@ -507,9 +507,10 @@ def __init__( main_datapipe.apply_sharding(world_size, global_rank) # synchronize data across processes to prevent hanging if sharding is uneven (which is likely) main_datapipe = main_datapipe.fullsync() - except ValueError as e: + except (RuntimeError, ValueError) as e: if str(e) == "Default process group has not been initialized, please make sure to call init_process_group.": print("torch distributed not used, not applying sharding in dataloader") + pass else: raise # re-raise if it's a different ValueError From 3fc36aac27d224be4619bde0c556101931e58d54 Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Tue, 30 Jul 2024 06:27:21 +0000 Subject: [PATCH 5/6] reformats --- video2dataset/data_reader.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/video2dataset/data_reader.py b/video2dataset/data_reader.py index fb2c7609..fc94774d 100644 --- a/video2dataset/data_reader.py +++ b/video2dataset/data_reader.py @@ -1,4 +1,5 @@ """classes and functions for downloading videos""" + import os import uuid import requests @@ -174,8 +175,8 @@ def __init__(self, yt_args, tmp_dir, encode_formats): self.metadata_args = yt_args.get("yt_metadata_args", {}) self.video_size = yt_args.get("download_size", 360) self.audio_rate = yt_args.get("download_audio_rate", 44100) - self.proxy = yt_args.get("proxy",None) - self.nocheckcertificate = not yt_args.get("proxy-check-certificate",True) + self.proxy = yt_args.get("proxy", None) + self.nocheckcertificate = not yt_args.get("proxy-check-certificate", True) self.tmp_dir = tmp_dir 
self.encode_formats = encode_formats @@ -203,8 +204,8 @@ def __call__(self, url): "quiet": True, } if self.proxy: - ydl_opts['proxy'] = self.proxy - ydl_opts['nocheckcertificate'] = self.nocheckcertificate + ydl_opts["proxy"] = self.proxy + ydl_opts["nocheckcertificate"] = self.nocheckcertificate err = None try: @@ -230,8 +231,8 @@ def __call__(self, url): "no_warnings": True, } if self.proxy: - ydl_opts['proxy'] = self.proxy - ydl_opts['nocheckcertificate'] = self.nocheckcertificate + ydl_opts["proxy"] = self.proxy + ydl_opts["nocheckcertificate"] = self.nocheckcertificate err = None try: From cd6e03521ae0de8f0d46694f408fae14365cd7ee Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Tue, 30 Jul 2024 06:29:09 +0000 Subject: [PATCH 6/6] black/lint change --- video2dataset/dataloader/custom_wds.py | 1 + 1 file changed, 1 insertion(+) diff --git a/video2dataset/dataloader/custom_wds.py b/video2dataset/dataloader/custom_wds.py index 61c6c73c..e8de2e69 100644 --- a/video2dataset/dataloader/custom_wds.py +++ b/video2dataset/dataloader/custom_wds.py @@ -1,4 +1,5 @@ """Custom WebDataset classes""" + import os import numpy as np import random