F proxy ytdlp #350

Open
wants to merge 6 commits into base: main
2 changes: 1 addition & 1 deletion README.md
@@ -163,7 +163,7 @@ These arguments give coarse control over input/output "shape" of the dataset. Fo

## Downloading YouTube Metadata

If we want to download a large amount of YouTube videos with video2dataset we can specify some parameters and also extract useful metadata as well. For directions on how to do so please see this [example](https://github.com/iejMac/video2dataset/blob/main/examples/yt_metadata.md).
If we want to download a large number of YouTube videos with video2dataset, we can specify some parameters - including a proxy to distribute requests - and also extract useful metadata. For directions on how to do so, please see this [example](https://github.com/iejMac/video2dataset/blob/main/examples/yt_metadata.md).

## Incremental mode

13 changes: 13 additions & 0 deletions examples/yt_metadata.md
@@ -1,3 +1,16 @@
### Setting up yt-dlp proxy:
#### Usage

yt-dlp allows you to set up a proxy through which requests to YouTube are sent. We surface this feature in our config file through the `proxy` key and the `proxy-check-certificate` flag. If `proxy-check-certificate` is set to False, HTTPS certificate validation is suppressed.

```yaml
yt_args:
download_size: 360
download_audio_rate: 44100
proxy: "url:port"
proxy-check-certificate: True / False
```
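
For orientation, here is a minimal sketch (not part of the config file) of how these two settings translate into yt-dlp options inside the downloader; the proxy URL is a placeholder:

```python
# Minimal sketch, mirroring the yt_args handling added in data_reader.py below.
# The proxy URL is a placeholder; "proxy-check-certificate": False becomes
# yt-dlp's nocheckcertificate=True (i.e. HTTPS certificate validation is skipped).
yt_args = {
    "download_size": 360,
    "download_audio_rate": 44100,
    "proxy": "http://myproxy.example:3128",
    "proxy-check-certificate": False,
}

ydl_opts = {"quiet": True}
if yt_args.get("proxy"):
    ydl_opts["proxy"] = yt_args["proxy"]
    ydl_opts["nocheckcertificate"] = not yt_args.get("proxy-check-certificate", True)

print(ydl_opts)
# {'quiet': True, 'proxy': 'http://myproxy.example:3128', 'nocheckcertificate': True}
```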

### Download YouTube metadata & subtitles:
#### Usage

11 changes: 11 additions & 0 deletions video2dataset/data_reader.py
@@ -1,4 +1,5 @@
"""classes and functions for downloading videos"""

import os
import uuid
import requests
@@ -165,13 +166,17 @@ class YtDlpDownloader:
download_size: preferred height of video to download. Will try to download smallest video >=download_size
download_audio_rate: same as size but with audio
yt_metadata_args: see get_yt_metadata function docstring
proxy: proxy URL through which yt-dlp requests are routed
proxy-check-certificate: boolean. If False, the proxy's HTTPS certificate is not checked
"""

# TODO: maybe we just include height and width in the metadata_args
def __init__(self, yt_args, tmp_dir, encode_formats):
self.metadata_args = yt_args.get("yt_metadata_args", {})
self.video_size = yt_args.get("download_size", 360)
self.audio_rate = yt_args.get("download_audio_rate", 44100)
self.proxy = yt_args.get("proxy", None)
self.nocheckcertificate = not yt_args.get("proxy-check-certificate", True)
self.tmp_dir = tmp_dir
self.encode_formats = encode_formats
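
For illustration, a hedged sketch of constructing the downloader directly with these arguments (constructor signature from the hunk above; the proxy URL, temp directory, and encode formats are placeholder values):

```python
# Sketch only: argument names follow the diff above; the proxy URL, temp
# directory, and encode formats are placeholders.
from video2dataset.data_reader import YtDlpDownloader

downloader = YtDlpDownloader(
    yt_args={
        "download_size": 360,
        "download_audio_rate": 44100,
        "proxy": "http://myproxy.example:3128",
        "proxy-check-certificate": False,
    },
    tmp_dir="/tmp/video2dataset",
    encode_formats={"video": "mp4", "audio": "m4a"},
)
# Omitting "proxy" leaves self.proxy as None, so the yt-dlp options stay untouched;
# "proxy-check-certificate" defaults to True, i.e. certificate validation stays on.
```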

@@ -198,6 +203,9 @@ def __call__(self, url):
"format": audio_fmt_string,
"quiet": True,
}
if self.proxy:
ydl_opts["proxy"] = self.proxy
ydl_opts["nocheckcertificate"] = self.nocheckcertificate

err = None
try:
@@ -222,6 +230,9 @@ def __call__(self, url):
"quiet": True,
"no_warnings": True,
}
if self.proxy:
ydl_opts["proxy"] = self.proxy
ydl_opts["nocheckcertificate"] = self.nocheckcertificate

err = None
try:
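
Stepping outside the diff for a moment: the two blocks above feed `proxy` and `nocheckcertificate` straight into yt-dlp, which accepts them as standard options. A minimal standalone sketch (video URL and proxy endpoint are placeholders):

```python
# Standalone sketch of the yt-dlp options being set above; "proxy" and
# "nocheckcertificate" are regular yt-dlp options, the URLs are placeholders.
import yt_dlp

ydl_opts = {
    "quiet": True,
    "proxy": "http://myproxy.example:3128",
    "nocheckcertificate": True,  # skip HTTPS certificate validation (e.g. for re-signing proxies)
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info("https://www.youtube.com/watch?v=<VIDEO_ID>", download=False)
    print(info.get("title"))
```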
11 changes: 8 additions & 3 deletions video2dataset/dataloader/custom_wds.py
@@ -1,4 +1,5 @@
"""Custom WebDataset classes"""

import os
import numpy as np
import random
@@ -507,9 +508,13 @@ def __init__(
    main_datapipe.apply_sharding(world_size, global_rank)
    # synchronize data across processes to prevent hanging if sharding is uneven (which is likely)
    main_datapipe = main_datapipe.fullsync()
except RuntimeError:
    print("torch distributed not used, not applying sharding in dataloader")
    pass
except (RuntimeError, ValueError) as e:
    if str(e) == "Default process group has not been initialized, please make sure to call init_process_group.":
        print("torch distributed not used, not applying sharding in dataloader")
        pass
    else:
        raise  # re-raise if it's a different ValueError

# start shuffling accross shards for the first time to mix different datasets
# (can be the same for all workers, just as an additional shuffled initialization)
if shardshuffle > 1 and not resample_prefixes:
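
As a side note on the error handling above: an alternative (sketched below, not what this PR does) is to ask torch.distributed directly whether a default process group exists instead of matching the error string; it is shown here only to clarify what the check means.

```python
# Illustrative alternative to string-matching the error message: detect whether
# a default process group exists before applying sharding/fullsync.
import torch.distributed as dist

def distributed_initialized() -> bool:
    return dist.is_available() and dist.is_initialized()

if not distributed_initialized():
    print("torch distributed not used, not applying sharding in dataloader")
```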