huggingface · LysandreJik · Mar 15, 2022 · Mar 2, 2022 · Mar 2, 2022 · Mar 2, 2022
diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py
@@ -86,20 +86,49 @@ def hf_hub_url(
     repo_type: Optional[str] = None,
     revision: Optional[str] = None,
 ) -> str:
-    """
-    Resolve a model identifier, a file name, and an optional revision id, to a huggingface.co-hosted url, redirecting
-    to Cloudfront (a Content Delivery Network, or CDN) for large files (more than a few MBs).
-
-    Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our
-    bandwidth costs).
+    """Resolve a url of a file from the given information.
-    """Resolve a url of a file from the given information.
+    """Creates a URL of a file based in the given information.
-    """Resolve a url of a file from the given information.
+    """Creates a URL of a file based in the given information.
 
-    Cloudfront aggressively caches files by default (default TTL is 24 hours), however this is not an issue here
-    because we implement a git-based versioning system on huggingface.co, which means that we store the files on S3/Cloudfront
-    in a content-addressable way (i.e., the file name is its hash). Using content-addressable filenames means cache
-    can't ever be stale.
+    The resolved address can either be a huggingface.co-hosted url, or a link
+    to Cloudfront (a Content Delivery Network, or CDN) for large files which
+    are more than a few MBs.
 
-    In terms of client-side caching from this library, we base our caching on the objects' ETag. An object's ETag is:
-    its git-sha1 if stored in git, or its sha256 if stored in git-lfs.
+    Args:
+        repo_id: A user or an organization name and a repo name seperated by a
+            ``/``.
+
+        filename: The name of the file in the repo.
-        filename: The name of the file in the repo.
+        filename (``str``): 
+            The name of the file in the repo.
-        filename: The name of the file in the repo.
+        filename (``str``): 
+            The name of the file in the repo.
+
+        subfolder: An optional value corresponding to a folder inside the model
+            repo.
+
+        repo_type: Set to :obj:`"dataset"` or :obj:`"space"` if uploading
+            to a dataset or space, :obj:`None` or :obj:`"model"` if uploading
+            to a model. Default is :obj:`None`.
+
+        revision: An optional Git revision id which can be a branch name, a
+            tag, or a commit hash.
+
+    Example:
+        >>> from huggingface_hub import hf_hub_url
+        >>> hf_hub_url(
+        ...    repo_id="julien-c/EsperBERTo-small", filename="pytorch_model.bin"
+        ... )
+        'https://huggingface.co/julien-c/EsperBERTo-small/resolve/main/pytorch_model.bin'
+
+    Notes:
+        Cloudfront is replicated over the globe so downloads are way faster for
+        the end user (and it also lowers our bandwidth costs).
+
+        Cloudfront aggressively caches files by default (default TTL is 24
+        hours), however this is not an issue here because we implement a
+        git-based versioning system on huggingface.co, which means that we
+        store the files on S3/Cloudfront in a content-addressable way (i.e.,
+        the file name is its hash). Using content-addressable filenames means
+        cache can't ever be stale.
+
+        In terms of client-side caching from this library, we base our caching
+        on the objects' ETag. An object's ETag is: its git-sha1 if stored in
+        git, or its sha256 if stored in git-lfs.
     """
     if subfolder is not None:
         filename = f"{subfolder}/{filename}"
@@ -118,11 +147,20 @@ def hf_hub_url(
 
 
 def url_to_filename(url: str, etag: Optional[str] = None) -> str:
-    """
-    Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's,
-    delimited by a period. If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can
-    identify it as a HDF5 file (see
+    """Generate a local filename from a url.
+
+    Convert `url` into a hashed filename in a reproducible way. If `etag` is
+    specified, append its hash to the url's, delimited by a period. If the url
+    ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0
+    can identify it as a HDF5 file (see
     https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
+
+    Args:
+        url: The address to the file
+        etag: The ETag of the file
+
+    Returns:
+        The generated filename.
     """
     url_bytes = url.encode("utf-8")
     filename = sha256(url_bytes).hexdigest()
@@ -168,8 +206,16 @@ def http_user_agent(
     library_version: Optional[str] = None,
     user_agent: Union[Dict, str, None] = None,
 ) -> str:
-    """
-    Formats a user-agent string with basic info about a request.
+    """Formats a user-agent string with basic info about a request.
+
+    Args:
+        library_name: The name of the library to which the object corresponds.
+        library_version: The version of the library.
+        user_agent: The user agent info in the form of a dictionary or a single
+            string.
+
+    Returns:
+        The formated user-agent string.
     """
     if library_name is not None:
         ua = f"{library_name}/{library_version}"
@@ -216,12 +262,21 @@ def _request_with_retry(
     Note that if the environment variable HF_HUB_OFFLINE is set to 1, then a OfflineModeIsEnabled error is raised.
 
     Args:
-        method (str): HTTP method, such as 'GET' or 'HEAD'
-        url (str): The URL of the ressource to fetch
-        max_retries (int): Maximum number of retries, defaults to 0 (no retries)
-        base_wait_time (float): Duration (in seconds) to wait before retrying the first time. Wait time between
-            retries then grows exponentially, capped by max_wait_time.
-        max_wait_time (float): Maximum amount of time between two retries, in seconds
+        method: HTTP method, such as 'GET' or 'HEAD'
+
+        url: The URL of the ressource to fetch
+
+        max_retries: Maximum number of retries, defaults to 0 (no retries)
+
+        base_wait_time: Duration (in seconds) to wait before retrying the first
+            time. Wait time between retries then grows exponentially, capped by
+            ``max_wait_time``.
+
+        max_wait_time: Maximum amount of time between two retries, in seconds
+
+        timeout: How many seconds to wait for the server to send data before
+            giving up which is passed to ``requests.request``.
+
         **params: Params to pass to `requests.request`
     """
     _raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
@@ -303,15 +358,58 @@ def cached_download(
     use_auth_token: Union[bool, str, None] = None,
     local_files_only=False,
 ) -> Optional[str]:  # pragma: no cover
-    """
-    Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the
-    path to the cached file.
+    """Download a given file if it's not already present in the local cache.
+
+    Given a URL, this function looks for the corresponding file in the local
+    cache. If it's not there, download it. Then return the path to the cached
+    file.
+
+    Args:
+        url: The path to the file to be downloaded.
+
+        library_name: The name of the library to which the object corresponds.
+
+        library_version: The version of the library.
+
+        cache_dir: Path to the folder where cached files are stored.
+
+        user_agent: The user-agent info in the form of a dictionary or a
+            string.
+
+        force_download: Whether the file should be downloaded even if it
-        force_download: Whether the file should be downloaded even if it
+        force_download (``bool``, `optional`, defaults to ``False``): 
+            Whether the file should be downloaded even if it
-        force_download: Whether the file should be downloaded even if it
+        force_download (``bool``, `optional`, defaults to ``False``): 
+            Whether the file should be downloaded even if it
+            already exists in the local cache.
+
+        force_filename: Use this name instead of a generated file name.
+
+        proxies: Dictionary mapping protocol to the URL of the proxy passed to
+            ``requests.request``.
+
+        etag_timeout: When fetching ETag, how many seconds to wait for the
+            server to send data before giving up which is passed to
+            ``requests.request``.
+
+        resume_download: If ``True``, resume a previously interrupted download.
+
+        use_auth_token: A token to be used for the download.
+            - If ``True``, the token is read from the HuggingFace config
+              folder.
+            - If a string, it's used as the authentication token.
+
+        local_files_only: If ``True``, avoid downloading the file and return
+            the path to the local cached file if it exists.
 
     Return:
-        Local path (string) of file or if networking is off, last version of file cached on disk.
+        Local path (string) of file or if networking is off, last version of
+        file cached on disk.
 
     Raises:
-        In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
+        - ``EnvironmentError`` if ``use_auth_token=True`` and the token cannot
+            be found.
+
+        - ``OSError`` if ETag cannot be determined.
+
+        - ``ValueError`` if the file cannot be downloaded and cannot be found
+            locally.
     """
     if cache_dir is None:
         cache_dir = HUGGINGFACE_HUB_CACHE
@@ -503,28 +601,67 @@ def hf_hub_download(
     use_auth_token: Union[bool, str, None] = None,
     local_files_only=False,
 ):
-    """
-    Resolve a model identifier, a file name, and an optional revision id, to a huggingface.co file distributed through
-    Cloudfront (a Content Delivery Network, or CDN) for large files (more than a few MBs).
+    """Download a given file if it's not already present in the local cache.
+
+    Args:
+        repo_id: A user or an organization name and a repo name seperated by a
+            ``/``.
+
+        filename: The name of the file in the repo.
 
-    The file is cached locally: look for the corresponding file in the local cache. If it's not there,
-    download it. Then return the path to the cached file.
+        subfolder: An optional value corresponding to a folder inside the model
+            repo.
 
-    Cloudfront is replicated over the globe so downloads are way faster for the end user.
+        repo_type: Set to :obj:`"dataset"` or :obj:`"space"` if uploading
+            to a dataset or space, :obj:`None` or :obj:`"model"` if uploading
+            to a model. Default is :obj:`None`.
 
-    Cloudfront aggressively caches files by default (default TTL is 24 hours), however this is not an issue here
-    because we implement a git-based versioning system on huggingface.co, which means that we store the files on S3/Cloudfront
-    in a content-addressable way (i.e., the file name is its hash). Using content-addressable filenames means cache
-    can't ever be stale.
+        revision: An optional Git revision id which can be a branch name, a
+            tag, or a commit hash.
 
-    In terms of client-side caching from this library, we base our caching on the objects' ETag. An object's ETag is:
-    its git-sha1 if stored in git, or its sha256 if stored in git-lfs.
+        library_name: The name of the library to which the object corresponds.
+
+        library_version: The version of the library.
+
+        cache_dir: Path to the folder where cached files are stored.
+
+        user_agent: The user-agent info in the form of a dictionary or a
+            string.
+
+        force_download: Whether the file should be downloaded even if it
+            already exists in the local cache.
+
+        force_filename: Use this name instead of a generated file name.
+
+        proxies: Dictionary mapping protocol to the URL of the proxy passed to
+            ``requests.request``.
+
+        etag_timeout: When fetching ETag, how many seconds to wait for the
+            server to send data before giving up which is passed to
+            ``requests.request``.
+
+        resume_download: If ``True``, resume a previously interrupted download.
+
+        use_auth_token: A token to be used for the download.
+            - If ``True``, the token is read from the HuggingFace config
+              folder.
+            - If a string, it's used as the authentication token.
+
+        local_files_only: If ``True``, avoid downloading the file and return
+            the path to the local cached file if it exists.
 
     Return:
-        Local path (string) of file or if networking is off, last version of file cached on disk.
+        Local path (string) of file or if networking is off, last version of
+        file cached on disk.
 
     Raises:
-        In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
+        - ``EnvironmentError`` if ``use_auth_token=True`` and the token cannot
+            be found.
+
+        - ``OSError`` if ETag cannot be determined.
+
+        - ``ValueError`` if the file cannot be downloaded and cannot be found
+            locally.
     """
     url = hf_hub_url(
         repo_id, filename, subfolder=subfolder, repo_type=repo_type, revision=revision

diff --git a/src/huggingface_hub/snapshot_download.py b/src/huggingface_hub/snapshot_download.py
@@ -33,7 +33,8 @@ def snapshot_download(
     allow_regex: Optional[Union[List[str], str]] = None,
     ignore_regex: Optional[Union[List[str], str]] = None,
 ) -> str:
-    """
+    """Download all files of a repo.
+
     Downloads a whole snapshot of a repo's files at the specified revision.
     This is useful when you want all files from a repo, because you don't know
     which ones you will need a priori.
@@ -43,13 +44,60 @@ def snapshot_download(
     An alternative would be to just clone a repo but this would require that
     the user always has git and git-lfs installed, and properly configured.
 
-    Note: at some point maybe this format of storage should actually replace
-    the flat storage structure we've used so far (initially from allennlp
-    if I remember correctly).
+    Args:
+        repo_id: A user or an organization name and a repo name seperated by a
+            ``/``.
+
+        revision: An optional Git revision id which can be a branch name, a
+            tag, or a commit hash.
+
+        cache_dir: Path to the folder where cached files are stored.
+
+        library_name: The name of the library to which the object corresponds.
+
+        library_version: The version of the library.
+
+        user_agent: The user-agent info in the form of a dictionary or a
+            string.
+
+        proxies: Dictionary mapping protocol to the URL of the proxy passed to
+            ``requests.request``.
+
+        etag_timeout: When fetching ETag, how many seconds to wait for the
+            server to send data before giving up which is passed to
+            ``requests.request``.
+
+        resume_download: If ``True``, resume a previously interrupted download.
+
+        use_auth_token: A token to be used for the download.
+            - If ``True``, the token is read from the HuggingFace config
+              folder.
+            - If a string, it's used as the authentication token.
+
+        local_files_only: If ``True``, avoid downloading the file and return
+            the path to the local cached file if it exists.
+
+        allow_regex: If provided, only files matching this regex are downladed.
+
+        ignore_regex: If provided, files matching this regex are not
+            downloaded.
 
     Return:
         Local folder path (string) of repo snapshot
+
+    Raises:
+        - ``EnvironmentError`` if ``use_auth_token=True`` and the token cannot
+            be found.
+
+        - ``OSError`` if ETag cannot be determined.
+
+        - ``ValueError`` if the file cannot be downloaded and cannot be found
+            locally.
     """
+    # Note: at some point maybe this format of storage should actually replace
+    # the flat storage structure we've used so far (initially from allennlp
+    # if I remember correctly).
+
     if cache_dir is None:
         cache_dir = HUGGINGFACE_HUB_CACHE
     if revision is None:
@@ -68,7 +116,7 @@ def snapshot_download(
     else:
         token = None
 
-    # remove all `/` occurances to correctly convert repo to directory name
+    # remove all `/` occurrences to correctly convert repo to directory name
     repo_id_flattened = repo_id.replace("/", REPO_ID_SEPARATOR)
 
     # if we have no internet connection we will look for the