
Fix load_dataset for data_files with protocols other than HF #6862

Merged
1 change: 1 addition & 0 deletions setup.py
@@ -172,6 +172,7 @@
"jax>=0.3.14; sys_platform != 'win32'",
"jaxlib>=0.3.14; sys_platform != 'win32'",
"lz4",
+"moto[server]",
"pyspark>=3.4", # https://issues.apache.org/jira/browse/SPARK-40991 fixed in 3.4.0
"py7zr",
"rarfile>=4.0",
2 changes: 1 addition & 1 deletion src/datasets/download/download_config.py
@@ -93,7 +93,7 @@ def __post_init__(self, use_auth_token):
FutureWarning,
)
self.token = use_auth_token
-if "hf" not in self.storage_options:
+if self.token is not None and "hf" not in self.storage_options:
Contributor Author:

I was worried that removing this altogether might break functionality for someone. This still might break CI but let's see before spending more time on it.

Member:

You might have to revert this, since we need the endpoint to be in the storage options even if token is None.

Contributor Author:

I'm trying to account for that here. This way _prepare_path_and_storage_options ensures that the endpoint is in the storage options without needing to populate it for all protocols in __post_init__.
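To illustrate the idea being discussed, here is a standalone sketch of how a helper in the spirit of _prepare_path_and_storage_options could inject the HF endpoint lazily, only for hf:// paths, instead of __post_init__ populating it for every protocol. This is a hypothetical simplification, not the actual datasets internals; the helper name and the hard-coded endpoint stand in for the real implementation and config.HF_ENDPOINT.

```python
from urllib.parse import urlparse

HF_ENDPOINT = "https://huggingface.co"  # stand-in for config.HF_ENDPOINT


def prepare_storage_options(urlpath, storage_options=None, token=None):
    """Hypothetical: fill in the HF endpoint only when the path uses the hf:// protocol."""
    storage_options = dict(storage_options or {})
    protocol = urlparse(urlpath).scheme or "file"
    if protocol == "hf":
        # The endpoint must be present even when token is None.
        hf_options = storage_options.setdefault("hf", {})
        hf_options.setdefault("endpoint", HF_ENDPOINT)
        if token is not None:
            hf_options.setdefault("token", token)
    return storage_options


# hf:// paths get the endpoint even without a token...
hf_opts = prepare_storage_options("hf://datasets/foo/bar.csv")
# ...while other protocols (e.g. s3://) are left untouched.
s3_opts = prepare_storage_options("s3://bucket/key.csv")
```

This keeps DownloadConfig protocol-agnostic while still guaranteeing the endpoint for HF paths at the point of use.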

Member:

I think we can completely remove the default "hf" storage option from DownloadConfig, now that it is properly handled by the call to _prepare_path_and_storage_options made in cached_path.

Member:

Do you agree, @lhoestq?

Member:

yes sounds good :)
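The net effect of the one-line change can be sketched as a standalone function (hypothetical helper name, with the endpoint hard-coded in place of config.HF_ENDPOINT): the default "hf" entry is now created only when a token is actually set, so protocols like s3:// no longer pick up a spurious HF entry.

```python
HF_ENDPOINT = "https://huggingface.co"  # stand-in for config.HF_ENDPOINT


def apply_hf_default(storage_options, token):
    """Hypothetical: mirrors the new guard in DownloadConfig.__post_init__."""
    storage_options = dict(storage_options)
    if token is not None and "hf" not in storage_options:
        storage_options["hf"] = {"token": token, "endpoint": HF_ENDPOINT}
    return storage_options


no_token = apply_hf_default({}, token=None)        # stays empty: no default "hf" entry
with_token = apply_hf_default({}, token="hf_abc")  # old behavior preserved for token users
```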

self.storage_options["hf"] = {"token": self.token, "endpoint": config.HF_ENDPOINT}

def copy(self) -> "DownloadConfig":
44 changes: 44 additions & 0 deletions tests/test_load.py
@@ -4,16 +4,19 @@
import shutil
import tempfile
import time
+from contextlib import contextmanager
from hashlib import sha256
from multiprocessing import Pool
from pathlib import Path
from unittest import TestCase
from unittest.mock import patch

+import boto3
import dill
import pyarrow as pa
import pytest
import requests
+from moto.server import ThreadedMotoServer

import datasets
from datasets import config, load_dataset, load_from_disk
@@ -1648,6 +1651,47 @@ def test_load_from_disk_with_default_in_memory(
_ = load_from_disk(dataset_path)


+@contextmanager
+def moto_server():
+    with patch.dict(
+        os.environ,
+        {
+            "AWS_ENDPOINT_URL": "http://localhost:5000",
+            "AWS_DEFAULT_REGION": "us-east-1",
+            "AWS_ACCESS_KEY_ID": "FOO",
+            "AWS_SECRET_ACCESS_KEY": "BAR",
+        },
+    ):
+        server = ThreadedMotoServer()
+        server.start()
+        try:
+            yield
+        finally:
+            server.stop()
+
+
+def test_load_file_from_s3():
+    # we need server mode here because of an aiobotocore incompatibility with moto.mock_aws
+    # (https://github.com/getmoto/moto/issues/6836)
+    with moto_server():
+        # Create a mock S3 bucket
+        bucket_name = "test-bucket"
+        s3 = boto3.client("s3", region_name="us-east-1")
+        s3.create_bucket(Bucket=bucket_name)
+
+        # Upload a file to the mock bucket
+        key = "test-file.csv"
+        csv_data = "Island\nIsabela\nBaltra"
+
+        s3.put_object(Bucket=bucket_name, Key=key, Body=csv_data)
+
+        # Load the file from the mock bucket
+        ds = datasets.load_dataset("csv", data_files={"train": "s3://test-bucket/test-file.csv"})
+
+        # Check if the loaded content matches the original content
+        assert list(ds["train"]) == [{"Island": "Isabela"}, {"Island": "Baltra"}]
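The test above relies on environment variables for the mock credentials and endpoint. As an aside, load_dataset also accepts a storage_options mapping that is forwarded to the fsspec filesystem for the path's protocol, so the same credentials could be passed explicitly. A sketch using s3fs-style option keys; the endpoint, bucket, and credential values are illustrative, not part of this PR:

```python
# Sketch: passing S3 credentials explicitly via storage_options instead of
# environment variables. Keys follow s3fs conventions ("key", "secret",
# "client_kwargs"); all values below are illustrative placeholders.
storage_options = {
    "key": "FOO",     # access key id
    "secret": "BAR",  # secret access key
    "client_kwargs": {"endpoint_url": "http://localhost:5000"},
}

# ds = datasets.load_dataset(
#     "csv",
#     data_files={"train": "s3://test-bucket/test-file.csv"},
#     storage_options=storage_options,
# )
```

Explicit storage_options would avoid mutating os.environ in the test, at the cost of duplicating the credentials in two places.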


@pytest.mark.integration
def test_remote_data_files():
repo_id = "hf-internal-testing/raw_jsonl"