From c49c7524c560735fbd38d4d7bea4e37b0c6078d0 Mon Sep 17 00:00:00 2001 From: Robert Xiu Date: Fri, 3 Jun 2022 14:52:22 -0700 Subject: [PATCH] Add list_file() functional API to FSSpecFileLister and IoPathFileLister (#463) Summary: Fixes https://github.com/pytorch/data/issues/387 ### Changes - Adds the `list_files_by_iopath()` functional method on `IoPathFileListerIterDataPipe` - Adds the `list_files_by_fsspec()` functional method on `FSSpecFileListerIterDataPipe` - Adds tests for those methods #### Additional comments I feel as if the implementation is quite naive. Would appreciate any feedback on it. Pull Request resolved: https://github.com/pytorch/data/pull/463 Reviewed By: NivekT Differential Revision: D36777142 Pulled By: ejguan fbshipit-source-id: 1c4474776f3fcd377ae545bd8bd7bf26d0b2fa88 --- test/test_fsspec.py | 23 +++++++++++++++++++++++ test/test_local_io.py | 11 +++++++++++ torchdata/datapipes/iter/load/fsspec.py | 1 + torchdata/datapipes/iter/load/iopath.py | 1 + 4 files changed, 36 insertions(+) diff --git a/test/test_fsspec.py b/test/test_fsspec.py index 7e596ef6e..2397ead97 100644 --- a/test/test_fsspec.py +++ b/test/test_fsspec.py @@ -64,6 +64,15 @@ def test_fsspec_file_lister_iterdatapipe(self): {fsspec.implementations.local.make_path_posix(file) for file in self.temp_sub_files}, ) + # checks for functional API + datapipe = IterableWrapper(["file://" + self.temp_sub_dir.name]) + datapipe = datapipe.list_files_by_fsspec() + for path in datapipe: + self.assertIn( + path.split("://")[1], + {fsspec.implementations.local.make_path_posix(file) for file in self.temp_sub_files}, + ) + @skipIfNoFSSpec def test_fsspec_file_lister_iterdatapipe_with_list(self): datapipe = FSSpecFileLister(root=["file://" + self.temp_sub_dir.name, "file://" + self.temp_sub_dir_2.name]) @@ -82,6 +91,20 @@ def test_fsspec_file_lister_iterdatapipe_with_list(self): # check all file paths within sub_folder are listed self.assertEqual(file_lister, temp_files) + # checks for functional API + datapipe = IterableWrapper(["file://" + 
self.temp_sub_dir.name, "file://" + self.temp_sub_dir_2.name]) + datapipe = datapipe.list_files_by_fsspec() + res = list(map(lambda path: path.split("://")[1], datapipe)) + res.sort() + temp_files = list( + map( + lambda file: fsspec.implementations.local.make_path_posix(file), + self.temp_sub_files + self.temp_sub_files_2, + ) + ) + temp_files.sort() + self.assertEqual(res, temp_files) + @skipIfNoFSSpec def test_fsspec_file_loader_iterdatapipe(self): datapipe1 = FSSpecFileLister(root="file://" + self.temp_sub_dir.name) diff --git a/test/test_local_io.py b/test/test_local_io.py index 3be37a5ed..29f8de3b5 100644 --- a/test/test_local_io.py +++ b/test/test_local_io.py @@ -660,6 +660,11 @@ def test_io_path_file_lister_iterdatapipe(self): for path in datapipe: self.assertTrue(path in self.temp_sub_files) + datapipe = IterableWrapper([self.temp_sub_dir.name]) + datapipe = datapipe.list_files_by_iopath() + for path in datapipe: + self.assertTrue(path in self.temp_sub_files) + @skipIfNoIoPath def test_io_path_file_lister_iterdatapipe_with_list(self): datapipe = IoPathFileLister(root=[self.temp_sub_dir.name, self.temp_sub_dir_2.name]) @@ -672,6 +677,12 @@ def test_io_path_file_lister_iterdatapipe_with_list(self): # check all file paths within sub_folder are listed self.assertEqual(file_lister, all_temp_files) + datapipe = IterableWrapper([self.temp_sub_dir.name, self.temp_sub_dir_2.name]) + datapipe = datapipe.list_files_by_iopath() + results = list(datapipe) + results.sort() + self.assertEqual(results, all_temp_files) + @skipIfNoIoPath def test_io_path_file_loader_iterdatapipe(self): datapipe1 = IoPathFileLister(root=self.temp_sub_dir.name) diff --git a/torchdata/datapipes/iter/load/fsspec.py b/torchdata/datapipes/iter/load/fsspec.py index 2ded99eee..14e9128d6 100644 --- a/torchdata/datapipes/iter/load/fsspec.py +++ b/torchdata/datapipes/iter/load/fsspec.py @@ -33,6 +33,7 @@ def _assert_fsspec() -> None: ) +@functional_datapipe("list_files_by_fsspec") class 
FSSpecFileListerIterDataPipe(IterDataPipe[str]): r""" Lists the contents of the directory at the provided ``root`` pathname or URL, diff --git a/torchdata/datapipes/iter/load/iopath.py b/torchdata/datapipes/iter/load/iopath.py index 0a04d6517..61a3db2dd 100644 --- a/torchdata/datapipes/iter/load/iopath.py +++ b/torchdata/datapipes/iter/load/iopath.py @@ -39,6 +39,7 @@ def _create_default_pathmanager(): return pathmgr +@functional_datapipe("list_files_by_iopath") class IoPathFileListerIterDataPipe(IterDataPipe[str]): r""" Lists the contents of the directory at the provided ``root`` pathname or URL,