diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index 4dc803b3259..132201a349e 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -111,6 +111,7 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs): return read_csv_without_chunksize(path, **kwargs) dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") + usecols = kwargs.pop("usecols", None) meta = dask_reader(filenames[0], **kwargs)._meta dsk = {} @@ -130,11 +131,14 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs): "names" ] = meta.columns # no header in the middle of the file kwargs2["header"] = None + kwargs2["usecols"] = usecols dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) i += 1 divisions = [None] * (len(dsk) + 1) + if usecols is not None: + meta = meta[usecols] return dd.core.new_dd_object(dsk, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index db1c47c8819..98061f6c624 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -134,3 +134,19 @@ def test_read_csv_chunksize_none(tmp_path, compression, size): df.to_csv(path, index=False, compression=compression) df2 = dask_cudf.read_csv(path, chunksize=None, dtype=typ) dd.assert_eq(df, df2) + + +def test_csv_reader_usecols(tmp_path): + df = cudf.DataFrame( + { + "a": [1, 2, 3, 4] * 100, + "b": ["a", "b", "c", "d"] * 100, + "c": [10, 11, 12, 13] * 100, + } + ) + csv_path = str(tmp_path / "usecols_data.csv") + df.to_csv(csv_path, index=False) + ddf = dask_cudf.from_cudf(df[["b", "c"]], npartitions=5) + ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"]) + + dd.assert_eq(ddf, ddf2, check_divisions=False, check_index=False)