Skip to content

Commit

Permalink
Grab meta from arrow rather than dask.dataframe
Browse files Browse the repository at this point in the history
  • Loading branch information
mrocklin committed Oct 30, 2023
1 parent 3fc1904 commit d037c82
Showing 1 changed file with 10 additions and 8 deletions.
18 changes: 10 additions & 8 deletions dask_expr/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,12 +448,6 @@ def _filtered_task(self, i):

@staticmethod
def to_pandas(t: pa.Table) -> pd.DataFrame:
def types_mapper(pyarrow_dtype):
if pyarrow_dtype == pa.string():
return pd.StringDtype("pyarrow")
if "decimal" in str(pyarrow_dtype) or "date32" in str(pyarrow_dtype):
return pd.ArrowDtype(pyarrow_dtype)

df = t.to_pandas(
use_threads=False,
ignore_metadata=False,
Expand Down Expand Up @@ -504,6 +498,13 @@ def _simplify_up(self, parent):
# return Literal(sum(_lengths))


def types_mapper(pyarrow_dtype):
if pyarrow_dtype == pa.string():
return pd.StringDtype("pyarrow")
if "decimal" in str(pyarrow_dtype) or "date32" in str(pyarrow_dtype):
return pd.ArrowDtype(pyarrow_dtype)


@functools.lru_cache
def meta_and_filenames(path):
if str(path).startswith("s3://"):
Expand All @@ -520,9 +521,10 @@ def meta_and_filenames(path):
else:
filenames = [path] # TODO: split by row group

import dask.dataframe as dd
ds = pq.ParquetDataset(path)
t = pa.Table.from_pylist([], schema=ds.schema)
meta = t.to_pandas(types_mapper=types_mapper)

meta = dd.read_parquet(path)._meta
return meta, filenames


Expand Down

0 comments on commit d037c82

Please sign in to comment.