4 changes: 2 additions & 2 deletions asv_bench/asv.conf.json
@@ -29,7 +29,7 @@
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "mamba",
"environment_type": "rattler",
"conda_channels": ["conda-forge"],

// timeout in seconds for installing any dependencies in environment
@@ -76,7 +76,7 @@
// https://github.com/airspeed-velocity/asv/issues/1389#issuecomment-2076131185
"build_command": [
"python -m build",
"python -mpip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}"
"python -m pip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}"
],
// Combinations of libraries/python versions can be excluded/included
// from the set to test. Each entry is a dictionary containing additional
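As a hedged aside, a minimal pre-flight check one could run before invoking asv with the new backend. It assumes the py-rattler package (added to the benchmark environment below) is importable as `rattler`; that import name is not stated anywhere in this diff.

```python
# Sketch: verify the dependency behind environment_type "rattler" is present
# before running the benchmarks. The module name "rattler" is an assumption.
import importlib.util

if importlib.util.find_spec("rattler") is None:
    raise SystemExit("py-rattler not found; install it (e.g. from conda-forge) before `asv run`")
print("py-rattler available; asv can build 'rattler' environments")
```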
16 changes: 16 additions & 0 deletions asv_bench/benchmarks/combine.py
@@ -5,6 +5,22 @@
from . import requires_dask


class Concat1d:
"""Benchmark concatenating large datasets"""

def setup(self) -> None:
self.data_arrays = [
xr.DataArray(data=np.zeros(4 * 1024 * 1024, dtype=np.int8), dims=["x"])
for _ in range(10)
]

def time_concat(self) -> None:
xr.concat(self.data_arrays, dim="x")

def peakmem_concat(self) -> None:
xr.concat(self.data_arrays, dim="x")


class Combine1d:
"""Benchmark concatenating and merging large datasets"""

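For reference, a standalone sketch (outside asv's harness) of what the new Concat1d benchmark exercises. The array sizes mirror `setup` above; the final shape assertion is added here purely for illustration and is not part of the benchmark.

```python
import numpy as np
import xarray as xr

# Mirror Concat1d.setup: ten 4 MiB int8 arrays along a shared "x" dimension.
data_arrays = [
    xr.DataArray(data=np.zeros(4 * 1024 * 1024, dtype=np.int8), dims=["x"])
    for _ in range(10)
]

# asv times this call (time_concat) and tracks its peak memory (peakmem_concat).
combined = xr.concat(data_arrays, dim="x")
assert combined.sizes["x"] == 10 * 4 * 1024 * 1024
```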
1 change: 1 addition & 0 deletions ci/requirements/environment-benchmark.yml
@@ -12,6 +12,7 @@ dependencies:
- numba
- numbagg
- numexpr
- py-rattler
- numpy>=2.2,<2.3 # https://github.com/numba/numba/issues/10105
- opt_einsum
- packaging
7 changes: 7 additions & 0 deletions doc/whats-new.rst
@@ -40,6 +40,13 @@ Bug Fixes
- Fix indexing with empty arrays for scipy & h5netcdf backends which now resolves to empty slices (:issue:`10867`, :pull:`10870`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_

Performance
~~~~~~~~~~~

- Speed up :py:func:`concat` and reduce its memory usage. The magnitude of the improvement
  scales with the size of the concatenation dimension (:issue:`10864`, :pull:`10866`).
  By `Deepak Cherian <https://github.com/dcherian>`_.

Documentation
~~~~~~~~~~~~~

24 changes: 17 additions & 7 deletions xarray/structure/concat.py
@@ -745,10 +745,11 @@ def get_indexes(name):
yield PandasIndex(data, dim_name, coord_dtype=var.dtype)

# create concatenation index, needed for later reindexing
# use np.cumulative_sum(concat_dim_lengths, include_initial=True) when we support numpy>=2
file_start_indexes = np.append(0, np.cumsum(concat_dim_lengths))
concat_index = np.arange(file_start_indexes[-1])
concat_index_size = concat_index.size
concat_index_size = file_start_indexes[-1]
variable_index_mask = np.ones(concat_index_size, dtype=bool)
variable_reindexer = None

# stack up each variable and/or index to fill-out the dataset (in order)
# n.b. this loop preserves variable order, needed for groupby.
@@ -776,7 +777,6 @@ def get_indexes(name):
end = file_start_indexes[i + 1]
variable_index_mask[slice(start, end)] = False

variable_index = concat_index[variable_index_mask]
vars = ensure_common_dims(variables, var_concat_dim_length)

# Try to concatenate the indexes, concatenate the variables when no index
@@ -807,12 +807,22 @@
vars, dim_name, positions, combine_attrs=combine_attrs
)
# reindex if variable is not present in all datasets
if len(variable_index) < concat_index_size:
if not variable_index_mask.all():
if variable_reindexer is None:
# allocate only once
variable_reindexer = np.empty(
concat_index_size,
# cannot use uint since we need -1 as a sentinel for reindexing
dtype=np.min_scalar_type(-concat_index_size),
)
np.cumsum(variable_index_mask, out=variable_reindexer)
# cumsum over the boolean mask counts from 1 at the first present
# position; offset by 1 so present positions index from 0.
variable_reindexer -= 1
variable_reindexer[~variable_index_mask] = -1
combined_var = reindex_variables(
variables={name: combined_var},
dim_pos_indexers={
dim_name: pd.Index(variable_index).get_indexer(concat_index)
},
dim_pos_indexers={dim_name: variable_reindexer},
fill_value=fill_value,
)[name]
result_vars[name] = combined_var
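To make the reindexer construction above concrete, here is a self-contained sketch that checks the new cumsum-based indexer against the pandas `get_indexer` call it replaces. The dataset lengths and the "missing variable" mask are invented illustration values, not taken from the PR.

```python
import numpy as np
import pandas as pd

# Illustrative sizes: three datasets contributing 3, 2, and 4 elements along
# the concat dimension; the variable is missing from the middle dataset.
concat_dim_lengths = [3, 2, 4]
file_start_indexes = np.append(0, np.cumsum(concat_dim_lengths))
concat_index_size = file_start_indexes[-1]

variable_index_mask = np.ones(concat_index_size, dtype=bool)
# Mark the middle dataset's slots as "variable not present".
variable_index_mask[file_start_indexes[1]:file_start_indexes[2]] = False

# New approach: cumulative sum over the mask, shifted to be 0-based, with -1
# as the sentinel for positions that must be filled during reindexing.
variable_reindexer = np.empty(
    concat_index_size,
    # signed dtype, since -1 is needed as a sentinel
    dtype=np.min_scalar_type(-concat_index_size),
)
np.cumsum(variable_index_mask, out=variable_reindexer)
variable_reindexer -= 1
variable_reindexer[~variable_index_mask] = -1

# Old approach: build explicit integer indexes and ask pandas for the
# positional indexer (also -1 where the target value is absent).
concat_index = np.arange(concat_index_size)
variable_index = concat_index[variable_index_mask]
expected = pd.Index(variable_index).get_indexer(concat_index)

np.testing.assert_array_equal(variable_reindexer, expected)
```

The cumsum path avoids materializing `concat_index` and the pandas hash-table lookup, which is where the memory and speed gains described in the whats-new entry come from.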