diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index b377542e402..1504abf80e2 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -29,7 +29,7 @@
     // If missing or the empty string, the tool will be automatically
    // determined by looking for tools on the PATH environment
    // variable.
-    "environment_type": "mamba",
+    "environment_type": "rattler",
     "conda_channels": ["conda-forge"],
 
     // timeout in seconds for installing any dependencies in environment
@@ -76,7 +76,7 @@
     // https://github.com/airspeed-velocity/asv/issues/1389#issuecomment-2076131185
     "build_command": [
         "python -m build",
-        "python -mpip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}"
+        "python -m pip wheel --no-deps --no-build-isolation --no-index -w {build_cache_dir} {build_dir}"
     ],
     // Combinations of libraries/python versions can be excluded/included
     // from the set to test. Each entry is a dictionary containing additional
diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py
index 772d888306c..5efacd9793e 100644
--- a/asv_bench/benchmarks/combine.py
+++ b/asv_bench/benchmarks/combine.py
@@ -5,6 +5,22 @@
 from . import requires_dask
 
 
+class Concat1d:
+    """Benchmark concatenating large datasets"""
+
+    def setup(self) -> None:
+        self.data_arrays = [
+            xr.DataArray(data=np.zeros(4 * 1024 * 1024, dtype=np.int8), dims=["x"])
+            for _ in range(10)
+        ]
+
+    def time_concat(self) -> None:
+        xr.concat(self.data_arrays, dim="x")
+
+    def peakmem_concat(self) -> None:
+        xr.concat(self.data_arrays, dim="x")
+
+
 class Combine1d:
     """Benchmark concatenating and merging large datasets"""
 
diff --git a/ci/requirements/environment-benchmark.yml b/ci/requirements/environment-benchmark.yml
index 0e5c7f4b489..f0a52541505 100644
--- a/ci/requirements/environment-benchmark.yml
+++ b/ci/requirements/environment-benchmark.yml
@@ -12,6 +12,7 @@ dependencies:
   - numba
   - numbagg
   - numexpr
+  - py-rattler
   - numpy>=2.2,<2.3 # https://github.com/numba/numba/issues/10105
   - opt_einsum
   - packaging
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 2617dee17bf..a160d2cc84e 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -40,6 +40,13 @@ Bug Fixes
 - Fix indexing with empty arrays for scipy & h5netcdf backends which now
   resolves to empty slices (:issue:`10867`, :pull:`10870`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
 
+Performance
+~~~~~~~~~~~
+
+- Speedup and reduce memory usage of :py:func:`concat`. Magnitude of improvement scales
+  with size of the concatenation dimension. By `Deepak Cherian <https://github.com/dcherian>`_.
+  :issue:`10864`, :pull:`10866`.
+
 Documentation
 ~~~~~~~~~~~~~
diff --git a/xarray/structure/concat.py b/xarray/structure/concat.py
index bb2d96c10be..69b05880e3d 100644
--- a/xarray/structure/concat.py
+++ b/xarray/structure/concat.py
@@ -745,10 +745,11 @@ def get_indexes(name):
                         yield PandasIndex(data, dim_name, coord_dtype=var.dtype)
 
     # create concatenation index, needed for later reindexing
+    # use np.cumulative_sum(concat_dim_lengths, include_initial=True) when we support numpy>=2
     file_start_indexes = np.append(0, np.cumsum(concat_dim_lengths))
-    concat_index = np.arange(file_start_indexes[-1])
-    concat_index_size = concat_index.size
+    concat_index_size = file_start_indexes[-1]
     variable_index_mask = np.ones(concat_index_size, dtype=bool)
+    variable_reindexer = None
 
     # stack up each variable and/or index to fill-out the dataset (in order)
     # n.b. this loop preserves variable order, needed for groupby.
@@ -776,7 +777,6 @@ def get_indexes(name):
                     end = file_start_indexes[i + 1]
                     variable_index_mask[slice(start, end)] = False
 
-            variable_index = concat_index[variable_index_mask]
             vars = ensure_common_dims(variables, var_concat_dim_length)
 
             # Try to concatenate the indexes, concatenate the variables when no index
@@ -807,12 +807,22 @@ def get_indexes(name):
                     vars, dim_name, positions, combine_attrs=combine_attrs
                 )
                 # reindex if variable is not present in all datasets
-                if len(variable_index) < concat_index_size:
+                if not variable_index_mask.all():
+                    if variable_reindexer is None:
+                        # allocate only once
+                        variable_reindexer = np.empty(
+                            concat_index_size,
+                            # cannot use uint since we need -1 as a sentinel for reindexing
+                            dtype=np.min_scalar_type(-concat_index_size),
+                        )
+                    np.cumsum(variable_index_mask, out=variable_reindexer)
+                    # variable_index_mask is boolean, so the first element is 1.
+                    # offset by 1 to start at 0.
+                    variable_reindexer -= 1
+                    variable_reindexer[~variable_index_mask] = -1
                     combined_var = reindex_variables(
                         variables={name: combined_var},
-                        dim_pos_indexers={
-                            dim_name: pd.Index(variable_index).get_indexer(concat_index)
-                        },
+                        dim_pos_indexers={dim_name: variable_reindexer},
                         fill_value=fill_value,
                     )[name]
                 result_vars[name] = combined_var
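
Note (not part of the patch): a standalone sketch of the masked-cumsum reindexing trick from the concat.py hunks above, checked against the pandas-based indexer that the patch removes. The toy lengths and the missing-variable position are made up for illustration; the variable names mirror the patch.

import numpy as np
import pandas as pd

concat_dim_lengths = [3, 2, 4]  # lengths contributed by each dataset (toy values)
file_start_indexes = np.append(0, np.cumsum(concat_dim_lengths))
concat_index_size = file_start_indexes[-1]

# suppose the variable is missing from the second dataset: mask out its slice
variable_index_mask = np.ones(concat_index_size, dtype=bool)
variable_index_mask[file_start_indexes[1] : file_start_indexes[2]] = False

# new approach: cumsum over the boolean mask gives, at each position,
# 1 + the position of that element within the compressed (present-only)
# array; subtract 1, then mark missing positions with the -1 sentinel
variable_reindexer = np.empty(
    concat_index_size,
    # signed dtype: -1 is the sentinel reindex_variables fills with fill_value
    dtype=np.min_scalar_type(-concat_index_size),
)
np.cumsum(variable_index_mask, out=variable_reindexer)
variable_reindexer -= 1
variable_reindexer[~variable_index_mask] = -1

# old approach: materialize the full concat index and ask pandas for the indexer
concat_index = np.arange(concat_index_size)
variable_index = concat_index[variable_index_mask]
old_indexer = pd.Index(variable_index).get_indexer(concat_index)

np.testing.assert_array_equal(variable_reindexer, old_indexer)
print(variable_reindexer)  # [ 0  1  2 -1 -1  3  4  5  6]

The new path avoids building concat_index and the pandas hash-table lookup entirely, and reuses one preallocated buffer of the smallest signed dtype that can hold the sentinel, which is where the speed and memory wins come from.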