diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 019873e38..029273cdf 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -1690,6 +1690,7 @@ def concatenate( batch_categories=batch_categories, uns_merge=uns_merge, fill_value=fill_value, + index_unique=index_unique, ) # Backwards compat, ordering columns: diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 9fc8317c0..e3205ce7e 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -432,32 +432,27 @@ def concat( if batch_categories is None: batch_categories = np.arange(len(adatas)).astype(str) - # Combining indexes - obs_names = pd.Index( - np.concatenate( - [ - pd.Series(a.obs_names) + f"{index_unique}{batch}" - for batch, a in zip(batch_categories, adatas) - ] - ) + # Batch column + batch = pd.Categorical.from_codes( + np.repeat(np.arange(len(adatas)), [a.n_obs for a in adatas]), + categories=batch_categories, ) + + # Combining indexes + obs_names = pd.concat([pd.Series(a.obs_names) for a in adatas], ignore_index=True) + if index_unique is not None: + obs_names = obs_names.str.cat(batch.map(str), sep=index_unique) + obs_names = pd.Index(obs_names) + var_names = resolve_index([a.var_names for a in adatas], join=join) reindexers = [ gen_reindexer(var_names, a.var_names, fill_value=fill_value) for a in adatas ] # Obs - # fmt: off - batch = ( - pd.Series( - np.repeat(np.arange(len(adatas)), [a.n_obs for a in adatas]), dtype="category" - ) - .map(dict(zip(np.arange(len(adatas)), batch_categories))) - ) - # fmt: on obs = pd.concat([a.obs for a in adatas], ignore_index=True) obs.index = obs_names - obs[batch_key] = batch.values + obs[batch_key] = batch # Var var = merge_dataframes( diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index f41b27bac..7a9f90748 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -46,7 +46,6 @@ def fix_known_differences(orig, result): orig = orig.copy() result = result.copy() - result.obs_names = result.obs_names.str.extract(r"^(.*)-\d+$", expand=False) result.obs.drop(columns=["batch"], inplace=True) result.strings_to_categoricals() # Should this be implicit in concatenation? @@ -76,7 +75,9 @@ def test_concatenate_roundtrip(join_type, array_type): subsets.append(adata[subset_idx]) remaining = remaining.difference(subset_idx) - result = subsets[0].concatenate(subsets[1:], join=join_type, uns_merge="same") + result = subsets[0].concatenate( + subsets[1:], join=join_type, uns_merge="same", index_unique=None + ) # Correcting for known differences orig, result = fix_known_differences(adata, result)