Skip to content

Commit

Permalink
Deduplicate filter_obs/filter_var code
Browse files Browse the repository at this point in the history
  • Loading branch information
gtca committed Oct 16, 2024
1 parent 04489bf commit 1909f51
Showing 1 changed file with 90 additions and 168 deletions.
258 changes: 90 additions & 168 deletions muon/_core/preproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -657,23 +657,24 @@ def intersect_obs(mdata: MuData):
return


# Utility functions: filtering observations
# Utility functions: filtering observations or variables


def filter_obs(
data: Union[AnnData, MuData], var: Union[str, Sequence[str]], func: Optional[Callable] = None
def _filter_attr(
data: Union[AnnData, MuData],
attr: Literal["obs", "var"],
key: Union[str, Sequence[str]],
func: Optional[Callable] = None,
) -> None:
"""
Filter observations (samples or cells) in-place
using any column in .obs or in .X.
Filter observations or variables in-place.
Parameters
----------
data: AnnData or MuData
AnnData or MuData object
var: str or Sequence[str]
Column name in .obs or in .X to be used for filtering.
Alternatively, obs_names can be provided directly.
key: str or Sequence[str]
Names or key to filter
func
Function to apply to the variable used for filtering.
If the variable is of type boolean and func is an identity function,
Expand All @@ -694,59 +695,76 @@ def filter_obs(
"MuData object is backed. The requested subset of the .X matrices of its modalities will be read into memory, and the object will not be backed anymore."
)

if isinstance(var, str):
if var in data.obs.columns:
assert attr in ("obs", "var"), "Attribute has to be either 'obs' or 'var'."

df = getattr(data, attr)
names = getattr(data, f"{attr}_names")
other = "obs" if attr == "var" else "var"
other_names = getattr(data, f"{other}_names")
attrm = getattr(data, f"{attr}m")
attrp = getattr(data, f"{attr}p")

if isinstance(key, str):
if key in df.columns:
if func is None:
if data.obs[var].dtypes.name == "bool":
if df[key].dtypes.name == "bool":

def func(x):
return x

else:
raise ValueError(f"Function has to be provided since {var} is not boolean")
obs_subset = func(data.obs[var].values)
elif var in data.var_names:
obs_subset = func(data.X[:, np.where(data.var_names == var)[0]].reshape(-1))
raise ValueError(f"Function has to be provided since {key} is not boolean")
subset = func(df[key].values)
elif key in other_names:
if attr == "obs":
subset = func(data.X[:, np.where(other_names == key)[0]].reshape(-1))
else:
subset = func(data.X[np.where(other_names == key)[0], :].reshape(-1))
else:
raise ValueError(
f"Column name from .obs or one of the var_names was expected but got {var}."
f"Column name from .{attr} or one of the {other}_names was expected but got {key}."
)
else:
if func is None:
if np.array(var).dtype == bool:
obs_subset = np.array(var)
if np.array(key).dtype == bool:
subset = np.array(key)
else:
obs_subset = data.obs_names.isin(var)
subset = names.isin(key)
else:
raise ValueError("When providing obs_names directly, func has to be None.")
raise ValueError(f"When providing {attr}_names directly, func has to be None.")

if isinstance(data, AnnData):
# Collect elements to subset
# NOTE: accessing them after subsetting .obs
# NOTE: accessing them after subsetting .obs/.var
# will fail due to _validate_value()
obsm = dict(data.obsm)
obsp = dict(data.obsp)
attrm = dict(attrm)
attrp = dict(attrp)

# Subset .obs
data._obs = data.obs[obs_subset]
data._n_obs = data.obs.shape[0]
# Subset .obs/.var
setattr(data, f"_{attr}", df[subset])

# Subset .obsm
for k, v in obsm.items():
obsm[k] = v[obs_subset]
data.obsm = obsm
# Subset .obsm/.varm
for k, v in attrm.items():
attrm[k] = v[subset]
setattr(data, f"{attr}m", attrm)

# Subset .obsp
for k, v in obsp.items():
obsp[k] = v[obs_subset][:, obs_subset]
data.obsp = obsp
# Subset .obsp/.obsp
for k, v in attrp.items():
attrp[k] = v[subset][:, subset]
setattr(data, f"{attr}p", attrp)

# Subset .X
if data._X is not None:
try:
data._X = data.X[obs_subset, :]
if attr == "obs":
data._X = data.X[subset, :]
else:
data._X = data.X[:, subset]
except TypeError:
data._X = data.X[np.where(obs_subset)[0], :]
if attr == "obs":
data._X = data.X[np.where(subset)[0], :]
else:
data._X = data.X[:, np.where(subset)[0]]
# For some h5py versions, indexing arrays must have integer dtypes
# https://github.com/h5py/h5py/issues/1847

Expand All @@ -756,165 +774,69 @@ def func(x):

# Subset layers
for layer in data.layers:
data.layers[layer] = data.layers[layer][obs_subset, :]
if attr == "obs":
data.layers[layer] = data.layers[layer][subset, :]
else:
data.layers[layer] = data.layers[layer][:, subset]

# Subset raw
if data.raw is not None:
data.raw._X = data.raw.X[obs_subset, :]
data.raw._n_obs = data.raw.X.shape[0]
# Subset raw - only when subsetting obs
if attr == "obs" and data.raw is not None:
data.raw._X = data.raw.X[subset, :]

else:
# Subset .obs
data._obs = data.obs[obs_subset]
data._n_obs = data.obs.shape[0]
attrmap = getattr(data, f"{attr}map")

# Subset .obs/.var
setattr(data, f"_{attr}", df[subset])

# Subset .obsm
for k, v in data.obsm.items():
data.obsm[k] = v[obs_subset]
# Subset .obsm/.varm
for k, v in attrm.items():
attrm[k] = v[subset]
setattr(data, f"{attr}m", attrm)

# Subset .obsp
for k, v in data.obsp.items():
data.obsp[k] = v[obs_subset][:, obs_subset]
# Subset .obsp/.varp
for k, v in attrp.items():
attrp[k] = v[subset][:, subset]
setattr(data, f"{attr}p", attrp)

# filter_obs() for each modality
# _filter_attr() for each modality
for m, mod in data.mod.items():
obsmap = data.obsmap[m][obs_subset]
obsidx = obsmap > 0
orig_obs = mod.obs.copy()
filter_obs(mod, mod.obs_names[obsmap[obsidx] - 1])
data.mod[m]._remove_unused_categories(orig_obs, mod.obs, mod.uns)
maporder = np.argsort(obsmap[obsidx])
map_subset = attrmap[m][subset]
attridx = map_subset > 0
orig_attr = getattr(mod, attr).copy()
mod_names = getattr(mod, f"{attr}_names")
_filter_attr(mod, attr, mod_names[map_subset[attridx] - 1])
data.mod[m]._remove_unused_categories(orig_attr, getattr(mod, attr), mod.uns)
maporder = np.argsort(map_subset[attridx])
nobsmap = np.empty(maporder.size)
nobsmap[maporder] = np.arange(1, maporder.size + 1)
obsmap[obsidx] = nobsmap
data.obsmap[m] = obsmap
map_subset[attridx] = nobsmap
getattr(data, f"{attr}map")[m] = map_subset

return


# Utility functions: filtering variables


def filter_var(
def filter_obs(
data: Union[AnnData, MuData], var: Union[str, Sequence[str]], func: Optional[Callable] = None
):
) -> None:
"""
Filter variables (features, e.g. genes) in-place
using any column in .var or row in .X.
Filter observations (samples or cells) in-place
using any column in .obs or in .X.
Parameters
----------
data: AnnData or MuData
AnnData or MuData object
var: str or Sequence[str]
Column name in .var or row name in .X to be used for filtering.
Alternatively, var_names can be provided directly.
Column name in .obs or in .X to be used for filtering.
Alternatively, obs_names can be provided directly.
func
Function to apply to the variable used for filtering.
If the variable is of type boolean and func is an identity function,
the func argument can be omitted.
"""

if data.is_view:
raise ValueError(
"The provided adata is a view. In-place filtering does not operate on views."
)
if data.isbacked:
if isinstance(data, AnnData):
warnings.warn(
"AnnData object is backed. The requested subset of the matrix .X will be read into memory, and the object will not be backed anymore."
)
else:
warnings.warn(
"MuData object is backed. The requested subset of the .X matrices of its modalities will be read into memory, and the object will not be backed anymore."
)

if isinstance(var, str):
if var in data.var.columns:
if func is None:
if data.var[var].dtypes.name == "bool":

def func(x):
return x

else:
raise ValueError(f"Function has to be provided since {var} is not boolean")
var_subset = func(data.var[var].values)
elif var in data.obs_names:
var_subset = func(data.X[:, np.where(data.obs_names == var)[0]].reshape(-1))
else:
raise ValueError(
f"Column name from .var or one of the obs_names was expected but got {var}."
)
else:
if func is None:
var_subset = var if np.array(var).dtype == bool else data.var_names.isin(var)
else:
raise ValueError("When providing var_names directly, func has to be None.")

if isinstance(data, AnnData):
# Collect elements to subset
# NOTE: accessing them after subsetting .var
# will fail due to _validate_value()
varm = dict(data.varm)
varp = dict(data.varp)

# Subset .obs
data._var = data.var[var_subset]
data._n_vars = data.var.shape[0]

# Subset .obsm
for k, v in varm.items():
varm[k] = v[var_subset]
data.varm = varm

# Subset .obsp
for k, v in varp.items():
varp[k] = v[var_subset][:, var_subset]
data.varp = varp

# Subset .X
try:
data._X = data.X[:, var_subset]
except TypeError:
data._X = data.X[:, np.where(var_subset)[0]]
# For some h5py versions, indexing arrays must have integer dtypes
# https://github.com/h5py/h5py/issues/1847
if data.isbacked:
data.file.close()
data.filename = None

# Subset layers
for layer in data.layers:
data.layers[layer] = data.layers[layer][:, var_subset]

# NOTE: .raw is not subsetted

else:
# Subset .var
data._var = data.var[var_subset]
data._n_vars = data.var.shape[0]

# Subset .varm
for k, v in data.varm.items():
data.varm[k] = v[var_subset]

# Subset .varp
for k, v in data.varp.items():
data.varp[k] = v[var_subset][:, var_subset]

# filter_var() for each modality
for m, mod in data.mod.items():
varmap = data.varmap[m][var_subset]
varidx = varmap > 0
orig_var = mod.var.copy()
filter_var(mod, mod.var_names[varmap[varidx] - 1])
data.mod[m]._remove_unused_categories(orig_var, mod.var, mod.uns)
maporder = np.argsort(varmap[varidx])
nvarmap = np.empty(maporder.size)
nvarmap[maporder] = np.arange(1, maporder.size + 1)
varmap[varidx] = nvarmap
data.varmap[m] = varmap
_filter_attr(data, "obs", var, func)

return

Expand Down

0 comments on commit 1909f51

Please sign in to comment.