Code to reproduce the results in "Data Augmentation for Compositional Data: Advancing Predictive Models of the Microbiome".
To run the augmentations directly on your own data, copy the three functions below; they require only NumPy:

import numpy as np

def aitchison_mixup(X_train, y_train, factor=10, weight=0.5):
    """Applies Aitchison Mixup to some training data.

    Args:
        X_train: model inputs, one composition per row. Entries must be
            strictly positive (replace zeros with a pseudo-count first).
        y_train: model outputs (class labels).
        factor: the enlargement factor by which to augment X and y.
            If X, y have n rows, the augmented data will have roughly
            factor * n rows.
        weight: the fraction of the total sample weight assigned to the
            augmented data. For example, if weight=0.5, all the augmented
            data together weighs the same as all the original training
            data, so each synthetic sample is downweighted relative to
            each original sample.

    Returns:
        X_aug: model inputs, containing both the original training data
            and the samples generated by data augmentation.
        y_aug: model outputs, containing both the original training data
            and the samples generated by data augmentation.
        w_aug: sample weights for each row of X_aug and y_aug. Original
            training samples have weight 1; synthetic samples typically
            have lower weight.
    """
    X = X_train.copy()
    y = y_train.copy()
    w = np.ones(X_train.shape[0])
    for val in np.unique(y_train):
        idxs = y_train == val
        X_temp = X_train[idxs, :]
        n = X_temp.shape[0]
        n_aug = int(factor * n) - n
        lam = np.random.rand(n_aug).reshape([-1, 1])
        idx1 = np.random.choice(n, size=n_aug)
        idx2 = np.random.choice(n, size=n_aug)
        # Take the convex combination in Aitchison geometry: a weighted
        # geometric mean of the two parents (powering + perturbation),
        # re-closed so that each row sums to 1.
        X_new = X_temp[idx1, :] ** lam * X_temp[idx2, :] ** (1 - lam)
        X_new = X_new / X_new.sum(axis=1, keepdims=True)
        X = np.concatenate([X, X_new], axis=0)
        y = np.concatenate([y, np.repeat(val, n_aug)])
        # Each class's synthetic samples get total weight
        # weight / (1 - weight) * n, so the augmented data overall weighs
        # weight / (1 - weight) times the original data.
        w = np.concatenate([w, np.repeat(weight / (1 - weight) * n / n_aug, n_aug)])
    # Shuffle original and synthetic samples together
    n = X.shape[0]
    idx = np.arange(n)
    np.random.shuffle(idx)
    return X[idx], y[idx], w[idx]
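
As a quick sanity check (not part of the original code; the toy data is made up), the synthetic samples produced this way remain valid compositions, and the returned weights downweight them relative to the originals:

np.random.seed(0)
X = np.random.rand(6, 4) + 0.01           # toy data: 6 samples, 4 parts
X = X / X.sum(axis=1, keepdims=True)      # close each row to sum to 1
y = np.array([0, 0, 0, 1, 1, 1])

X_aug, y_aug, w_aug = aitchison_mixup(X, y, factor=5, weight=0.5)
assert np.all(X_aug > 0)                      # still strictly positive
assert np.allclose(X_aug.sum(axis=1), 1.0)    # still on the simplex
print(X_aug.shape, np.unique(w_aug))          # (30, 4) [0.25 1.  ]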

def compositional_feature_dropout(X_train, y_train, factor=10, weight=0.5):
    """Applies Compositional Feature Dropout to some training data.

    Args:
        X_train: model inputs, one composition per row.
        y_train: model outputs (class labels).
        factor: the enlargement factor by which to augment X and y.
            If X, y have n rows, the augmented data will have roughly
            factor * n rows.
        weight: the fraction of the total sample weight assigned to the
            augmented data. For example, if weight=0.5, all the augmented
            data together weighs the same as all the original training
            data, so each synthetic sample is downweighted relative to
            each original sample.

    Returns:
        X_aug: model inputs, containing both the original training data
            and the samples generated by data augmentation.
        y_aug: model outputs, containing both the original training data
            and the samples generated by data augmentation.
        w_aug: sample weights for each row of X_aug and y_aug. Original
            training samples have weight 1; synthetic samples typically
            have lower weight.
    """
    X = X_train.copy()
    y = y_train.copy()
    w = np.ones(X_train.shape[0])
    for val in np.unique(y_train):
        idxs = y_train == val
        X_temp = X_train[idxs, :]
        n = X_temp.shape[0]
        n_aug = int(factor * n) - n
        idx = np.random.choice(n, size=n_aug)
        # Each synthetic sample copies one parent and then zeroes out each
        # of its features independently with a per-sample dropout rate p,
        # mimicking the technical zeros common in microbiome data.
        p = np.random.rand(n_aug)
        mask = np.random.binomial(1, p, [X_temp.shape[1], n_aug]).T
        X_new = X_temp[idx, :].copy()
        X_new[mask.astype('bool')] = 0
        X = np.concatenate([X, X_new], axis=0)
        y = np.concatenate([y, np.repeat(val, n_aug)])
        w = np.concatenate([w, np.repeat(weight / (1 - weight) * n / n_aug, n_aug)])
    # Shuffle original and synthetic samples together
    n = X.shape[0]
    idx = np.arange(n)
    np.random.shuffle(idx)
    return X[idx], y[idx], w[idx]
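
A small illustration (toy values, not from the paper): each synthetic row is a copy of a training row with a random subset of parts zeroed out, mimicking the sparsity pattern of under-sampled sequencing data:

np.random.seed(1)
X = np.array([[0.4, 0.3, 0.2, 0.1],
              [0.1, 0.2, 0.3, 0.4],
              [0.25, 0.25, 0.25, 0.25],
              [0.7, 0.1, 0.1, 0.1]])
y = np.array([0, 0, 1, 1])

X_aug, y_aug, w_aug = compositional_feature_dropout(X, y, factor=4, weight=0.5)
synthetic = w_aug < 1                     # originals keep weight 1
print((X_aug[synthetic] == 0).mean())     # fraction of dropped parts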

def compositional_cutmix(X_train, y_train, factor=10, weight=0.5):
    """Applies Compositional CutMix to some training data.

    Args:
        X_train: model inputs, one composition per row.
        y_train: model outputs (class labels).
        factor: the enlargement factor by which to augment X and y.
            If X, y have n rows, the augmented data will have roughly
            factor * n rows.
        weight: the fraction of the total sample weight assigned to the
            augmented data. For example, if weight=0.5, all the augmented
            data together weighs the same as all the original training
            data, so each synthetic sample is downweighted relative to
            each original sample.

    Returns:
        X_aug: model inputs, containing both the original training data
            and the samples generated by data augmentation.
        y_aug: model outputs, containing both the original training data
            and the samples generated by data augmentation.
        w_aug: sample weights for each row of X_aug and y_aug. Original
            training samples have weight 1; synthetic samples typically
            have lower weight.
    """
    X = X_train.copy()
    y = y_train.copy()
    w = np.ones(X_train.shape[0])
    for val in np.unique(y_train):
        idxs = y_train == val
        X_temp = X_train[idxs, :]
        n = X_temp.shape[0]
        n_aug = int(factor * n) - n
        idx1 = np.random.choice(n, size=n_aug)
        idx2 = np.random.choice(n, size=n_aug)
        # Each synthetic sample draws every feature from one of two
        # parents (from parent 1 with a per-sample probability p), then
        # is re-closed so that the row sums to 1.
        p = np.random.rand(n_aug)
        mask = np.random.binomial(1, p, [X_temp.shape[1], n_aug]).T
        X_new = mask * X_temp[idx1, :] + (1 - mask) * X_temp[idx2, :]
        X_new = X_new / X_new.sum(axis=1, keepdims=True)
        X = np.concatenate([X, X_new], axis=0)
        y = np.concatenate([y, np.repeat(val, n_aug)])
        w = np.concatenate([w, np.repeat(weight / (1 - weight) * n / n_aug, n_aug)])
    # Shuffle original and synthetic samples together
    n = X.shape[0]
    idx = np.arange(n)
    np.random.shuffle(idx)
    return X[idx], y[idx], w[idx]
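
All three functions share the same interface, so their output plugs into any estimator that accepts per-sample weights. A hypothetical end-to-end sketch with scikit-learn (illustrative data and model choice, not the paper's experimental setup):

from sklearn.linear_model import LogisticRegression

np.random.seed(42)
X = np.random.rand(40, 20) + 0.01         # stand-in for 40 samples x 20 taxa
X = X / X.sum(axis=1, keepdims=True)
y = np.random.randint(0, 2, size=40)

X_aug, y_aug, w_aug = compositional_cutmix(X, y, factor=10, weight=0.5)
model = LogisticRegression(max_iter=1000)
model.fit(X_aug, y_aug, sample_weight=w_aug)
print(model.score(X, y))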