Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace existing thresholding with micromacro #328

Merged
merged 4 commits into from
Sep 6, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 67 additions & 84 deletions libmultilabel/linear/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,15 @@ def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_mat
def train_thresholding(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
) -> FlatModel:
"""Trains a linear model for multilabel data using a one-vs-rest strategy
and cross-validation to pick optimal decision thresholds for Macro-F1.
Outperforms train_1vsrest in most aspects at the cost of higher
time complexity.
See user guide for more details.
"""Trains a linear model for multi-label data using a one-vs-rest strategy
and cross-validation to pick decision thresholds optimizing the sum of Macro-F1 and Micro-F1.
Outperforms train_1vsrest in most aspects at the cost of higher time complexity
due to an internal cross-validation.

This method is the micromacro-freq approach from this CIKM 2023 paper:
`"On the Thresholding Strategy for Infrequent Labels in Multi-label Classification"
<https://www.csie.ntu.edu.tw/~cjlin/papers/thresholding/smooth_acm.pdf>`_
(see Section 4.3 and Supplementary D).

Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
Expand All @@ -162,7 +166,6 @@ def train_thresholding(
Returns:
A model which can be used in predict_values.
"""
# Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
x, options, bias = _prepare_options(x, options)

y = y.tocsc()
Expand All @@ -172,138 +175,118 @@ def train_thresholding(
thresholds = np.zeros(num_class)

if verbose:
logging.info(f"Training thresholding model on {num_class} labels")
for i in tqdm(range(num_class), disable=not verbose):
logging.info("Training thresholding model on %s labels", num_class)

num_positives = np.sum(y, 2)
label_order = np.flip(np.argsort(num_positives)).flat

# accumulated counts for micro
stats = {"tp": 0, "fp": 0, "fn": 0, "labels": 0}

for i in tqdm(label_order, disable=not verbose):
yi = y[:, i].toarray().reshape(-1)
w, t = _thresholding_one_label(2 * yi - 1, x, options)
w, t, stats = _micromacro_one_label(2 * yi - 1, x, options, stats)
weights[:, i] = w.ravel()
thresholds[i] = t

return FlatModel(name="thresholding", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)


def _thresholding_one_label(y: np.ndarray, x: sparse.csr_matrix, options: str) -> tuple[np.ndarray, float]:
"""Outer cross-validation for thresholding on a single label.
def _micromacro_one_label(
y: np.ndarray, x: sparse.csr_matrix, options: str, stats: dict
) -> tuple[np.ndarray, float, dict]:
"""Perform cross-validation to select the threshold for a label.

Args:
y (np.ndarray): A +1/-1 array with dimensions number of instances * 1.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
options (str): The option string passed to liblinear.
stats (dict): A dictionary containing information needed to calculate Micro-F1.
It includes the accumulated number of true positives, false positives, false
negatives, and the number of labels processed.

Returns:
tuple[np.ndarray, float]: tuple of the weights and threshold.
tuple[np.ndarray, float, dict]: the weights, threshold, and the updated stats for calculating
Micro-F1.
"""
fbr_list = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])

nr_fold = 3
thresholds = np.zeros(nr_fold)

l = y.shape[0]

perm = np.random.permutation(l)

f_list = np.zeros_like(fbr_list)

for fold in range(nr_fold):
mask = np.zeros_like(perm, dtype="?")
mask[np.arange(int(fold * l / nr_fold), int((fold + 1) * l / nr_fold))] = 1
val_idx = perm[mask]
train_idx = perm[mask != True]

scutfbr_w, scutfbr_b_list = _scutfbr(y[train_idx], x[train_idx], fbr_list, options)
wTx = (x[val_idx] * scutfbr_w).A1

for i in range(fbr_list.size):
F = _fmeasure(y[val_idx], 2 * (wTx > -scutfbr_b_list[i]) - 1)
f_list[i] += F

best_fbr = fbr_list[::-1][np.argmax(f_list[::-1])] # last largest
if np.max(f_list) == 0:
best_fbr = np.min(fbr_list)
tp_sum = 0
fp_sum = 0
fn_sum = 0
stats["labels"] += 1

# final model
w, b_list = _scutfbr(y, x, np.array([best_fbr]), options)

return w, b_list[0]


def _scutfbr(y: np.ndarray, x: sparse.csr_matrix, fbr_list: list[float], options: str) -> tuple[np.matrix, np.ndarray]:
"""Inner cross-validation for SCutfbr heuristic.

Args:
y (np.ndarray): A +1/-1 array with dimensions number of instances * 1.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
fbr_list (list[float]): list of fbr values.
options (str): The option string passed to liblinear.

Returns:
tuple[np.matrix, np.ndarray]: tuple of weights and threshold candidates.
"""

b_list = np.zeros_like(fbr_list)

nr_fold = 3
def micro_plus_macro(tp, fp, fn):
# Because the F-measure of other labels are constants and thus does not affect optimization,
# we ignore them when calculating macro-F.
macro = np.nan_to_num((2 * tp) / (2 * tp + fp + fn)) / stats["labels"]
micro = np.nan_to_num((2 * (tp + stats["tp"])) / (2 * (tp + stats["tp"]) + fp + fn + stats["fp"] + stats["fn"]))
return micro + macro

l = y.shape[0]

perm = np.random.permutation(l)

for fold in range(nr_fold):
mask = np.zeros_like(perm, dtype="?")
mask[np.arange(int(fold * l / nr_fold), int((fold + 1) * l / nr_fold))] = 1
mask[np.arange(int(fold * l / nr_fold), int((fold + 1) * l / nr_fold))] = True
val_idx = perm[mask]
train_idx = perm[mask != True]
train_idx = perm[np.logical_not(mask)]

w = _do_train(y[train_idx], x[train_idx], options)
wTx = (x[val_idx] * w).A1
scut_b = 0.0
start_F = _fmeasure(y[val_idx], 2 * (wTx > -scut_b) - 1)

# stableness to match the MATLAB implementation
sorted_wTx_index = np.argsort(wTx, kind="stable")
sorted_wTx = wTx[sorted_wTx_index]

# ignore warning for 0/0 when calculating F-measures
prev_settings = np.seterr("ignore")

tp = np.sum(y[val_idx] == 1)
fp = val_idx.size - tp
fn = 0
best_obj = micro_plus_macro(tp, fp, fn)
best_tp, best_fp, best_fn = tp, fp, fn
cut = -1
best_F = 2 * tp / (2 * tp + fp + fn)

y_val = y[val_idx]

# following MATLAB implementation to suppress NaNs
prev_settings = np.seterr("ignore")
for i in range(val_idx.size):
if y_val[sorted_wTx_index[i]] == -1:
fp -= 1
else:
tp -= 1
fn += 1

# There will be NaNs, but the behaviour is correct
F = 2 * tp / (2 * tp + fp + fn)
obj = micro_plus_macro(tp, fp, fn)

if F >= best_F:
best_F = F
if obj >= best_obj:
best_obj = obj
best_tp, best_fp, best_fn = tp, fp, fn
cut = i
np.seterr(**prev_settings)

if best_F > start_F:
if cut == -1: # i.e. all 1 in scut
scut_b = np.nextafter(-sorted_wTx[0], np.inf) # predict all 1
elif cut == val_idx.size - 1:
scut_b = np.nextafter(-sorted_wTx[-1], np.inf)
else:
scut_b = -(sorted_wTx[cut] + sorted_wTx[cut + 1]) / 2
if cut == -1: # i.e. all 1 in scut
thresholds[fold] = np.nextafter(sorted_wTx[0], -np.inf) # predict all 1
elif cut == val_idx.size - 1:
thresholds[fold] = np.nextafter(sorted_wTx[-1], np.inf)
else:
thresholds[fold] = (sorted_wTx[cut] + sorted_wTx[cut + 1]) / 2

F = _fmeasure(y_val, 2 * (wTx > -scut_b) - 1)
tp_sum += best_tp
fp_sum += best_fp
fn_sum += best_fn

for i in range(fbr_list.size):
if F > fbr_list[i]:
b_list[i] += scut_b
else:
b_list[i] -= np.max(wTx)
# In FlatModel.predict_values, the threshold is added to the decision value.
# Therefore, we need to make it negative here.
threshold = -thresholds.mean()
stats["tp"] += tp_sum
stats["fp"] += fp_sum
stats["fn"] += fn_sum

b_list = b_list / nr_fold
return _do_train(y, x, options), b_list
return _do_train(y, x, options), threshold, stats


def _do_train(y: np.ndarray, x: sparse.csr_matrix, options: str) -> np.matrix:
Expand Down