diff --git a/libmultilabel/linear/linear.py b/libmultilabel/linear/linear.py
index 35485251..f9b4fb84 100644
--- a/libmultilabel/linear/linear.py
+++ b/libmultilabel/linear/linear.py
@@ -147,11 +147,15 @@ def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_mat
 def train_thresholding(
     y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
 ) -> FlatModel:
-    """Trains a linear model for multilabel data using a one-vs-rest strategy
-    and cross-validation to pick optimal decision thresholds for Macro-F1.
-    Outperforms train_1vsrest in most aspects at the cost of higher
-    time complexity.
-    See user guide for more details.
+    """Trains a linear model for multi-label data using a one-vs-rest strategy
+    and cross-validation to pick decision thresholds optimizing the sum of Macro-F1 and Micro-F1.
+    Outperforms train_1vsrest in most aspects at the cost of higher time complexity
+    due to an internal cross-validation.
+
+    This method is the micromacro-freq approach from the following CIKM 2023 paper:
+    `"On the Thresholding Strategy for Infrequent Labels in Multi-label Classification"
+    `_
+    (see Section 4.3 and Supplementary D).
 
     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
@@ -162,7 +166,6 @@ def train_thresholding(
     Returns:
         A model which can be used in predict_values.
     """
-    # Follows the MATLAB implementation at https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/multilabel/
     x, options, bias = _prepare_options(x, options)
 
     y = y.tocsc()
@@ -172,76 +175,55 @@ def train_thresholding(
     thresholds = np.zeros(num_class)
 
     if verbose:
-        logging.info(f"Training thresholding model on {num_class} labels")
-    for i in tqdm(range(num_class), disable=not verbose):
+        logging.info("Training thresholding model on %s labels", num_class)
+
+    num_positives = np.sum(y, axis=0)
+    label_order = np.flip(np.argsort(num_positives)).flat
+
+    # tp/fp/fn accumulated over already-processed labels for Micro-F1
+    stats = {"tp": 0, "fp": 0, "fn": 0, "labels": 0}
+
+    for i in tqdm(label_order, disable=not verbose):
         yi = y[:, i].toarray().reshape(-1)
-        w, t = _thresholding_one_label(2 * yi - 1, x, options)
+        w, t, stats = _micromacro_one_label(2 * yi - 1, x, options, stats)
         weights[:, i] = w.ravel()
         thresholds[i] = t
 
     return FlatModel(name="thresholding", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
 
 
-def _thresholding_one_label(y: np.ndarray, x: sparse.csr_matrix, options: str) -> tuple[np.ndarray, float]:
-    """Outer cross-validation for thresholding on a single label.
+def _micromacro_one_label(
+    y: np.ndarray, x: sparse.csr_matrix, options: str, stats: dict
+) -> tuple[np.ndarray, float, dict]:
+    """Perform cross-validation to select the threshold for a label.
 
     Args:
         y (np.ndarray): A +1/-1 array with dimensions number of instances * 1.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
         options (str): The option string passed to liblinear.
+        stats (dict): A dictionary containing information needed to calculate Micro-F1.
+            It includes the accumulated number of true positives, false positives, false
+            negatives, and the number of labels processed.
 
     Returns:
-        tuple[np.ndarray, float]: tuple of the weights and threshold.
+        tuple[np.ndarray, float, dict]: the weights, threshold, and the updated stats for calculating
+            Micro-F1.
""" - fbr_list = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) nr_fold = 3 + thresholds = np.zeros(nr_fold) - l = y.shape[0] - - perm = np.random.permutation(l) - - f_list = np.zeros_like(fbr_list) - - for fold in range(nr_fold): - mask = np.zeros_like(perm, dtype="?") - mask[np.arange(int(fold * l / nr_fold), int((fold + 1) * l / nr_fold))] = 1 - val_idx = perm[mask] - train_idx = perm[mask != True] - - scutfbr_w, scutfbr_b_list = _scutfbr(y[train_idx], x[train_idx], fbr_list, options) - wTx = (x[val_idx] * scutfbr_w).A1 - - for i in range(fbr_list.size): - F = _fmeasure(y[val_idx], 2 * (wTx > -scutfbr_b_list[i]) - 1) - f_list[i] += F - - best_fbr = fbr_list[::-1][np.argmax(f_list[::-1])] # last largest - if np.max(f_list) == 0: - best_fbr = np.min(fbr_list) + tp_sum = 0 + fp_sum = 0 + fn_sum = 0 + stats["labels"] += 1 - # final model - w, b_list = _scutfbr(y, x, np.array([best_fbr]), options) - - return w, b_list[0] - - -def _scutfbr(y: np.ndarray, x: sparse.csr_matrix, fbr_list: list[float], options: str) -> tuple[np.matrix, np.ndarray]: - """Inner cross-validation for SCutfbr heuristic. - - Args: - y (np.ndarray): A +1/-1 array with dimensions number of instances * 1. - x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features. - fbr_list (list[float]): list of fbr values. - options (str): The option string passed to liblinear. - - Returns: - tuple[np.matrix, np.ndarray]: tuple of weights and threshold candidates. - """ - - b_list = np.zeros_like(fbr_list) - - nr_fold = 3 + def micro_plus_macro(tp, fp, fn): + # Because the F-measure of other labels are constants and thus does not affect optimization, + # we ignore them when calculating macro-F. + macro = np.nan_to_num((2 * tp) / (2 * tp + fp + fn)) / stats["labels"] + micro = np.nan_to_num((2 * (tp + stats["tp"])) / (2 * (tp + stats["tp"]) + fp + fn + stats["fp"] + stats["fn"])) + return micro + macro l = y.shape[0] @@ -249,28 +231,28 @@ def _scutfbr(y: np.ndarray, x: sparse.csr_matrix, fbr_list: list[float], options for fold in range(nr_fold): mask = np.zeros_like(perm, dtype="?") - mask[np.arange(int(fold * l / nr_fold), int((fold + 1) * l / nr_fold))] = 1 + mask[np.arange(int(fold * l / nr_fold), int((fold + 1) * l / nr_fold))] = True val_idx = perm[mask] - train_idx = perm[mask != True] + train_idx = perm[np.logical_not(mask)] w = _do_train(y[train_idx], x[train_idx], options) wTx = (x[val_idx] * w).A1 - scut_b = 0.0 - start_F = _fmeasure(y[val_idx], 2 * (wTx > -scut_b) - 1) - # stableness to match the MATLAB implementation sorted_wTx_index = np.argsort(wTx, kind="stable") sorted_wTx = wTx[sorted_wTx_index] + # ignore warning for 0/0 when calculating F-measures + prev_settings = np.seterr("ignore") + tp = np.sum(y[val_idx] == 1) fp = val_idx.size - tp fn = 0 + best_obj = micro_plus_macro(tp, fp, fn) + best_tp, best_fp, best_fn = tp, fp, fn cut = -1 - best_F = 2 * tp / (2 * tp + fp + fn) + y_val = y[val_idx] - # following MATLAB implementation to suppress NaNs - prev_settings = np.seterr("ignore") for i in range(val_idx.size): if y_val[sorted_wTx_index[i]] == -1: fp -= 1 @@ -278,32 +260,33 @@ def _scutfbr(y: np.ndarray, x: sparse.csr_matrix, fbr_list: list[float], options tp -= 1 fn += 1 - # There will be NaNs, but the behaviour is correct - F = 2 * tp / (2 * tp + fp + fn) + obj = micro_plus_macro(tp, fp, fn) - if F >= best_F: - best_F = F + if obj >= best_obj: + best_obj = obj + best_tp, best_fp, best_fn = tp, fp, fn cut = i np.seterr(**prev_settings) - if best_F > start_F: - if cut == -1: # i.e. 
-                scut_b = np.nextafter(-sorted_wTx[0], np.inf)  # predict all 1
-            elif cut == val_idx.size - 1:
-                scut_b = np.nextafter(-sorted_wTx[-1], np.inf)
-            else:
-                scut_b = -(sorted_wTx[cut] + sorted_wTx[cut + 1]) / 2
+        if cut == -1:  # i.e. the best cut predicts every instance as positive
+            thresholds[fold] = np.nextafter(sorted_wTx[0], -np.inf)  # predict all 1
+        elif cut == val_idx.size - 1:
+            thresholds[fold] = np.nextafter(sorted_wTx[-1], np.inf)
+        else:
+            thresholds[fold] = (sorted_wTx[cut] + sorted_wTx[cut + 1]) / 2
 
-        F = _fmeasure(y_val, 2 * (wTx > -scut_b) - 1)
+        tp_sum += best_tp
+        fp_sum += best_fp
+        fn_sum += best_fn
 
-        for i in range(fbr_list.size):
-            if F > fbr_list[i]:
-                b_list[i] += scut_b
-            else:
-                b_list[i] -= np.max(wTx)
+    # In FlatModel.predict_values, the threshold is added to the decision value,
+    # so we store its negation here.
+    threshold = -thresholds.mean()
+    stats["tp"] += tp_sum
+    stats["fp"] += fp_sum
+    stats["fn"] += fn_sum
 
-    b_list = b_list / nr_fold
-    return _do_train(y, x, options), b_list
+    return _do_train(y, x, options), threshold, stats
 
 
 def _do_train(y: np.ndarray, x: sparse.csr_matrix, options: str) -> np.matrix:
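For reviewers, a minimal sketch of how the updated `train_thresholding` is called. The synthetic matrices, the random seed, and the `-s 2` liblinear solver option are illustrative assumptions, not part of this diff; `train_thresholding` and `predict_values` are the functions from this module, with the signatures shown above.

```python
# Illustrative sketch only; assumes libmultilabel.linear re-exports
# train_thresholding and predict_values from linear.py.
import numpy as np
import scipy.sparse as sparse

from libmultilabel.linear import train_thresholding, predict_values

rng = np.random.default_rng(0)  # hypothetical toy data, fixed seed
num_instances, num_features, num_classes = 200, 50, 5

# Random sparse feature matrix and 0/1 label matrix.
x = sparse.csr_matrix(rng.random((num_instances, num_features)))
y = sparse.csr_matrix(rng.integers(0, 2, size=(num_instances, num_classes)))

# Labels are processed from most to least frequent; each label's threshold
# is chosen by 3-fold cross-validation to maximize Micro-F1 + Macro-F1.
model = train_thresholding(y, x, options="-s 2")

# predict_values adds the stored (negated) per-label thresholds to the
# decision values, so a positive score means the label is predicted.
scores = predict_values(model, x)
predictions = (scores > 0).astype(int)
```

Storing the negated mean of the per-fold cuts lets `predict_values` keep its usual "score > 0 predicts the label" convention without any thresholding-specific branch.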