microsoft · jameslamb · Jan 18, 2021 · Jan 16, 2021 · Jan 16, 2021 · Jan 16, 2021
@@ -998,7 +998,11 @@ slice.lgb.Dataset <- function(dataset, idxset, ...) {
 #' \itemize{
 #'     \item \code{label}: label lightgbm learn from ;
 #'     \item \code{weight}: to do a weight rescale ;
-#'     \item \code{group}: group size ;
+#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
+#'         group rows together as ordered results from the same set of candidate results to be ranked.
+#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
+#'         that means that you have 6 groups, where the first 10 records are in the first group,
+#'         records 11-30 are in the second group, etc.}
 #'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
 #' }
 #'
@@ -1052,8 +1056,9 @@ getinfo.lgb.Dataset <- function(dataset, name, ...) {
 #'     \item{\code{init_score}: initial score is the base prediction lightgbm will boost from}
 #'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
 #'         group rows together as ordered results from the same set of candidate results to be ranked.
-#'         For example, if you have a 1000-row dataset that contains 250 4-document query results,
-#'         set this to \code{rep(4L, 250L)}}
+#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
+#'         that means that you have 6 groups, where the first 10 records are in the first group,
+#'         records 11-30 are in the second group, etc.}
 #' }
 #'
 #' @examples

@@ -762,7 +762,7 @@ Dataset Parameters
 
    -  **Note**: works only in case of loading data directly from file
 
-   -  **Note**: data should be grouped by query\_id
+   -  **Note**: data should be grouped by query\_id, for more information, see `Query Data <#query-data>`__
 
    -  **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0 and query\_id is column\_1, the correct parameter is ``query=0``
 
@@ -1231,6 +1231,7 @@ Query Data
 ~~~~~~~~~~
 
 For learning to rank, it needs query information for training data.
+
 LightGBM uses an additional file to store query data, like the following:
 
 ::
@@ -1240,7 +1241,13 @@ LightGBM uses an additional file to store query data, like the following:
     67
     ...
 
-It means first ``27`` lines samples belong to one query and next ``18`` lines belong to another, and so on.
+For wrapper libraries like in Python and R, this information can also be provided as an array-like via the Dataset parameter ``group``.
+
+::
+
+   [27, 18, 67, ...]
+
+For example, if you have a 112-document dataset with ``group = [27, 18, 67]``, that means that you have 3 groups, where the first 27 records are in the first group, records 28-45 are in the second group, and records 46-112 are in the third group.
 
 **Note**: data should be ordered by the query.
 

@@ -671,7 +671,7 @@ struct Config {
   // desc = use number for index, e.g. ``query=0`` means column\_0 is the query id
   // desc = add a prefix ``name:`` for column name, e.g. ``query=name:query_id``
   // desc = **Note**: works only in case of loading data directly from file
-  // desc = **Note**: data should be grouped by query\_id
+  // desc = **Note**: data should be grouped by query\_id, for more information, see `Query Data <#query-data>`__
   // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0 and query\_id is column\_1, the correct parameter is ``query=0``
   std::string group_column = "";
 

@@ -941,7 +941,10 @@ def __init__(self, data, label=None, reference=None,
         weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
             Weight for each instance.
         group : list, numpy 1-D array, pandas Series or None, optional (default=None)
-            Group/query size for Dataset.
+            Group/query data.
+            Only used in the learning-to-rank task.
+            sum(group) = n_samples.
+            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, etc.
         init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
             Init score for Dataset.
         silent : bool, optional (default=False)
@@ -1356,7 +1359,10 @@ def create_valid(self, data, label=None, weight=None, group=None,
         weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
             Weight for each instance.
         group : list, numpy 1-D array, pandas Series or None, optional (default=None)
-            Group/query size for Dataset.
+            Group/query data.
+            Only used in the learning-to-rank task.
+            sum(group) = n_samples.
+            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, etc.
         init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
             Init score for Dataset.
         silent : bool, optional (default=False)
@@ -1715,7 +1721,10 @@ def set_group(self, group):
         Parameters
         ----------
         group : list, numpy 1-D array, pandas Series or None
-            Group size of each group.
+            Group/query data.
+            Only used in the learning-to-rank task.
+            sum(group) = n_samples.
+            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, etc.
 
         Returns
         -------
@@ -1830,7 +1839,10 @@ def get_group(self):
         Returns
         -------
         group : numpy array or None
-            Group size of each group.
+            Group/query data.
+            Only used in the learning-to-rank task.
+            sum(group) = n_samples.
+            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, etc.
         """
         if self.group is None:
             self.group = self.get_field('group')

@@ -36,7 +36,10 @@ def __init__(self, func):
                 y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
                     The predicted values.
                 group : array-like
-                    Group/query data, used for ranking task.
+                    Group/query data.
+                    Only used in the learning-to-rank task.
+                    sum(group) = n_samples.
+                    For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, etc.
                 grad : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
                     The value of the first order derivative (gradient) for each sample point.
                 hess : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
@@ -122,7 +125,10 @@ def __init__(self, func):
                 weight : array-like of shape = [n_samples]
                     The weight of samples.
                 group : array-like
-                    Group/query data, used for ranking task.
+                    Group/query data.
+                    Only used in the learning-to-rank task.
+                    sum(group) = n_samples.
+                    For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, etc.
                 eval_name : string
                     The name of evaluation function (without whitespaces).
                 eval_result : float
@@ -266,7 +272,10 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
             y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
                 The predicted values.
             group : array-like
-                Group/query data, used for ranking task.
+                Group/query data.
+                Only used in the learning-to-rank task.
+                sum(group) = n_samples.
+                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, etc.
             grad : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
                 The value of the first order derivative (gradient) for each sample point.
             hess : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
@@ -384,7 +393,10 @@ def fit(self, X, y,
         init_score : array-like of shape = [n_samples] or None, optional (default=None)
             Init score of training data.
         group : array-like or None, optional (default=None)
-            Group data of training data.
+            Group/query data.
+            Only used in the learning-to-rank task.
+            sum(group) = n_samples.
+            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, etc.
         eval_set : list or None, optional (default=None)
             A list of (X, y) tuple pairs to use as validation sets.
         eval_names : list of strings or None, optional (default=None)
@@ -460,7 +472,10 @@ def fit(self, X, y,
             weight : array-like of shape = [n_samples]
                 The weight of samples.
             group : array-like
-                Group/query data, used for ranking task.
+                Group/query data.
+                Only used in the learning-to-rank task.
+                sum(group) = n_samples.
+                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, etc.
             eval_name : string
                 The name of evaluation function (without whitespaces).
             eval_result : float