Optional normalization for the ranknet loss. #11272

Merged (8 commits) on Feb 25, 2025
16 changes: 16 additions & 0 deletions R-package/R/xgb.train.R
@@ -769,6 +769,21 @@ xgb.train <- function(params = xgb.params(), data, nrounds, evals = list(),
#' Whether to normalize the leaf value by lambda gradient. This can sometimes stagnate the training progress.
#'
#' Version added: 2.1.0
#'
#' @param lambdarank_score_normalization
#'
#' Whether to normalize the delta metric by the difference of prediction scores. With
#' pairwise ranking, the gradient of each pair can be normalized by the difference between
#' the two prediction scores, which reduces the influence of pairs that already have a
#' large difference in ranking scores. This acts as a form of regularization that can
#' reduce bias and help prevent overfitting, but, like other regularization techniques,
#' it can sometimes stagnate the training progress or prevent it from converging.
#'
#' There was no such normalization before 2.0. In 2.0 and later versions it is applied
#' by default, and starting with 3.0 it is an option that users can disable.
#'
#' Version added: 3.0.0
#'
#' @param lambdarank_unbiased (for learning to rank (`"rank:ndcg"`, `"rank:map"`, `"rank:pairwise"`)) (default = `FALSE`)
#' Specify whether we need to debias the input click data.
#' @param lambdarank_bias_norm (for learning to rank (`"rank:ndcg"`, `"rank:map"`, `"rank:pairwise"`)) (default = 2.0)
@@ -833,6 +848,7 @@ xgb.params <- function(
lambdarank_pair_method = NULL,
lambdarank_num_pair_per_sample = NULL,
lambdarank_normalization = NULL,
lambdarank_score_normalization = NULL,
lambdarank_unbiased = NULL,
lambdarank_bias_norm = NULL,
ndcg_exp_gain = NULL
13 changes: 13 additions & 0 deletions R-package/man/xgb.params.Rd

Some generated files are not rendered by default.

14 changes: 14 additions & 0 deletions doc/parameter.rst
@@ -540,6 +540,20 @@ These are parameters specific to learning to rank task. See :doc:`Learning to Ra

Whether to normalize the leaf value by lambda gradient. This can sometimes stagnate the training progress.

* ``lambdarank_score_normalization`` [default = ``true``]

.. versionadded:: 3.0.0

Whether to normalize the delta metric by the difference of prediction scores. With
pairwise ranking, the gradient of each pair can be normalized by the difference between
the two prediction scores, which reduces the influence of pairs that already have a
large difference in ranking scores. This acts as a form of regularization that can
reduce bias and help prevent overfitting, but, like other regularization techniques, it
can sometimes stagnate the training progress or prevent it from converging.

There was no such normalization before 2.0. In 2.0 and later versions it is applied by
default, and starting with 3.0 it is an option that users can disable.

* ``lambdarank_unbiased`` [default = ``false``]

Specify whether we need to debias the input click data.
28 changes: 28 additions & 0 deletions doc/tutorials/learning_to_rank.rst
@@ -186,6 +186,34 @@ For a longer explanation, assuming the pairwise ranking method is used, we calcu

However, it's possible that a distributed framework shuffles the data during the map-reduce phase and splits every query group across multiple workers. In that case, the performance would be disastrous. As a result, whether a sorted groupby is needed depends on the data and the framework.
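
For illustration, below is a minimal sketch of such a sorted groupby with pandas before constructing the ranker inputs; the toy data and column names (``qid``, ``label``, ``f0``, ``f1``) are assumptions for illustration only, not part of any dataset used in this tutorial:

.. code-block:: python

    import pandas as pd
    import xgboost as xgb

    # One row per document: a query id, a relevance label, and two features.
    df = pd.DataFrame(
        {
            "qid": [1, 2, 1, 2, 1, 2],
            "label": [0, 1, 2, 0, 1, 2],
            "f0": [0.1, 0.4, 0.3, 0.2, 0.8, 0.5],
            "f1": [1.0, 0.2, 0.7, 0.9, 0.3, 0.6],
        }
    )

    # Sorting makes rows that belong to the same query contiguous.
    df = df.sort_values("qid", kind="stable")

    ranker = xgb.XGBRanker(objective="rank:ndcg", n_estimators=2)
    ranker.fit(df[["f0", "f1"]], df["label"], qid=df["qid"])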

**********************************
Comparing Results with Version 1.7
**********************************

The learning to rank implementation was significantly updated in 2.0 with new hyper-parameters and training strategies. To obtain results similar to the 1.7 :py:class:`xgboost.XGBRanker`, the following parameters should be used:

.. code-block:: python

    params = {
        # 1.7 only supports sampling, while 2.0 and later use top-k as the default.
        # See above sections for the trade-off.
        "lambdarank_pair_method": "mean",
        # Normalization was added in 2.0.
        "lambdarank_normalization": False,
        # 1.7 uses the ranknet loss, while later versions use the NDCG-weighted loss.
        "objective": "rank:pairwise",
        # 1.7 doesn't have this normalization.
        "lambdarank_score_normalization": False,
        "base_score": 0.5,
        # The default tree method has been changed from approx to hist.
        "tree_method": "approx",
        # The default for the `mean` pair method is one pair per sample, which is also
        # the default in 1.7. You can leave it unset.
        "lambdarank_num_pair_per_sample": 1,
    }

The results will still differ due to the change of random seed, but the overall training strategy is the same for ``rank:pairwise``.
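
To make the effect of ``lambdarank_score_normalization`` more concrete, below is a rough sketch of how the per-pair delta can be damped by the score difference. The helper name, the smoothing constant ``eps``, and the exact formula are simplifications for illustration, not the verbatim XGBoost implementation:

.. code-block:: python

    import math

    def pair_delta(s_high, s_low, delta_metric, normalize=True, eps=0.01):
        """Sketch of weighting a single (high, low) pair in the ranknet-style loss."""
        # RankNet-style gradient magnitude of the logistic loss on the score difference.
        grad = 1.0 / (1.0 + math.exp(s_high - s_low))
        if normalize:
            # Pairs whose predictions are already far apart receive a smaller delta,
            # reducing their influence on the update.
            delta_metric = delta_metric / (eps + abs(s_high - s_low))
        return grad * delta_metric

    # The already well-separated pair (2.0, -1.5) is damped much more strongly
    # than the barely separated pair (0.1, 0.0).
    print(pair_delta(2.0, -1.5, 1.0), pair_delta(2.0, -1.5, 1.0, normalize=False))
    print(pair_delta(0.1, 0.0, 1.0), pair_delta(0.1, 0.0, 1.0, normalize=False))

Disabling the normalization (``lambdarank_score_normalization=False``) keeps the raw delta, which matches the pre-2.0 behaviour described above.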

*******************
Reproducible Result
*******************
27 changes: 27 additions & 0 deletions python-package/xgboost/testing/ranking.py
@@ -118,3 +118,30 @@ def run_normalization(device: str) -> None:
    ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
    e1 = ltr.evals_result()
    assert e1["validation_0"]["ndcg@32"][-1] > e0["validation_0"]["ndcg@32"][-1]


def run_score_normalization(device: str, objective: str) -> None:
    """Test normalization by score differences."""
    if objective == "rank:map":
        # Binary relevance
        X, y, qid, _ = tm.make_ltr(4096, 4, 64, max_rel=1)
    else:
        X, y, qid, _ = tm.make_ltr(4096, 4, 64, 3)
    ltr = xgb.XGBRanker(objective=objective, n_estimators=4, device=device)
    ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
    e0 = ltr.evals_result()

    ltr = xgb.XGBRanker(
        objective="rank:pairwise",
        n_estimators=4,
        device=device,
        lambdarank_score_normalization=False,
    )
    ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid])
    e1 = ltr.evals_result()

    m0, m1 = (
        list(e0["validation_0"].values())[-1][-1],
        list(e1["validation_0"].values())[-1][-1],
    )
    assert m0 != m1
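

# Sketch only: one way the helper above might be parametrized from a test module.
# The wrapper name, the objective list, and the fixed "cpu" device are assumptions
# for illustration, not part of this change.
import pytest


@pytest.mark.parametrize("objective", ["rank:ndcg", "rank:map", "rank:pairwise"])
def test_score_normalization_sketch(objective: str) -> None:
    run_score_normalization("cpu", objective)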
5 changes: 5 additions & 0 deletions src/common/ranking_utils.h
@@ -79,6 +79,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
// unbiased
bool lambdarank_unbiased{false};
bool lambdarank_normalization{true};
bool lambdarank_score_normalization{true};
double lambdarank_bias_norm{1.0};
// ndcg
bool ndcg_exp_gain{true};
@@ -88,6 +89,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
lambdarank_num_pair_per_sample == that.lambdarank_num_pair_per_sample &&
lambdarank_unbiased == that.lambdarank_unbiased &&
lambdarank_normalization == that.lambdarank_normalization &&
lambdarank_score_normalization == that.lambdarank_score_normalization &&
lambdarank_bias_norm == that.lambdarank_bias_norm && ndcg_exp_gain == that.ndcg_exp_gain;
}
bool operator!=(LambdaRankParam const& that) const { return !(*this == that); }
@@ -139,6 +141,9 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
DMLC_DECLARE_FIELD(lambdarank_normalization)
.set_default(true)
.describe("Whether to normalize the leaf value for lambda rank.");
DMLC_DECLARE_FIELD(lambdarank_score_normalization)
.set_default(true)
.describe("Whether to normalize the delta by prediction score difference.");
DMLC_DECLARE_FIELD(lambdarank_bias_norm)
.set_default(1.0)
.set_lower_bound(0.0)
130 changes: 81 additions & 49 deletions src/objective/lambdarank_obj.cc
@@ -1,40 +1,40 @@
/**
* Copyright 2023-2024, XGBoost contributors
* Copyright 2023-2025, XGBoost contributors
*/
#include "lambdarank_obj.h"

#include <dmlc/registry.h> // for DMLC_REGISTRY_FILE_TAG

#include <algorithm> // for transform, copy, fill_n, min, max
#include <cmath> // for pow, log2
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <map> // for operator!=
#include <memory> // for shared_ptr, __shared_ptr_access, allocator
#include <ostream> // for operator<<, basic_ostream
#include <string> // for char_traits, operator<, basic_string, string
#include <tuple> // for apply, make_tuple
#include <type_traits> // for is_floating_point
#include <utility> // for pair, swap
#include <vector> // for vector

#include "../common/error_msg.h" // for GroupWeight, LabelScoreSize
#include "../common/linalg_op.h" // for begin, cbegin, cend
#include "../common/optional_weight.h" // for MakeOptionalWeights, OptionalWeights
#include "../common/ranking_utils.h" // for RankingCache, LambdaRankParam, MAPCache, NDCGC...
#include "../common/threading_utils.h" // for ParallelFor, Sched
#include "init_estimation.h" // for FitIntercept
#include "xgboost/base.h" // for bst_group_t, GradientPair, kRtEps, GradientPai...
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/json.h" // for Json, get, Value, ToJson, F32Array, FromJson, IsA
#include "xgboost/linalg.h" // for Vector, Range, TensorView, VectorView, All
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_LE, CHE...
#include "xgboost/objective.h" // for ObjFunctionReg, XGBOOST_REGISTER_OBJECTIVE
#include "xgboost/span.h" // for Span, operator!=
#include "xgboost/string_view.h" // for operator<<, StringView
#include "xgboost/task.h" // for ObjInfo

namespace xgboost::obj {
namespace cpu_impl {
@@ -115,9 +115,8 @@ class LambdaRankObj : public FitIntercept {
// This function doesn't have a sycl-specific implementation yet.
// For that reason, we transfer the data to the host when sycl is used, to ensure proper execution.
auto device = ctx_->Device().IsSycl() ? DeviceOrd::CPU() : ctx_->Device();
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(device),
lj_full_.View(device), &ti_plus_, &tj_minus_,
&li_, &lj_, p_cache_);
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(device), lj_full_.View(device),
&ti_plus_, &tj_minus_, &li_, &lj_, p_cache_);
}

li_full_.Data()->Fill(0.0);
@@ -163,7 +162,7 @@ class LambdaRankObj : public FitIntercept {
}

// Calculate lambda gradient for each group on CPU.
template <bool unbiased, typename Delta>
template <bool unbiased, bool norm_by_diff, typename Delta>
void CalcLambdaForGroup(std::int32_t iter, common::Span<float const> g_predt,
linalg::VectorView<float const> g_label, float w,
common::Span<std::size_t const> g_rank, bst_group_t g, Delta delta,
@@ -180,7 +179,9 @@ class LambdaRankObj : public FitIntercept {
// https://github.com/microsoft/LightGBM/pull/2331#issuecomment-523259298
double sum_lambda{0.0};

auto delta_op = [&](auto const&... args) { return delta(args..., g); };
auto delta_op = [&](auto const&... args) {
return delta(args..., g);
};

auto loop = [&](std::size_t i, std::size_t j) {
// higher/lower on the target ranked list
@@ -193,8 +194,8 @@
}

double cost;
auto pg = LambdaGrad<unbiased>(g_label, g_predt, g_rank, rank_high, rank_low, delta_op,
ti_plus, tj_minus, &cost);
auto pg = LambdaGrad<unbiased, norm_by_diff>(g_label, g_predt, g_rank, rank_high, rank_low,
delta_op, ti_plus, tj_minus, &cost);
auto ng = Repulse(pg);

std::size_t idx_high = g_rank[rank_high];
@@ -349,7 +350,14 @@ class LambdaRankNDCG : public LambdaRankObj<LambdaRankNDCG, ltr::NDCGCache> {
static_assert(std::is_floating_point_v<decltype(y_high)>);
return DeltaNDCG<exp_gain>(y_high, y_low, rank_high, rank_low, inv_IDCG(g), discount);
};
this->CalcLambdaForGroup<unbiased>(iter, g_predt, g_label, w, g_rank, g, delta, g_gpair);

if (this->param_.lambdarank_score_normalization) {
this->CalcLambdaForGroup<unbiased, true>(iter, g_predt, g_label, w, g_rank, g, delta,
g_gpair);
} else {
this->CalcLambdaForGroup<unbiased, false>(iter, g_predt, g_label, w, g_rank, g, delta,
g_gpair);
}
}

void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
Expand All @@ -372,7 +380,9 @@ class LambdaRankNDCG : public LambdaRankObj<LambdaRankNDCG, ltr::NDCGCache> {
auto h_predt = predt.ConstHostSpan();
auto h_label = info.labels.HostView();
auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);
auto make_range = [&](bst_group_t g) { return linalg::Range(gptr[g], gptr[g + 1]); };
auto make_range = [&](bst_group_t g) {
return linalg::Range(gptr[g], gptr[g + 1]);
};

auto dct = GetCache()->Discount(ctx_);
auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);
@@ -496,7 +506,9 @@ class LambdaRankMAP : public LambdaRankObj<LambdaRankMAP, ltr::MAPCache> {
auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);
auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);

auto make_range = [&](bst_group_t g) { return linalg::Range(gptr[g], gptr[g + 1]); };
auto make_range = [&](bst_group_t g) {
return linalg::Range(gptr[g], gptr[g + 1]);
};

cpu_impl::MAPStat(ctx_, h_label, rank_idx, GetCache());
auto n_rel = GetCache()->NumRelevant(ctx_);
@@ -528,9 +540,17 @@ class LambdaRankMAP : public LambdaRankObj<LambdaRankMAP, ltr::MAPCache> {
auto args = std::make_tuple(this, iter, g_predt, g_label, w, g_rank, g, delta_map, g_gpair);

if (param_.lambdarank_unbiased) {
std::apply(&LambdaRankMAP::CalcLambdaForGroup<true, D>, args);
if (this->param_.lambdarank_score_normalization) {
std::apply(&LambdaRankMAP::CalcLambdaForGroup<true, true, D>, args);
} else {
std::apply(&LambdaRankMAP::CalcLambdaForGroup<true, false, D>, args);
}
} else {
std::apply(&LambdaRankMAP::CalcLambdaForGroup<false, D>, args);
if (this->param_.lambdarank_score_normalization) {
std::apply(&LambdaRankMAP::CalcLambdaForGroup<false, true, D>, args);
} else {
std::apply(&LambdaRankMAP::CalcLambdaForGroup<false, false, D>, args);
}
}
});
}
@@ -583,10 +603,14 @@ class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::Ranking
auto h_predt = predt.ConstHostSpan();
auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);

auto make_range = [&](bst_group_t g) { return linalg::Range(gptr[g], gptr[g + 1]); };
auto make_range = [&](bst_group_t g) {
return linalg::Range(gptr[g], gptr[g + 1]);
};
auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);

auto delta = [](auto...) { return 1.0; };
auto delta = [](auto...) {
return 1.0;
};
using D = decltype(delta);

common::ParallelFor(n_groups, ctx_->Threads(), [&](auto g) {
Expand All @@ -599,9 +623,17 @@ class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::Ranking

auto args = std::make_tuple(this, iter, g_predt, g_label, w, g_rank, g, delta, g_gpair);
if (param_.lambdarank_unbiased) {
std::apply(&LambdaRankPairwise::CalcLambdaForGroup<true, D>, args);
if (this->param_.lambdarank_score_normalization) {
std::apply(&LambdaRankPairwise::CalcLambdaForGroup<true, true, D>, args);
} else {
std::apply(&LambdaRankPairwise::CalcLambdaForGroup<true, false, D>, args);
}
} else {
std::apply(&LambdaRankPairwise::CalcLambdaForGroup<false, D>, args);
if (this->param_.lambdarank_score_normalization) {
std::apply(&LambdaRankPairwise::CalcLambdaForGroup<false, true, D>, args);
} else {
std::apply(&LambdaRankPairwise::CalcLambdaForGroup<false, false, D>, args);
}
}
});
}