Optional normalization for the RankNet loss. #11272

Merged: 8 commits on Feb 25, 2025

Changes from 2 commits
6 changes: 6 additions & 0 deletions doc/parameter.rst
@@ -540,6 +540,12 @@ These are parameters specific to learning to rank task. See :doc:`Learning to Ra

Whether to normalize the leaf value by the lambda gradient. This can sometimes cause the training progress to stagnate.

* ``lambdarank_diff_normalization`` [default = ``true``]

.. versionadded:: 3.0.0

Whether to normalize the delta by the difference in prediction scores. This can sometimes cause the training progress to stagnate.

* ``lambdarank_unbiased`` [default = ``false``]

Specify whether we need to debias the input click data.
28 changes: 28 additions & 0 deletions doc/tutorials/learning_to_rank.rst
@@ -186,6 +186,34 @@ For a longer explanation, assuming the pairwise ranking method is used, we calcu

However, it's possible that a distributed framework shuffles the data during map-reduce and splits every query group across multiple workers. In that case, the performance would be disastrous. As a result, whether a sorted groupby is needed depends on the data and the framework; a minimal sketch of such a sort follows.
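
As a minimal sketch of such a sorted groupby, assuming a small hypothetical pandas DataFrame with feature columns plus ``label`` and ``qid`` columns (the column names here are placeholders, not part of the XGBoost API):

.. code-block:: python

    import pandas as pd

    # Hypothetical toy frame: two features, a relevance label, and a query id.
    df = pd.DataFrame(
        {"f0": [0.1, 0.4, 0.3, 0.8], "f1": [1.0, 0.2, 0.5, 0.7],
         "label": [0, 1, 1, 0], "qid": [2, 1, 2, 1]}
    )

    # Sort by query id so that rows belonging to the same query are contiguous,
    # which is what the ranking objectives expect when `qid` is passed to the estimator.
    df = df.sort_values("qid", kind="stable")
    X = df[["f0", "f1"]]
    y = df["label"]
    qid = df["qid"]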

**********************************
Comparing Results with Version 1.7
**********************************

The learning-to-rank implementation was significantly updated in 2.0 with additional hyper-parameters and training strategies. To obtain results similar to those of the 1.7 :py:class:`xgboost.XGBRanker`, use the following parameters (a usage sketch follows the listing):

.. code-block:: python

params = {
# 1.7 only supports sampling, while 2.0 and later use top-k as the default.
# See above sections for the trade-off.
"lambdarank_pair_method": "mean",
# Normalization was added in 2.0
"lambdarank_normalization": False,
# 1.7 uses the RankNet loss, while later versions use the NDCG-weighted loss.
"objective": "rank:pairwise",
# 1.7 doesn't have this normalization.
"lambdarank_diff_normalization": False,
"base_score": 0.5,
# The default tree method has been changed from approx to hist.
"tree_method": "approx",
# The default for the `mean` pair method is one pair per sample, which is also the default in 1.7.
# You can leave it unset.
"lambdarank_num_pair_per_sample": 1,
}
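
As a minimal usage sketch, these parameters can be passed directly to the ranker; ``X``, ``y``, and ``qid`` below are placeholders for your own feature matrix, relevance labels, and query ids (for example, the sorted data from the groupby sketch above).

.. code-block:: python

    import xgboost as xgb

    # Train with the 1.7-compatible parameters defined above.
    ranker = xgb.XGBRanker(**params, n_estimators=100)
    ranker.fit(X, y, qid=qid)
    # Higher scores mean a document is ranked closer to the top of its query group.
    scores = ranker.predict(X)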

The results still differ due to the change of random seed, but the overall training strategy is the same for ``rank:pairwise``. In later versions, objectives such as ``NDCG`` and ``MAP`` apply an additional normalization to the delta weight using the score difference, as sketched below.
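
As a rough sketch of that score-difference normalization (the constant and the guard conditions below are assumptions for illustration, not the library's exact formula):

.. code-block:: python

    def normalized_delta(delta_metric: float, s_high: float, s_low: float) -> float:
        """Scale the change in the ranking metric (e.g. delta NDCG) by the pair's
        prediction score gap; a small constant keeps the division stable when the
        two scores are nearly equal."""
        return delta_metric / (0.01 + abs(s_high - s_low))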

*******************
Reproducible Result
*******************
5 changes: 5 additions & 0 deletions src/common/ranking_utils.h
@@ -79,6 +79,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
// unbiased
bool lambdarank_unbiased{false};
bool lambdarank_normalization{true};
bool lambdarank_diff_normalization{true};
double lambdarank_bias_norm{1.0};
// ndcg
bool ndcg_exp_gain{true};
@@ -88,6 +89,7 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
lambdarank_num_pair_per_sample == that.lambdarank_num_pair_per_sample &&
lambdarank_unbiased == that.lambdarank_unbiased &&
lambdarank_normalization == that.lambdarank_normalization &&
lambdarank_diff_normalization == that.lambdarank_diff_normalization &&
lambdarank_bias_norm == that.lambdarank_bias_norm && ndcg_exp_gain == that.ndcg_exp_gain;
}
bool operator!=(LambdaRankParam const& that) const { return !(*this == that); }
@@ -139,6 +141,9 @@ struct LambdaRankParam : public XGBoostParameter<LambdaRankParam> {
DMLC_DECLARE_FIELD(lambdarank_normalization)
.set_default(true)
.describe("Whether to normalize the leaf value for lambda rank.");
DMLC_DECLARE_FIELD(lambdarank_diff_normalization)
.set_default(true)
.describe("Whether to normalize the delta by prediction score difference.");
DMLC_DECLARE_FIELD(lambdarank_bias_norm)
.set_default(1.0)
.set_lower_bound(0.0)
130 changes: 81 additions & 49 deletions src/objective/lambdarank_obj.cc
@@ -1,40 +1,40 @@
/**
* Copyright 2023-2024, XGBoost contributors
* Copyright 2023-2025, XGBoost contributors
*/
#include "lambdarank_obj.h"

#include <dmlc/registry.h> // for DMLC_REGISTRY_FILE_TAG

#include <algorithm> // for transform, copy, fill_n, min, max
#include <cmath> // for pow, log2
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <map> // for operator!=
#include <memory> // for shared_ptr, __shared_ptr_access, allocator
#include <ostream> // for operator<<, basic_ostream
#include <string> // for char_traits, operator<, basic_string, string
#include <tuple> // for apply, make_tuple
#include <type_traits> // for is_floating_point
#include <utility> // for pair, swap
#include <vector> // for vector

#include "../common/error_msg.h" // for GroupWeight, LabelScoreSize
#include "../common/linalg_op.h" // for begin, cbegin, cend
#include "../common/optional_weight.h" // for MakeOptionalWeights, OptionalWeights
#include "../common/ranking_utils.h" // for RankingCache, LambdaRankParam, MAPCache, NDCGC...
#include "../common/threading_utils.h" // for ParallelFor, Sched
#include "init_estimation.h" // for FitIntercept
#include "xgboost/base.h" // for bst_group_t, GradientPair, kRtEps, GradientPai...
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for MetaInfo
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/json.h" // for Json, get, Value, ToJson, F32Array, FromJson, IsA
#include "xgboost/linalg.h" // for Vector, Range, TensorView, VectorView, All
#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_LE, CHE...
#include "xgboost/objective.h" // for ObjFunctionReg, XGBOOST_REGISTER_OBJECTIVE
#include "xgboost/span.h" // for Span, operator!=
#include "xgboost/string_view.h" // for operator<<, StringView
#include "xgboost/task.h" // for ObjInfo

namespace xgboost::obj {
namespace cpu_impl {
@@ -115,9 +115,8 @@ class LambdaRankObj : public FitIntercept {
// This function doesn't have a sycl-specific implementation yet.
// For that reason we transfer the data to the host when sycl is used, for proper execution.
auto device = ctx_->Device().IsSycl() ? DeviceOrd::CPU() : ctx_->Device();
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(device),
lj_full_.View(device), &ti_plus_, &tj_minus_,
&li_, &lj_, p_cache_);
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(device), lj_full_.View(device),
&ti_plus_, &tj_minus_, &li_, &lj_, p_cache_);
}

li_full_.Data()->Fill(0.0);
@@ -163,7 +162,7 @@ class LambdaRankObj : public FitIntercept {
}

// Calculate lambda gradient for each group on CPU.
template <bool unbiased, typename Delta>
template <bool unbiased, bool norm_by_diff, typename Delta>
void CalcLambdaForGroup(std::int32_t iter, common::Span<float const> g_predt,
linalg::VectorView<float const> g_label, float w,
common::Span<std::size_t const> g_rank, bst_group_t g, Delta delta,
@@ -180,7 +179,9 @@
// https://github.com/microsoft/LightGBM/pull/2331#issuecomment-523259298
double sum_lambda{0.0};

auto delta_op = [&](auto const&... args) { return delta(args..., g); };
auto delta_op = [&](auto const&... args) {
return delta(args..., g);
};

auto loop = [&](std::size_t i, std::size_t j) {
// higher/lower on the target ranked list
@@ -193,8 +194,8 @@
}

double cost;
auto pg = LambdaGrad<unbiased>(g_label, g_predt, g_rank, rank_high, rank_low, delta_op,
ti_plus, tj_minus, &cost);
auto pg = LambdaGrad<unbiased, norm_by_diff>(g_label, g_predt, g_rank, rank_high, rank_low,
delta_op, ti_plus, tj_minus, &cost);
auto ng = Repulse(pg);

std::size_t idx_high = g_rank[rank_high];
@@ -349,7 +350,14 @@ class LambdaRankNDCG : public LambdaRankObj<LambdaRankNDCG, ltr::NDCGCache> {
static_assert(std::is_floating_point_v<decltype(y_high)>);
return DeltaNDCG<exp_gain>(y_high, y_low, rank_high, rank_low, inv_IDCG(g), discount);
};
this->CalcLambdaForGroup<unbiased>(iter, g_predt, g_label, w, g_rank, g, delta, g_gpair);

if (this->param_.lambdarank_diff_normalization) {
this->CalcLambdaForGroup<unbiased, true>(iter, g_predt, g_label, w, g_rank, g, delta,
g_gpair);
} else {
this->CalcLambdaForGroup<unbiased, false>(iter, g_predt, g_label, w, g_rank, g, delta,
g_gpair);
}
}

void GetGradientImpl(std::int32_t iter, const HostDeviceVector<float>& predt,
@@ -372,7 +380,9 @@ class LambdaRankNDCG : public LambdaRankObj<LambdaRankNDCG, ltr::NDCGCache> {
auto h_predt = predt.ConstHostSpan();
auto h_label = info.labels.HostView();
auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);
auto make_range = [&](bst_group_t g) { return linalg::Range(gptr[g], gptr[g + 1]); };
auto make_range = [&](bst_group_t g) {
return linalg::Range(gptr[g], gptr[g + 1]);
};

auto dct = GetCache()->Discount(ctx_);
auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);
@@ -496,7 +506,9 @@ class LambdaRankMAP : public LambdaRankObj<LambdaRankMAP, ltr::MAPCache> {
auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);
auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);

auto make_range = [&](bst_group_t g) { return linalg::Range(gptr[g], gptr[g + 1]); };
auto make_range = [&](bst_group_t g) {
return linalg::Range(gptr[g], gptr[g + 1]);
};

cpu_impl::MAPStat(ctx_, h_label, rank_idx, GetCache());
auto n_rel = GetCache()->NumRelevant(ctx_);
@@ -528,9 +540,17 @@ class LambdaRankMAP : public LambdaRankObj<LambdaRankMAP, ltr::MAPCache> {
auto args = std::make_tuple(this, iter, g_predt, g_label, w, g_rank, g, delta_map, g_gpair);

if (param_.lambdarank_unbiased) {
std::apply(&LambdaRankMAP::CalcLambdaForGroup<true, D>, args);
if (this->param_.lambdarank_diff_normalization) {
std::apply(&LambdaRankMAP::CalcLambdaForGroup<true, true, D>, args);
} else {
std::apply(&LambdaRankMAP::CalcLambdaForGroup<true, false, D>, args);
}
} else {
std::apply(&LambdaRankMAP::CalcLambdaForGroup<false, D>, args);
if (this->param_.lambdarank_diff_normalization) {
std::apply(&LambdaRankMAP::CalcLambdaForGroup<false, true, D>, args);
} else {
std::apply(&LambdaRankMAP::CalcLambdaForGroup<false, false, D>, args);
}
}
});
}
@@ -583,10 +603,14 @@ class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::Ranking
auto h_predt = predt.ConstHostSpan();
auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_);

auto make_range = [&](bst_group_t g) { return linalg::Range(gptr[g], gptr[g + 1]); };
auto make_range = [&](bst_group_t g) {
return linalg::Range(gptr[g], gptr[g + 1]);
};
auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt);

auto delta = [](auto...) { return 1.0; };
auto delta = [](auto...) {
return 1.0;
};
using D = decltype(delta);

common::ParallelFor(n_groups, ctx_->Threads(), [&](auto g) {
Expand All @@ -599,9 +623,17 @@ class LambdaRankPairwise : public LambdaRankObj<LambdaRankPairwise, ltr::Ranking

auto args = std::make_tuple(this, iter, g_predt, g_label, w, g_rank, g, delta, g_gpair);
if (param_.lambdarank_unbiased) {
std::apply(&LambdaRankPairwise::CalcLambdaForGroup<true, D>, args);
if (this->param_.lambdarank_diff_normalization) {
std::apply(&LambdaRankPairwise::CalcLambdaForGroup<true, true, D>, args);
} else {
std::apply(&LambdaRankPairwise::CalcLambdaForGroup<true, false, D>, args);
}
} else {
std::apply(&LambdaRankPairwise::CalcLambdaForGroup<false, D>, args);
if (this->param_.lambdarank_diff_normalization) {
std::apply(&LambdaRankPairwise::CalcLambdaForGroup<false, true, D>, args);
} else {
std::apply(&LambdaRankPairwise::CalcLambdaForGroup<false, false, D>, args);
}
}
});
}