Skip to content

Commit

Permalink
enable filtering of categories by minimum number of samples
Browse files (browse the repository at this point in the history)
  • Loading branch information
paulbkoch committed Dec 30, 2024
1 parent 6aea545 commit 8e22d6b
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 16 deletions.
96 changes: 82 additions & 14 deletions shared/libebm/PartitionOneDimensionalBoosting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,8 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
EBM_ASSERT(nullptr != pTotalGain);
EBM_ASSERT(!bNominal || MONOTONE_NONE == monotoneDirection);

ErrorEbm error;

// TODO: use all of these!
UNUSED(bUnseen);
UNUSED(cCategorySamplesMin);
Expand Down Expand Up @@ -1010,15 +1012,15 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
pBinsEnd);

const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>** ppBin = apBins;
const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>* pBin = aBins;
Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>* pBin = aBins;

const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>* pMissingBin = nullptr;
bool bMissingIsolated = false;

const TreeNode<bHessian, GetArrayScores(cCompilerScores)>* pMissingValueTreeNode = nullptr;
const TreeNode<bHessian, GetArrayScores(cCompilerScores)>* pDregsTreeNode = nullptr;

const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>* pDregSumBin = nullptr;
Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>* pDregSumBin = nullptr;

const auto* aSumBins = aBins;
if(bMissing) {
Expand Down Expand Up @@ -1056,21 +1058,87 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
pBoosterShell, aSumBins, pBinsEnd, cSamplesTotal, weightTotal, pRootTreeNode->GetBin());

do {
*ppBin = pBin;
if(bNominal && pBin->GetCountSamples() < cCategorySamplesMin) {
if(pDregSumBin) {
pDregSumBin->Add(cScores, *pBin, pBin->GetGradientPairs());
} else {
pDregSumBin = pBin;
pDregsTreeNode = pRootTreeNode;
}
} else {
*ppBin = pBin;
++ppBin;
}
pBin = IndexBin(pBin, cBytesPerBin);
++ppBin;
} while(pBinsEnd != pBin);

if(bMissing && !bNominal && (TermBoostFlags_MissingHigh & flags)) {
*ppBin = aBins;
++ppBin;
}

if(bNominal) {
std::sort(apBins,
ppBin,
CompareBin<bHessian, cCompilerScores>(
!(TermBoostFlags_DisableNewtonUpdate & flags), categoricalSmoothing));
if(apBins == ppBin) {
// all categories are dregs, so pretend there's just one bin and everything is inside it

const bool bUpdateWithHessian = bHessian && !(TermBoostFlags_DisableNewtonUpdate & flags);

Tensor* const pInnerTermUpdate = pBoosterShell->GetInnerTermUpdate();

error = pInnerTermUpdate->SetCountSlices(iDimension, cBins);
if(UNLIKELY(Error_None != error)) {
// already logged
return error;
}

EBM_ASSERT(!IsMultiplyError(cScores, cBins));
error = pInnerTermUpdate->EnsureTensorScoreCapacity(cScores * cBins);
if(UNLIKELY(Error_None != error)) {
// already logged
return error;
}

UIntSplit* pSplit = pInnerTermUpdate->GetSplitPointer(iDimension);

UIntSplit iSplit = 1;
while(cBins != iSplit) {
pSplit[iSplit - 1] = iSplit;
++iSplit;
}

FloatScore* const aUpdateScore = pInnerTermUpdate->GetTensorScoresPointer();

size_t iBin = 0;
do {
// This bin is missing from the tree nodes, so it's a dreg
FloatScore* pUpdateScoreDregs = aUpdateScore + cScores * iBin;

FloatScore hess = static_cast<FloatCalc>(pRootTreeNode->GetBin()->GetWeight());
const auto* pGradientPair = pRootTreeNode->GetBin()->GetGradientPairs();
const auto* const pGradientPairEnd = pGradientPair + cScores;
do {
if(bUpdateWithHessian) {
hess = static_cast<FloatCalc>(pGradientPair->GetHess());
}
FloatCalc updateScore = -CalcNegUpdate<true>(
static_cast<FloatCalc>(pGradientPair->m_sumGradients), hess, regAlpha, regLambda, deltaStepMax);

*pUpdateScoreDregs = static_cast<FloatScore>(updateScore);
++pUpdateScoreDregs;

++pGradientPair;
} while(pGradientPairEnd != pGradientPair);
++iBin;
} while(cBins != iBin);

*pTotalGain = 0;
return error;
} else {
std::sort(apBins,
ppBin,
CompareBin<bHessian, cCompilerScores>(
!(TermBoostFlags_DisableNewtonUpdate & flags), categoricalSmoothing));
}
} else {
if(bMissing && (TermBoostFlags_MissingHigh & flags)) {
*ppBin = aBins;
++ppBin;
}
}

pRootTreeNode->BEFORE_SetBinFirst(apBins);
Expand Down Expand Up @@ -1246,7 +1314,7 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
if(bNominal) {
cSlices = cBins;
}
const ErrorEbm error = Flatten<bHessian>(pBoosterShell,
error = Flatten<bHessian>(pBoosterShell,
bMissing,
bNominal,
flags,
Expand Down
4 changes: 2 additions & 2 deletions shared/libebm/tests/boosting_unusual_inputs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2224,7 +2224,7 @@ static double RandomizedTesting(const AccelerationFlags acceleration) {
const double regAlpha = 0 == TestRand(rng, 5) ? 0.015625 : 0.0;
const double regLambda = 0 == TestRand(rng, 5) ? 0.015625 : 0.0;
const double maxDeltaStep = 0 == TestRand(rng, 5) ? 1.0 : 0.0;
const IntEbm minCategorySamples = 0; // TODO: make random
const IntEbm minCategorySamples = TestRand(rng, 100);
const double minCategoryHessianPercent = 0.0; // TODO: make random
const double categoricalSmoothing = 10.0;
const IntEbm maxCategoricalThreshold = 1 + TestRand(rng, cRealBins + 1);
Expand Down Expand Up @@ -2268,7 +2268,7 @@ static double RandomizedTesting(const AccelerationFlags acceleration) {
}

TEST_CASE("stress test, boosting") {
const double expected = 15052328055998.955;
const double expected = 17508883449920.195;

double validationMetricExact = RandomizedTesting(AccelerationFlags_NONE);
CHECK(validationMetricExact == expected);
Expand Down

0 comments on commit 8e22d6b

Please sign in to comment.