Skip to content

Commit

Permalink
[ML] Trap out-of-bounds read initialising boosted tree training (#709)
Browse files Browse the repository at this point in the history
  • Loading branch information
tveasey authored Oct 1, 2019
1 parent 8a7f15c commit 55b9f3e
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 2 deletions.
5 changes: 3 additions & 2 deletions lib/maths/CBoostedTreeImpl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -414,8 +414,9 @@ CBoostedTreeImpl::gainAndCurvatureAtPercentile(double percentile,
return {0.0, 0.0};
}

std::size_t index{static_cast<std::size_t>(
percentile * static_cast<double>(gains.size()) / 100.0 + 0.5)};
std::size_t index{std::min(
static_cast<std::size_t>(percentile * static_cast<double>(gains.size()) / 100.0 + 0.5),
gains.size() - 1)};
std::nth_element(gains.begin(), gains.begin() + index, gains.end());
std::nth_element(curvatures.begin(), curvatures.begin() + index, curvatures.end());

Expand Down
46 changes: 46 additions & 0 deletions lib/maths/unittest/CBoostedTreeTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,50 @@ void CBoostedTreeTest::testIntegerRegressor() {
CPPUNIT_ASSERT(modelRSquared > 0.99);
}

void CBoostedTreeTest::testSingleSplit() {

// We were getting an out-of-bound read in initialization when running on
// data with only two distinct values for the target variable. This test
// fails intermittently without the fix.

test::CRandomNumbers rng;

std::size_t rows{100};
std::size_t cols{2};

TDoubleVec x;
rng.generateUniformSamples(0.0, 1.0, rows, x);
for (auto& xi : x) {
xi = std::floor(xi + 0.5);
}

auto frame = core::makeMainStorageDataFrame(cols).first;
frame->categoricalColumns({false, false});
for (std::size_t i = 0; i < rows; ++i) {
frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
*(column++) = x[i];
*column = 10.0 * x[i];
});
}
frame->finishWritingRows();

auto regression = maths::CBoostedTreeFactory::constructFromParameters(
1, std::make_unique<maths::boosted_tree::CMse>())
.buildFor(*frame, cols - 1);

regression->train();

double modelBias;
double modelRSquared;
std::tie(modelBias, modelRSquared) = computeEvaluationMetrics(
*frame, 0, rows, regression->columnHoldingPrediction(frame->numberColumns()),
[](const TRowRef& row) { return 10.0 * row[0]; }, 0.0);

LOG_DEBUG(<< "bias = " << modelBias);
LOG_DEBUG(<< " R^2 = " << modelRSquared);
CPPUNIT_ASSERT(modelRSquared > 0.99);
}

void CBoostedTreeTest::testTranslationInvariance() {

// We should get similar performance independent of fixed shifts for the target.
Expand Down Expand Up @@ -1070,6 +1114,8 @@ CppUnit::Test* CBoostedTreeTest::suite() {
&CBoostedTreeTest::testCategoricalRegressors));
suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
"CBoostedTreeTest::testIntegerRegressor", &CBoostedTreeTest::testIntegerRegressor));
suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
"CBoostedTreeTest::testSingleSplit", &CBoostedTreeTest::testSingleSplit));
suiteOfTests->addTest(new CppUnit::TestCaller<CBoostedTreeTest>(
"CBoostedTreeTest::testTranslationInvariance",
&CBoostedTreeTest::testTranslationInvariance));
Expand Down
1 change: 1 addition & 0 deletions lib/maths/unittest/CBoostedTreeTest.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class CBoostedTreeTest : public CppUnit::TestFixture {
void testConstantTarget();
void testCategoricalRegressors();
void testIntegerRegressor();
void testSingleSplit();
void testTranslationInvariance();
void testDepthBasedRegularization();
void testEstimateMemoryUsedByTrain();
Expand Down

0 comments on commit 55b9f3e

Please sign in to comment.