Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provide quantiles and average binning for RF histograms #2309

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,18 @@ enum SplitterMode
random /* Calculates best split from aggregate random feature splits for every node. */
};

/**
* <a name = "DAAL-ENUM-ALGORITHMS__DECISION_FOREST__TRAINING__BINNINGSTRATEGY"></a>
* \brief Available strategies to compute data bins in 'hist' method
*/
enum BinningStrategy
{
/* Frequency quantiles -> same number of data points per bin */
quantiles,
/* Same feature value range per bin */
averages
};

/**
* \brief Contains version 2.0 of the Intel(R) oneAPI Data Analytics Library interface
*/
Expand Down Expand Up @@ -145,6 +157,9 @@ class DAAL_EXPORT Parameter
size_t minBinSize; /*!< Used with 'hist' split finding method only.
Minimal number of observations in a bin. Default is 5 */
SplitterMode splitter; /*!< Sets node splitting method. Default is best */
BinningStrategy binningStrategy; /*!< Used with 'hist' split finding method only.
Selects the strategy to group data points into bins.
Allowed values are 'quantiles' (default), 'averages' */
};
/* [Parameter source code] */
} // namespace interface2
Expand Down
18 changes: 14 additions & 4 deletions cpp/daal/src/algorithms/dtrees/dtrees_feature_type_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#ifndef __DTREES_FEATURE_TYPE_HELPER_H__
#define __DTREES_FEATURE_TYPE_HELPER_H__

#include "include/algorithms/decision_forest/decision_forest_training_parameter.h"
#include "src/externals/service_memory.h"
#include "src/data_management/service_numeric_table.h"

Expand Down Expand Up @@ -69,12 +70,20 @@ class FeatureTypes
int _lastUnordered = -1;
};

using daal::algorithms::decision_forest::training::BinningStrategy;

struct BinParams
{
BinParams(size_t _maxBins, size_t _minBinSize) : maxBins(_maxBins), minBinSize(_minBinSize) {}
BinParams(const BinParams & o) : maxBins(o.maxBins), minBinSize(o.minBinSize) {}

size_t maxBins = 256;
BinParams(size_t _maxBins, size_t _minBinSize, BinningStrategy _binningStrategy = BinningStrategy::quantiles)
: maxBins(_maxBins), minBinSize(_minBinSize), binningStrategy(_binningStrategy)
{}
BinParams(const BinParams & o) : maxBins(o.maxBins), minBinSize(o.minBinSize), binningStrategy(o.binningStrategy) {}

/* Strategy to create bins for feature values. Default: quantiles */
BinningStrategy binningStrategy = BinningStrategy::quantiles;
/* Maximum number of bins for indexed data. Default: 256 */
size_t maxBins = 256;
/* Minimum bin width (number of data points per bin). Default: 5*/
size_t minBinSize = 5;
};

Expand Down Expand Up @@ -103,6 +112,7 @@ class IndexedFeatures
IndexedFeatures() : _data(nullptr), _entries(nullptr), _sizeOfIndex(sizeof(IndexType)), _nCols(0), _nRows(0), _capacity(0), _maxNumIndices(0) {}
~IndexedFeatures();

//initialize the feature indices, i.e. bins
template <typename algorithmFPType, CpuType cpu>
services::Status init(const NumericTable & nt, const FeatureTypes * featureTypes = nullptr, const BinParams * pBimPrm = nullptr);

Expand Down
193 changes: 148 additions & 45 deletions cpp/daal/src/algorithms/dtrees/dtrees_feature_type_helper.i
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
// Cpu-dependent initialization of service data structure
//--
*/

#include "include/services/error_indexes.h"
#include "src/algorithms/dtrees/dtrees_feature_type_helper.h"
#include "src/threading/threading.h"
#include "src/algorithms/service_error_handling.h"
Expand Down Expand Up @@ -151,8 +153,22 @@ struct ColIndexTaskBins : public ColIndexTask<IndexType, algorithmFPType, cpu>
{
typedef ColIndexTask<IndexType, algorithmFPType, cpu> super;
ColIndexTaskBins(size_t nRows, const BinParams & prm) : super(nRows), _prm(prm), _bins(_prm.maxBins) {}

/*
* Transform features based on the BinParams _prm.
* - If no BinParams _prm are provided, one bin per unique value in the
* dataset is created
* - If BinParams _prm are provided, the strategy set according to
* BinParams::Strategy is used
*/
virtual services::Status makeIndex(NumericTable & nt, IndexedFeatures::FeatureEntry & entry, IndexType * aRes, size_t iCol, size_t nRows,
bool bUnorderedFeature) DAAL_C11_OVERRIDE;
/* Function to create feature indices for Strategy == quantiles */
services::Status makeIndexQuantiles(NumericTable & nt, IndexedFeatures::FeatureEntry & entry, IndexType * aRes, size_t iCol, size_t nRows);
/* Function to create feature indices for Strategy == averages */
services::Status makeIndexAverages(NumericTable & nt, IndexedFeatures::FeatureEntry & entry, IndexType * aRes, size_t iCol, size_t nRows);
/* Helper to treat constant-valued features */
services::Status makeIndexConstant(IndexedFeatures::FeatureEntry & entry, IndexType * aRes, size_t nRows);

private:
services::Status assignIndexAccordingToBins(IndexedFeatures::FeatureEntry & entry, IndexType * aRes, size_t nBins, size_t nRows);
Expand Down Expand Up @@ -226,25 +242,37 @@ template <typename IndexType, typename algorithmFPType, CpuType cpu>
services::Status ColIndexTaskBins<IndexType, algorithmFPType, cpu>::makeIndex(NumericTable & nt, IndexedFeatures::FeatureEntry & entry,
IndexType * aRes, size_t iCol, size_t nRows, bool bUnorderedFeature)
{
/* feature is not ordered or fewer data points than bins -> no indexing needed */
if (bUnorderedFeature || nRows <= _prm.maxBins) return this->template makeIndexDefault<true>(nt, entry, aRes, iCol, nRows, bUnorderedFeature);

/* sort feature values */
Status s = this->getSorted(nt, iCol, nRows);
if (!s) return s;

/* special case: all values are the same -> constant-valued feature */
const typename super::FeatureIdx * index = this->_index.get();
if (index[0].key == index[nRows - 1].key)
{
_bins[0] = nRows;
services::internal::service_memset_seq<IndexType, cpu>(aRes, 0, nRows);
return makeIndexConstant(entry, aRes, nRows);
}

entry.numIndices = 1;
s |= entry.allocBorders();
DAAL_CHECK(s, s);
entry.binBorders[0] = index[nRows - 1].key;
return s;
/* Create bins of sorted data according to strategy selected in _prm */
switch (_prm.binningStrategy)
{
case dtrees::internal::BinningStrategy::quantiles: return makeIndexQuantiles(nt, entry, aRes, iCol, nRows);
case dtrees::internal::BinningStrategy::averages: return makeIndexAverages(nt, entry, aRes, iCol, nRows);
default: return Status(ErrorID::ErrorMethodNotSupported);
}
}

size_t nBins = 0;
template <typename IndexType, typename algorithmFPType, CpuType cpu>
services::Status ColIndexTaskBins<IndexType, algorithmFPType, cpu>::makeIndexQuantiles(NumericTable & nt, IndexedFeatures::FeatureEntry & entry,
IndexType * aRes, size_t iCol, size_t nRows)
{
const typename super::FeatureIdx * index = this->_index.get();

size_t nBins = 0;
DAAL_ASSERT(_prm.maxBins > 0);
const size_t binSize = nRows / _prm.maxBins;
int64_t remainder = nRows % _prm.maxBins; //allow for negative values
size_t dx = 2 * _prm.maxBins;
Expand All @@ -268,44 +296,59 @@ services::Status ColIndexTaskBins<IndexType, algorithmFPType, cpu>::makeIndex(Nu
}
size_t iRight = i + newBinSize - 1; //intersperse remainder amongst bins
const typename super::FeatureIdx & ri = index[iRight];
if (ri.key == index[iRight + 1].key)

if (ri.key != index[iRight + 1].key)
{
//right border can't be placed at iRight because it has to be between different feature values
//try moving the border to the right, find the first value bigger than the value at iRight
++iRight;
size_t r = iRight + binSize;
//at first, roughly locate the value bigger than iRight, jumping by binSize to the right
for (; (r < nRows) && (index[r].key == ri.key); r += binSize)
{}
if (r > nRows) r = nRows;
//then locate a new border as the upper_bound between this rough value and iRight
iRight = upper_bound<typename super::FeatureIdx>(index + iRight + 1, index + r, ri) - index;
//this is the size of the bin
newBinSize = iRight - i;
//if the value it is too big (number of feature values equal to ri.key is bigger than binSize)
//then perhaps left border of the bin can be moved to the right
if (newBinSize >= 2 * binSize)
// value changed from one bin to the next, append and continue
append(_bins, nBins, newBinSize);
i += newBinSize;
continue;
}

/* when arriving here, the feature value has not changed and
* we have to move iRight to the right until we find a new value
* r will be located at the first value that is different from ri.key
*/
++iRight;
size_t r = iRight + binSize;
while (r < nRows && index[r].key == ri.key)
{
r += binSize;
}
if (r > nRows)
{
r = nRows;
}
// upper_bound() returns the index of the first value change between
// index + iRight + 1 and index + r
iRight = upper_bound<typename super::FeatureIdx>(index + iRight + 1, index + r, ri) - index;
newBinSize = iRight - i;

if (newBinSize >= 2 * binSize)
{
// the new bin is too wide, try insert an additional bin to the left
size_t iClosestSmallerValue = i + binSize - 1;
while (iClosestSmallerValue > i && index[iClosestSmallerValue].key == ri.key)
{
size_t iClosestSmallerValue = i + binSize - 1;
for (; (iClosestSmallerValue > i) && (index[iClosestSmallerValue].key == ri.key); --iClosestSmallerValue)
;
size_t dist = iClosestSmallerValue - i;
if (dist > _prm.minBinSize)
{
//add an extra bin at the left
const size_t newLeftBinSize = dist + 1;
append(_bins, nBins, newLeftBinSize);
i += newLeftBinSize;
newBinSize -= newLeftBinSize;
}
else if ((nBins > 0) && dist)
{
//if it is small and not the first bin, then extend previous bin by the value
const size_t nAddToPrevBin = dist + 1;
_bins[nBins - 1] += nAddToPrevBin;
i += nAddToPrevBin;
newBinSize -= nAddToPrevBin;
}
--iClosestSmallerValue;
}
size_t dist = iClosestSmallerValue - i;
if (dist > _prm.minBinSize)
{
// add an extra bin at the left
const size_t newLeftBinSize = dist + 1;
append(_bins, nBins, newLeftBinSize);
i += newLeftBinSize;
newBinSize -= newLeftBinSize;
}
else if ((nBins > 0) && dist > 0)
{
// no room for an extra bin to the left, extend the previous
// one if possible
const size_t nAddToPrevBin = dist + 1;
_bins[nBins - 1] += nAddToPrevBin;
i += nAddToPrevBin;
newBinSize -= nAddToPrevBin;
}
if (remainder > 0)
{ //reset bresenhams line due to unexpected change in remainder
Expand All @@ -315,9 +358,13 @@ services::Status ColIndexTaskBins<IndexType, algorithmFPType, cpu>::makeIndex(Nu
D = dy - _prm.maxBins + nBins + 1;
}
}

// append the bin and continue
append(_bins, nBins, newBinSize);
i += newBinSize;
}

// collect the remaining data rows in the final bin
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be better to distribute the residual data rows more uniformly across the bins?

Copy link
Contributor Author

@ahuber21 ahuber21 May 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, but it's what we have been doing all along. I did not change the logic, only added the comment.

Copy link
Contributor

@icfaust icfaust May 19, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't worry y'all. Vika, you were correct that it was bad, and was the cause of a bug. The remainder is now distributed to the various bins using bresenham's algo which should be relatively uniform (given the discrete nature). There is some interplay associated with replicated values which plays around with this some, but should be small wrt to the bin size after the remainder distribution (like a single data point or so).

if (i < nRows)
{
size_t newBinSize = nRows - i;
Expand All @@ -330,6 +377,7 @@ services::Status ColIndexTaskBins<IndexType, algorithmFPType, cpu>::makeIndex(Nu
_bins[nBins - 1] += newBinSize;
}
}

#if _DEBUG
#if 0
//run-time check for bins correctness
Expand All @@ -347,6 +395,61 @@ services::Status ColIndexTaskBins<IndexType, algorithmFPType, cpu>::makeIndex(Nu
return assignIndexAccordingToBins(entry, aRes, nBins, nRows);
}

template <typename IndexType, typename algorithmFPType, CpuType cpu>
services::Status ColIndexTaskBins<IndexType, algorithmFPType, cpu>::makeIndexAverages(NumericTable & nt, IndexedFeatures::FeatureEntry & entry,
IndexType * aRes, size_t iCol, size_t nRows)
{
const typename super::FeatureIdx * index = this->_index.get();

size_t nBins = 0;
size_t i = 0;
DAAL_ASSERT(_prm.maxBins > 0);
algorithmFPType binSize = (index[nRows - 1].key - index[0].key) / _prm.maxBins;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could a division by zero happen here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, because we never create the binning task when maxBins=0 is selected. Nevertheless, I have added a DAAL_ASSERT for debug later in case this gets changed

algorithmFPType value = index[0].key;

while (i < nRows)
{
// next bin border to the right of current index
size_t iRight = i + 1;

while ((iRight < nRows) && (index[iRight].key < (value + binSize)))
{
++iRight;
}

// found a new binEdge
// append the bin and continue
size_t newBinSize = iRight - i;

append(_bins, nBins, newBinSize);

i = iRight;
value = index[i].key;
}

// assert we picked up all data records
DAAL_ASSERT(i == nRows);
DAAL_ASSERT(nBins <= _prm.maxBins);

return assignIndexAccordingToBins(entry, aRes, nBins, nRows);
}

template <typename IndexType, typename algorithmFPType, CpuType cpu>
services::Status ColIndexTaskBins<IndexType, algorithmFPType, cpu>::makeIndexConstant(IndexedFeatures::FeatureEntry & entry, IndexType * aRes,
size_t nRows)
{
const typename super::FeatureIdx * index = this->_index.get();

_bins[0] = nRows;
services::internal::service_memset_seq<IndexType, cpu>(aRes, 0, nRows);

entry.numIndices = 1;
Status s = entry.allocBorders();
DAAL_CHECK(s, s);
entry.binBorders[0] = index[nRows - 1].key;
return s;
}

template <typename algorithmFPType, CpuType cpu>
services::Status IndexedFeatures::init(const NumericTable & nt, const FeatureTypes * featureTypes, const BinParams * pBimPrm)
{
Expand All @@ -368,7 +471,7 @@ services::Status IndexedFeatures::init(const NumericTable & nt, const FeatureTyp

daal::tls<TlsTask *> tlsData([=, &nt]() -> TlsTask * {
const size_t nRows = nt.getNumberOfRows();
TlsTask * res = (pBimPrm ? new BinningTask(nRows, *pBimPrm) : new DefaultTask(nRows));
TlsTask * res = (!pBimPrm || (pBimPrm->maxBins == 0)) ? new DefaultTask(nRows) : new BinningTask(nRows, *pBimPrm);
if (res && !res->isValid())
{
delete res;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1703,7 +1703,7 @@ services::Status computeForSpecificHelper(HostAppIface * pHostApp, const Numeric
{
if (!memSave)
{
BinParams prm(par.maxBins, par.minBinSize);
BinParams prm(par.maxBins, par.minBinSize, par.binningStrategy);
s = indexedFeatures.init<algorithmFPType, cpu>(*x, &featTypes, &prm);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems the line #1078 s = indexedFeatures.init<algorithmFPType, cpu>(*x, &featTypes); should be changed accordingly, i.e. binningStrategy needs to be added there as well.

DAAL_CHECK_STATUS_VAR(s);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -827,7 +827,7 @@ services::Status ClassificationTrainBatchKernelOneAPI<algorithmFPType, hist>::co
kernelReducePartialHistograms = kernel_factory.getKernel("reducePartialHistograms", status);
DAAL_CHECK_STATUS_VAR(status);

dtrees::internal::BinParams prm(par.maxBins, par.minBinSize);
dtrees::internal::BinParams prm(par.maxBins, par.minBinSize, par.binningStrategy);
decision_forest::internal::IndexedFeaturesOneAPI<algorithmFPType> indexedFeatures;
dtrees::internal::FeatureTypes featTypes;

Expand Down
Loading