diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 4eb6042e8c..ec93e71152 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -33,6 +33,8 @@ === Enhancements * Upgrade Boost libraries to version 1.83. (See {ml-pull}2560[#2560].) +* Improve forecasting for time series with step changes. (See {ml-pull}2591[#2591], + issue: {ml-issue}2466[#2466].) === Bug Fixes diff --git a/include/maths/common/CNaiveBayes.h b/include/maths/common/CNaiveBayes.h index 180f5c45f3..9abb686522 100644 --- a/include/maths/common/CNaiveBayes.h +++ b/include/maths/common/CNaiveBayes.h @@ -154,19 +154,43 @@ class MATHS_COMMON_EXPORT CNaiveBayesFeatureDensityFromPrior final TPriorPtr m_Prior; }; +//! \brief Enables using custom feature weights in class prediction. +class CNaiveBayesFeatureWeight { +public: + virtual ~CNaiveBayesFeatureWeight() = default; + virtual void add(std::size_t class_, double logLikelihood) = 0; + virtual double calculate() const = 0; +}; + //! \brief Implements a Naive Bayes classifier. class MATHS_COMMON_EXPORT CNaiveBayes { public: + using TDoubleDoublePr = std::pair<double, double>; using TDoubleSizePr = std::pair<double, std::size_t>; using TDoubleSizePrVec = std::vector<TDoubleSizePr>; + using TDoubleSizePrVecDoublePr = std::pair<TDoubleSizePrVec, double>; using TDouble1Vec = core::CSmallVector<double, 1>; using TDouble1VecVec = std::vector<TDouble1Vec>; - using TOptionalDouble = std::optional<double>; + using TFeatureWeightProvider = std::function<CNaiveBayesFeatureWeight&()>; + +private: + //! \brief All features have unit weight in class prediction. + class CUnitFeatureWeight : public CNaiveBayesFeatureWeight { + public: + void add(std::size_t, double) override {} + double calculate() const override { return 1.0; } + }; + + class CUnitFeatureWeightProvider { + public: + CUnitFeatureWeight& operator()() const { return m_UnitWeight; } + + private: + mutable CUnitFeatureWeight m_UnitWeight; + }; public: - explicit CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar, - double decayRate = 0.0, - TOptionalDouble minMaxLogLikelihoodToUseFeature = TOptionalDouble()); + explicit CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar, double decayRate = 0.0); CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar, const SDistributionRestoreParams& params, core::CStateRestoreTraverser& traverser); @@ -184,6 +208,9 @@ class MATHS_COMMON_EXPORT CNaiveBayes { //! Check if any training data has been added. bool initialized() const; + //! Get the number of classes. + std::size_t numberClasses() const; + //! This can be used to optionally seed the class counts //! with \p counts. These are added on to data class counts //! to compute the class posterior probabilities. @@ -210,27 +237,53 @@ class MATHS_COMMON_EXPORT CNaiveBayes { //! //! \param[in] n The number of class probabilities to estimate. //! \param[in] x The feature values. + //! \param[in] weightProvider Computes a feature weight from the class + //! conditional log-likelihood of the feature value. It should be in + //! the range [0,1]. The smaller the value, the less impact the feature + //! has on class selection. + //! \return The class probabilities and the minimum feature weight. //! \note \p x size should be equal to the number of features. //! A missing feature is indicated by passing an empty vector //! for that feature. - TDoubleSizePrVec highestClassProbabilities(std::size_t n, const TDouble1VecVec& x) const; + TDoubleSizePrVecDoublePr highestClassProbabilities( + std::size_t n, + const TDouble1VecVec& x, + const TFeatureWeightProvider& weightProvider = CUnitFeatureWeightProvider{}) const; //!
Get the probability of the class labeled \p label for \p x. //! //! \param[in] label The label of the class of interest. //! \param[in] x The feature values. + //! \param[in] weightProvider Computes a feature weight from the class + //! conditional log-likelihood of the feature value. It should be in + //! the range [0,1]. The smaller the value, the less impact the feature + //! has on class selection. + //! \return The class probability and the minimum feature weight. //! \note \p x size should be equal to the number of features. //! A missing feature is indicated by passing an empty vector //! for that feature. - double classProbability(std::size_t label, const TDouble1VecVec& x) const; + TDoubleDoublePr classProbability(std::size_t label, + const TDouble1VecVec& x, + const TFeatureWeightProvider& weightProvider = + CUnitFeatureWeightProvider{}) const; //! Get the probabilities of all the classes for \p x. //! //! \param[in] x The feature values. + //! \param[in] weightProvider Computes a feature weight from the class + //! conditional log-likelihood of the feature value. It should be in + //! the range [0,1]. The smaller the value, the less impact the feature + //! has on class selection. + //! \return The class probabilities and the minimum feature weight. //! \note \p x size should be equal to the number of features. //! A missing feature is indicated by passing an empty vector //! for that feature. - TDoubleSizePrVec classProbabilities(const TDouble1VecVec& x) const; + TDoubleSizePrVecDoublePr + classProbabilities(const TDouble1VecVec& x, + const TFeatureWeightProvider& weightProvider = CUnitFeatureWeightProvider{}) const; //! Debug the memory used by this object. void debugMemoryUsage(const core::CMemoryUsage::TMemoryUsagePtr& mem) const; @@ -298,13 +351,6 @@ class MATHS_COMMON_EXPORT CNaiveBayes { bool validate(const TDouble1VecVec& x) const; private: - //! It is not always appropriate to use features with very low - //! probability in all classes to discriminate: the class choice - //! will be very sensitive to the underlying conditional density - //! model. This is a cutoff (for the minimum maximum class log - //! likelihood) in order to use a feature. - TOptionalDouble m_MinMaxLogLikelihoodToUseFeature; - //! Controls the rate at which data are aged out. double m_DecayRate;
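For context, here is a minimal sketch of how a caller can plug a custom weight into this API. CLogisticFeatureWeight and the -20.0 cutoff are illustrative assumptions chosen to roughly mimic the removed m_MinMaxLogLikelihoodToUseFeature behaviour; only CNaiveBayesFeatureWeight and TFeatureWeightProvider come from the header above.

    // Illustrative sketch, not part of this patch: downweight a feature when
    // its best class conditional log-likelihood is small, i.e. when the
    // feature value is implausible under every class.
    #include <maths/common/CNaiveBayes.h>

    #include <algorithm>
    #include <cmath>
    #include <limits>

    class CLogisticFeatureWeight : public ml::maths::common::CNaiveBayesFeatureWeight {
    public:
        void add(std::size_t /*class_*/, double logLikelihood) override {
            // Track the best class conditional log-likelihood of the feature.
            m_MaxLogLikelihood = std::max(m_MaxLogLikelihood, logLikelihood);
        }
        double calculate() const override {
            // Drops smoothly from 1 to 0 as the maximum log-likelihood falls
            // through the (assumed) cutoff of -20.
            return 1.0 / (1.0 + std::exp(-(m_MaxLogLikelihood + 20.0)));
        }

    private:
        double m_MaxLogLikelihood{std::numeric_limits<double>::lowest()};
    };

The provider is called once per feature, so it hands back a freshly reset weight each time; here classifier stands for a trained CNaiveBayes:

    auto weightProvider = [weight = CLogisticFeatureWeight{}]() mutable
        -> ml::maths::common::CNaiveBayesFeatureWeight& {
        weight = CLogisticFeatureWeight{};
        return weight;
    };
    auto [p, minWeight] = classifier.classProbabilities(x, weightProvider);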
diff --git a/lib/core/CStateRestoreTraverser.cc b/lib/core/CStateRestoreTraverser.cc index bad15cea60..a78ed3261f 100644 --- a/lib/core/CStateRestoreTraverser.cc +++ b/lib/core/CStateRestoreTraverser.cc @@ -18,8 +18,7 @@ namespace core { CStateRestoreTraverser::CStateRestoreTraverser() : m_BadState(false) { } -CStateRestoreTraverser::~CStateRestoreTraverser() { -} +CStateRestoreTraverser::~CStateRestoreTraverser() = default; bool CStateRestoreTraverser::haveBadState() const { return m_BadState; diff --git a/lib/maths/common/CNaiveBayes.cc b/lib/maths/common/CNaiveBayes.cc index d1a23cba38..9c29cefcc1 100644 --- a/lib/maths/common/CNaiveBayes.cc +++ b/lib/maths/common/CNaiveBayes.cc @@ -40,8 +40,6 @@ namespace { const core::TPersistenceTag PRIOR_TAG{"a", "prior"}; const core::TPersistenceTag CLASS_LABEL_TAG{"b", "class_label"}; const core::TPersistenceTag CLASS_MODEL_TAG{"c", "class_model"}; -const core::TPersistenceTag MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG{ "d", "min_max_likelihood_to_use_feature"}; const core::TPersistenceTag COUNT_TAG{"e", "count"}; const core::TPersistenceTag CONDITIONAL_DENSITY_FROM_PRIOR_TAG{"f", "conditional_density_from_prior"}; } @@ -141,18 +139,18 @@ std::string CNaiveBayesFeatureDensityFromPrior::print() const { return result; } -CNaiveBayes::CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar, - double decayRate, - TOptionalDouble minMaxLogLikelihoodToUseFeature) - : m_MinMaxLogLikelihoodToUseFeature{minMaxLogLikelihoodToUseFeature}, - m_DecayRate{decayRate}, m_Exemplar{exemplar.clone()}, m_ClassConditionalDensities{2} { +CNaiveBayes::CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar, double decayRate) : m_DecayRate{decayRate}, m_Exemplar{exemplar.clone()}, m_ClassConditionalDensities{2} { } CNaiveBayes::CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar, const SDistributionRestoreParams& params, core::CStateRestoreTraverser& traverser) : m_DecayRate{params.s_DecayRate}, m_Exemplar{exemplar.clone()}, m_ClassConditionalDensities{2} { - if (traverser.traverseSubLevel([&](auto& traverser_) { + // If we persist before we create class conditional distributions we will + // not have anything to restore and hasSubLevel will be false. Trying to + // restore sets the traverser state to bad so we need to handle this explicitly.
+ if (traverser.hasSubLevel() && traverser.traverseSubLevel([&](auto& traverser_) { return this->acceptRestoreTraverser(params, traverser_); }) == false) { traverser.setBadState(); @@ -160,8 +158,7 @@ CNaiveBayes::CNaiveBayes(const CNaiveBayesFeatureDensity& exemplar, } CNaiveBayes::CNaiveBayes(const CNaiveBayes& other) - : m_MinMaxLogLikelihoodToUseFeature{other.m_MinMaxLogLikelihoodToUseFeature}, - m_DecayRate{other.m_DecayRate}, m_Exemplar{other.m_Exemplar->clone()} { + : m_DecayRate{other.m_DecayRate}, m_Exemplar{other.m_Exemplar->clone()} { for (const auto& class_ : other.m_ClassConditionalDensities) { m_ClassConditionalDensities.emplace(class_.first, class_.second); } @@ -178,9 +175,6 @@ bool CNaiveBayes::acceptRestoreTraverser(const SDistributionRestoreParams& param return class_.acceptRestoreTraverser(params, traverser_); }), m_ClassConditionalDensities.emplace(label, std::move(class_))) - RESTORE_SETUP_TEARDOWN(MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG, double value, core::CStringUtils::stringToType(traverser.value(), value), m_MinMaxLogLikelihoodToUseFeature.emplace(value)) } while (traverser.next()); return true; } @@ -203,12 +197,6 @@ void CNaiveBayes::acceptPersistInserter(core::CStatePersistInserter& inserter) c class_->second.acceptPersistInserter(inserter_); }); } - - if (m_MinMaxLogLikelihoodToUseFeature) { - inserter.insertValue(MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG, - *m_MinMaxLogLikelihoodToUseFeature, - core::CIEEE754::E_SinglePrecision); - } } CNaiveBayes& CNaiveBayes::operator=(const CNaiveBayes& other) { @@ -223,11 +211,10 @@ void CNaiveBayes::swap(CNaiveBayes& other) { std::swap(m_DecayRate, other.m_DecayRate); m_Exemplar.swap(other.m_Exemplar); m_ClassConditionalDensities.swap(other.m_ClassConditionalDensities); - std::swap(m_MinMaxLogLikelihoodToUseFeature, other.m_MinMaxLogLikelihoodToUseFeature); } bool CNaiveBayes::initialized() const { - return m_ClassConditionalDensities.size() > 0 && + return m_ClassConditionalDensities.empty() == false && std::all_of(m_ClassConditionalDensities.begin(), m_ClassConditionalDensities.end(), [](const std::pair<std::size_t, CClass>& class_) { @@ -235,6 +222,10 @@ bool CNaiveBayes::initialized() const { }); } +std::size_t CNaiveBayes::numberClasses() const { + return m_ClassConditionalDensities.size(); +} + void CNaiveBayes::initialClassCounts(const TDoubleSizePrVec& counts) { for (const auto& count : counts) { m_ClassConditionalDensities.emplace(count.second, CClass{count.first}); @@ -242,7 +233,7 @@ void CNaiveBayes::initialClassCounts(const TDoubleSizePrVec& counts) { } void CNaiveBayes::addTrainingDataPoint(std::size_t label, const TDouble1VecVec& x) { - if (!this->validate(x)) { + if (this->validate(x) == false) { return; } @@ -257,7 +248,7 @@ void CNaiveBayes::addTrainingDataPoint(std::size_t label, const TDouble1VecVec& bool updateCount{false}; for (std::size_t i = 0; i < x.size(); ++i) { - if (x[i].size() > 0) { + if (x[i].empty() == false) { class_.conditionalDensities()[i]->add(x[i]); updateCount = true; } @@ -288,62 +279,74 @@ void CNaiveBayes::propagateForwardsByTime(double time) { } } -CNaiveBayes::TDoubleSizePrVec -CNaiveBayes::highestClassProbabilities(std::size_t n, const TDouble1VecVec& x) const { - TDoubleSizePrVec p(this->classProbabilities(x)); +CNaiveBayes::TDoubleSizePrVecDoublePr +CNaiveBayes::highestClassProbabilities(std::size_t n, + const TDouble1VecVec& x, + const TFeatureWeightProvider& weightProvider) const { + auto[p, minFeatureWeight] = this->classProbabilities(x, weightProvider); n = std::min(n, p.size());
std::sort(p.begin(), p.begin() + n, std::greater<>()); - return TDoubleSizePrVec{p.begin(), p.begin() + n}; + return {TDoubleSizePrVec{p.begin(), p.begin() + n}, minFeatureWeight}; } -double CNaiveBayes::classProbability(std::size_t label, const TDouble1VecVec& x) const { - TDoubleSizePrVec p(this->classProbabilities(x)); +CNaiveBayes::TDoubleDoublePr +CNaiveBayes::classProbability(std::size_t label, + const TDouble1VecVec& x, + const TFeatureWeightProvider& weightProvider) const { + auto[p, minFeatureWeight] = this->classProbabilities(x, weightProvider); auto i = std::find_if(p.begin(), p.end(), [label](const TDoubleSizePr& p_) { return p_.second == label; }); - return i == p.end() ? 0.0 : i->first; + return {i == p.end() ? 0.0 : i->first, minFeatureWeight}; } -CNaiveBayes::TDoubleSizePrVec CNaiveBayes::classProbabilities(const TDouble1VecVec& x) const { - if (!this->validate(x)) { - return {}; +CNaiveBayes::TDoubleSizePrVecDoublePr +CNaiveBayes::classProbabilities(const TDouble1VecVec& x, + const TFeatureWeightProvider& weightProvider) const { + if (this->validate(x) == false) { + return {{}, 0.0}; } if (m_ClassConditionalDensities.empty()) { LOG_ERROR(<< "Trying to compute class probabilities without supplying training data"); - return {{}, 0.0}; + return {{}, 0.0}; } using TDoubleVec = std::vector<double>; - using TMaxAccumulator = CBasicStatistics::SMax<double>::TAccumulator; TDoubleSizePrVec p; p.reserve(m_ClassConditionalDensities.size()); for (const auto& class_ : m_ClassConditionalDensities) { p.emplace_back(CTools::fastLog(class_.second.count()), class_.first); } + double minFeatureWeight{1.0}; TDoubleVec logLikelihoods; for (std::size_t i = 0; i < x.size(); ++i) { - if (x[i].size() > 0) { - TMaxAccumulator maxLogLikelihood; + if (x[i].empty() == false) { + auto& featureWeight = weightProvider(); logLikelihoods.clear(); for (const auto& class_ : m_ClassConditionalDensities) { const auto& density = class_.second.conditionalDensities()[i]; double logLikelihood{density->logValue(x[i])}; double logMaximumLikelihood{density->logMaximumValue()}; - maxLogLikelihood.add(logLikelihood - logMaximumLikelihood); logLikelihoods.push_back(logLikelihood); + featureWeight.add(class_.first, logLikelihood - logMaximumLikelihood); } - double weight{1.0}; - if (m_MinMaxLogLikelihoodToUseFeature) { - weight = CTools::logisticFunction( - (maxLogLikelihood[0] - *m_MinMaxLogLikelihoodToUseFeature) / - std::fabs(*m_MinMaxLogLikelihoodToUseFeature), - 0.1); - } + + // We compute the class c_i probability using + // + // p(c_i | x) = exp(sum_j{w_j * log(L(x_j | c_i))}) / Z * p(c_i). + // + // Any feature whose weight w_j < 1 has its significance dropped in class + // selection; effectively we use the w_j'th root of the likelihood, + // which tends to 1 for all values if w_j is small enough. This can be + // used to ignore features for which x is in the extreme tails of the + // class conditional distribution.
+ double featureWeight_{featureWeight.calculate()}; for (std::size_t j = 0; j < logLikelihoods.size(); ++j) { - p[j].first += weight * logLikelihoods[j]; + p[j].first += featureWeight_ * logLikelihoods[j]; } + minFeatureWeight = std::min(minFeatureWeight, featureWeight_); } } @@ -357,7 +360,7 @@ CNaiveBayes::TDoubleSizePrVec CNaiveBayes::classProbabilities(const TDouble1VecV pc.first /= Z; } - return p; + return {std::move(p), minFeatureWeight}; } void CNaiveBayes::debugMemoryUsage(const core::CMemoryUsage::TMemoryUsagePtr& mem) const { @@ -372,7 +375,6 @@ std::size_t CNaiveBayes::memoryUsage() const { } std::uint64_t CNaiveBayes::checksum(std::uint64_t seed) const { - CChecksum::calculate(seed, m_MinMaxLogLikelihoodToUseFeature); CChecksum::calculate(seed, m_DecayRate); CChecksum::calculate(seed, m_Exemplar); return CChecksum::calculate(seed, m_ClassConditionalDensities); @@ -394,7 +396,7 @@ std::string CNaiveBayes::print() const { bool CNaiveBayes::validate(const TDouble1VecVec& x) const { auto class_ = m_ClassConditionalDensities.begin(); if (class_ != m_ClassConditionalDensities.end() && - class_->second.conditionalDensities().size() > 0 && + class_->second.conditionalDensities().empty() == false && class_->second.conditionalDensities().size() != x.size()) { LOG_ERROR(<< "Unexpected feature vector: " << x); return false; @@ -431,7 +433,7 @@ bool CNaiveBayes::CClass::acceptRestoreTraverser(const SDistributionRestoreParam void CNaiveBayes::CClass::acceptPersistInserter(core::CStatePersistInserter& inserter) const { inserter.insertValue(COUNT_TAG, m_Count, core::CIEEE754::E_SinglePrecision); for (const auto& density : m_ConditionalDensities) { - if (dynamic_cast<const CNaiveBayesFeatureDensityFromPrior*>(density.get())) { + if (dynamic_cast<const CNaiveBayesFeatureDensityFromPrior*>(density.get()) != nullptr) { inserter.insertLevel(CONDITIONAL_DENSITY_FROM_PRIOR_TAG, [&density](auto& inserter_) { density->acceptPersistInserter(inserter_);
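To make the weighting arithmetic concrete, the following self-contained sketch (illustrative, not part of this patch) reproduces the posterior computation above for two equally likely classes and a single feature; the production code additionally works with log class counts and, like here, normalises by Z at the end.

    #include <cmath>
    #include <iostream>
    #include <vector>

    // p(c_i | x) is proportional to p(c_i) * exp(w * log L(x | c_i)). With
    // w = 1 this is plain Naive Bayes; as w -> 0 the posterior tends to the
    // class priors, which is the behaviour testExtrapolation below asserts.
    int main() {
        std::vector<double> priors{0.5, 0.5};
        std::vector<double> logLikelihoods{-2.0, -40.0}; // x is implausible for class 1.
        for (double w : {1.0, 0.5, 1e-3}) {
            std::vector<double> p(2);
            double Z{0.0};
            for (std::size_t i = 0; i < 2; ++i) {
                p[i] = priors[i] * std::exp(w * logLikelihoods[i]);
                Z += p[i];
            }
            // w = 1 -> {~1, ~3e-17}; w = 1e-3 -> {~0.51, ~0.49}.
            std::cout << "w = " << w << ": p = {" << p[0] / Z << ", " << p[1] / Z << "}\n";
        }
        return 0;
    }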
diff --git a/lib/maths/common/unittest/CLbfgsTest.cc b/lib/maths/common/unittest/CLbfgsTest.cc index f7de9827e2..38076718c4 100644 --- a/lib/maths/common/unittest/CLbfgsTest.cc +++ b/lib/maths/common/unittest/CLbfgsTest.cc @@ -228,12 +228,12 @@ BOOST_AUTO_TEST_CASE(testConstrainedMinimize) { std::tie(x, fx) = lbfgs.constrainedMinimize(f, g, a, b, x0, 0.2); BOOST_REQUIRE_EQUAL(fx, static_cast<double>(f(x))); - BOOST_REQUIRE_CLOSE_ABSOLUTE(static_cast<double>(f(xmin)), fx, 1e-3); + BOOST_REQUIRE_CLOSE_ABSOLUTE(static_cast<double>(f(xmin)), fx, 5e-3); ferror += std::fabs(fx - f(xmin)) / 100.0; } - BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, ferror, 1e-5); + BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, ferror, 5e-5); } BOOST_AUTO_TEST_CASE(testMinimizeWithVerySmallGradient) { diff --git a/lib/maths/common/unittest/CNaiveBayesTest.cc b/lib/maths/common/unittest/CNaiveBayesTest.cc index 535ff1e3c7..280e87c467 100644 --- a/lib/maths/common/unittest/CNaiveBayesTest.cc +++ b/lib/maths/common/unittest/CNaiveBayesTest.cc @@ -29,11 +29,13 @@ #include #include +namespace { BOOST_AUTO_TEST_SUITE(CNaiveBayesTest) using namespace ml; using TDoubleVec = std::vector<double>; +using TDoubleVecVec = std::vector<TDoubleVec>; using TDouble1Vec = core::CSmallVector<double, 1>; using TDouble1VecVec = std::vector<TDouble1Vec>; using TDoubleSizePr = std::pair<double, std::size_t>; @@ -41,6 +43,12 @@ using TDoubleSizePrVec = std::vector<TDoubleSizePr>; using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean<double>::TAccumulator; using TMeanVarAccumulator = maths::common::CBasicStatistics::SSampleMeanVar<double>::TAccumulator; +class CTestFeatureWeight : public maths::common::CNaiveBayesFeatureWeight { +public: + void add(std::size_t, double) override {} + double calculate() const override { return 1e-3; } +}; + BOOST_AUTO_TEST_CASE(testClassification) { // We'll test classification using Gaussian naive Bayes. We // test: @@ -56,7 +64,7 @@ BOOST_AUTO_TEST_CASE(testClassification) { test::CRandomNumbers rng; - TDoubleVec trainingData[4]; + TDoubleVecVec trainingData(4); rng.generateNormalSamples(0.0, 12.0, 100, trainingData[0]); rng.generateNormalSamples(10.0, 16.0, 100, trainingData[1]); rng.generateNormalSamples(3.0, 14.0, 200, trainingData[2]); @@ -92,7 +100,7 @@ BOOST_AUTO_TEST_CASE(testClassification) { // - P(1) = (initialCount + 100) / (2*initialCount + 300) // - P(2) = (initialCount + 200) / (2*initialCount + 300) - TDoubleSizePrVec probabilities(nb.highestClassProbabilities(2, {{}, {}})); + auto[probabilities, confidence](nb.highestClassProbabilities(2, {{}, {}})); double P1{(initialCount + 100.0) / (2.0 * initialCount + 300.0)}; double P2{(initialCount + 200.0) / (2.0 * initialCount + 300.0)}; @@ -157,19 +165,22 @@ BOOST_AUTO_TEST_CASE(testClassification) { maths::common::CTools::safePdf(class1[1], xtest[i + 1])}; double p2{P2 * maths::common::CTools::safePdf(class2[0], xtest[i]) * maths::common::CTools::safePdf(class2[1], xtest[i + 1])}; - probabilities = nb.highestClassProbabilities(2, {{xtest[i]}, {xtest[i + 1]}}); + std::tie(probabilities, confidence) = + nb.highestClassProbabilities(2, {{xtest[i]}, {xtest[i + 1]}}); test(p1, p2, probabilities, meanErrors[0]); // Miss out the first feature value. p1 = P1 * maths::common::CTools::safePdf(class1[1], xtest[i + 1]); p2 = P2 * maths::common::CTools::safePdf(class2[1], xtest[i + 1]); - probabilities = nb.highestClassProbabilities(2, {{}, {xtest[i + 1]}}); + std::tie(probabilities, confidence) = + nb.highestClassProbabilities(2, {{}, {xtest[i + 1]}}); test(p1, p2, probabilities, meanErrors[1]); // Miss out the second feature value.
p1 = P1 * maths::common::CTools::safePdf(class1[0], xtest[i]); p2 = P2 * maths::common::CTools::safePdf(class2[0], xtest[i]); - probabilities = nb.highestClassProbabilities(2, {{xtest[i]}, {}}); + std::tie(probabilities, confidence) = + nb.highestClassProbabilities(2, {{xtest[i]}, {}}); test(p1, p2, probabilities, meanErrors[2]); } @@ -194,7 +205,7 @@ BOOST_AUTO_TEST_CASE(testUninitialized) { maths::common::CNaiveBayes nb{maths::common::CNaiveBayes{ maths::common::CNaiveBayesFeatureDensityFromPrior(normal), 0.05}}; - TDoubleVec trainingData[2]; + TDoubleVecVec trainingData(2); for (std::size_t i = 0; i < 2; ++i) { BOOST_REQUIRE_EQUAL(false, nb.initialized()); @@ -225,7 +236,7 @@ BOOST_AUTO_TEST_CASE(testPropagationByTime) { maths::common::CNaiveBayes{ maths::common::CNaiveBayesFeatureDensityFromPrior(normal), 0.05}}; - TDoubleVec trainingData[4]; + TDoubleVecVec trainingData(4); for (std::size_t i = 0; i < 1000; ++i) { double x{static_cast<double>(i)}; rng.generateNormalSamples(0.02 * x - 14.0, 16.0, 1, trainingData[0]); @@ -248,8 +259,8 @@ { TDoubleSizePrVec probabilities[]{ - nb[0].highestClassProbabilities(2, {{-10.0}, {-10.0}}), - nb[1].highestClassProbabilities(2, {{-10.0}, {-10.0}})}; + nb[0].highestClassProbabilities(2, {{-10.0}, {-10.0}}).first, + nb[1].highestClassProbabilities(2, {{-10.0}, {-10.0}}).first}; LOG_DEBUG(<< "Aged class probabilities = " << probabilities[0]); LOG_DEBUG(<< "Class probabilities = " << probabilities[1]); BOOST_REQUIRE_EQUAL(2, probabilities[0][0].second); @@ -259,8 +270,8 @@ } { TDoubleSizePrVec probabilities[]{ - nb[0].highestClassProbabilities(2, {{10.0}, {10.0}}), - nb[1].highestClassProbabilities(2, {{10.0}, {10.0}})}; + nb[0].highestClassProbabilities(2, {{10.0}, {10.0}}).first, + nb[1].highestClassProbabilities(2, {{10.0}, {10.0}}).first}; LOG_DEBUG(<< "Aged class probabilities = " << probabilities[0]); LOG_DEBUG(<< "Class probabilities = " << probabilities[1]); BOOST_REQUIRE_EQUAL(1, probabilities[0][0].second); @@ -270,6 +281,42 @@ } } +BOOST_AUTO_TEST_CASE(testExtrapolation) { + // Test that: + // 1. Applying low feature weights means the conditional probabilities + // of all classes tend to 1 / "number of classes", + // 2. The number returned as the confidence in the probabilities + // is the minimum feature weight.
+ + test::CRandomNumbers rng; + + TDoubleVecVec trainingData(2); + rng.generateNormalSamples(0.0, 12.0, 100, trainingData[0]); + rng.generateNormalSamples(10.0, 16.0, 100, trainingData[1]); + + maths::common::CNormalMeanPrecConjugate normal{ + maths::common::CNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData)}; + maths::common::CNaiveBayes nb{maths::common::CNaiveBayesFeatureDensityFromPrior(normal)}; + + for (auto x : trainingData[0]) { + nb.addTrainingDataPoint(0, {{x}}); + } + for (auto x : trainingData[1]) { + nb.addTrainingDataPoint(1, {{x}}); + } + + auto weightProvider = [weight = CTestFeatureWeight()]() mutable->maths::common::CNaiveBayesFeatureWeight& { + return weight; + }; + auto[probabilities, confidence] = nb.classProbabilities({{30.0}}, weightProvider); + LOG_DEBUG(<< "p = " << probabilities << ", confidence = " << confidence); + + BOOST_REQUIRE_EQUAL(2, probabilities.size()); + BOOST_REQUIRE_CLOSE_ABSOLUTE(0.5, probabilities[0].first, 1e-2); + BOOST_REQUIRE_CLOSE_ABSOLUTE(0.5, probabilities[1].first, 1e-2); + BOOST_REQUIRE_EQUAL(1e-3, confidence); +} + BOOST_AUTO_TEST_CASE(testMemoryUsage) { // Check invariants. @@ -313,7 +360,7 @@ BOOST_AUTO_TEST_CASE(testMemoryUsage) { BOOST_AUTO_TEST_CASE(testPersist) { test::CRandomNumbers rng; - TDoubleVec trainingData[4]; + TDoubleVecVec trainingData(4); rng.generateNormalSamples(0.0, 12.0, 100, trainingData[0]); rng.generateNormalSamples(10.0, 16.0, 100, trainingData[1]); rng.generateNormalSamples(3.0, 14.0, 200, trainingData[2]); @@ -364,3 +411,4 @@ } BOOST_AUTO_TEST_SUITE_END() +} diff --git a/lib/maths/time_series/CTrendComponent.cc b/lib/maths/time_series/CTrendComponent.cc index 1b7f676305..78f9a27216 100644 --- a/lib/maths/time_series/CTrendComponent.cc +++ b/lib/maths/time_series/CTrendComponent.cc @@ -52,6 +52,27 @@ const core_t::TTime UNSET_TIME{0}; const std::size_t NO_CHANGE_LABEL{0}; const std::size_t LEVEL_CHANGE_LABEL{1}; +class CChangeForecastFeatureWeight : public common::CNaiveBayesFeatureWeight { +public: + void add(std::size_t class_, double logLikelihood) override { + if (class_ == NO_CHANGE_LABEL) { + m_LogLikelihood = logLikelihood; + } + } + + double calculate() const override { + // Downweight features for which we don't have sufficient examples + // of the time series not changing. + // Note that m_LogLikelihood = -0.5 * (x - m)^2 / sigma^2, so it equals + // -4.5 when the feature value is at the 3 sigma point of the + // conditional distribution. + return common::CTools::logisticFunction((4.5 + m_LogLikelihood) / 4.5, 0.1); + } + +private: + double m_LogLikelihood{0.0}; +}; + //! Get the desired weight for the regression model.
double modelWeight(double targetDecayRate, double modelDecayRate) { return targetDecayRate == modelDecayRate @@ -90,7 +111,7 @@ common::CNaiveBayesFeatureDensityFromPrior naiveBayesExemplar(double decayRate) common::CNaiveBayes initialProbabilityOfChangeModel(double decayRate) { return common::CNaiveBayes{naiveBayesExemplar(decayRate), - TIME_SCALES[NUMBER_MODELS - 1] * decayRate, -20.0}; + TIME_SCALES[NUMBER_MODELS - 1] * decayRate}; } common::CNormalMeanPrecConjugate initialMagnitudeOfChangeModel(double decayRate) { @@ -294,11 +315,11 @@ void CTrendComponent::shiftLevel(double shift, m_ProbabilityOfLevelChangeModel.addTrainingDataPoint(LEVEL_CHANGE_LABEL, {{dt}, {value}}); } + m_TimeOfLastLevelChange = time; for (std::size_t i = segments[last]; i < values.size(); ++i, time += bucketLength) { this->dontShiftLevel(time, common::CBasicStatistics::mean(values[i])); } m_MagnitudeOfLevelChangeModel.addSamples({magnitude}, maths_t::CUnitWeights::SINGLE_UNIT); - m_TimeOfLastLevelChange = time; } void CTrendComponent::dontShiftLevel(core_t::TTime time, double value) { @@ -786,14 +807,26 @@ CTrendComponent::TDouble3Vec CTrendComponent::CForecastLevel::forecast(core_t::TTime time, double prediction, double confidence) { TDouble3Vec result{0.0, 0.0, 0.0}; - if (m_Probability.initialized()) { + if (m_Probability.initialized() && m_Probability.numberClasses() > 1) { common::CSampling::uniformSample(0.0, 1.0, m_Levels.size(), m_Uniform01); bool reorder{false}; + auto weightProvider = [weight = + CChangeForecastFeatureWeight{}]() mutable->common::CNaiveBayesFeatureWeight& { + weight = CChangeForecastFeatureWeight{}; + return weight; + }; for (std::size_t i = 0; i < m_Levels.size(); ++i) { double dt{static_cast<double>(time - m_TimesOfLastChange[i])}; double x{m_Levels[i] + prediction}; - double p{m_Probability.classProbability(LEVEL_CHANGE_LABEL, {{dt}, {x}})}; - m_ProbabilitiesOfChange[i] = std::max(m_ProbabilitiesOfChange[i], p); + auto[p, pConfidence] = m_Probability.classProbability( + LEVEL_CHANGE_LABEL, {{dt}, {x}}, weightProvider); + // Here we decide whether to increase the probability that we should + // have seen a step change for this rollout. If we are no longer + // confident in the predicted probability we do not predict changes, + // following the principle of least surprise. + if (pConfidence > 0.5) { + m_ProbabilitiesOfChange[i] = std::max(m_ProbabilitiesOfChange[i], p); + } if (m_Uniform01[i] < m_ProbabilitiesOfChange[i]) { double stepMean{m_Magnitude.marginalLikelihoodMean()}; double stepVariance{m_Magnitude.marginalLikelihoodVariance()};
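Plugging numbers into this weight makes the gating threshold transparent. A small sketch (illustrative, not part of this patch), assuming CTools::logisticFunction(x, width) computes 1 / (1 + exp(-x / width)):

    #include <cmath>
    #include <iostream>

    // Weight CChangeForecastFeatureWeight assigns when the "no change"
    // log-likelihood corresponds to a feature value s sigmas from the mean:
    // logLikelihood = -0.5 * s^2, weight = logistic((4.5 + logLikelihood) / 4.5, 0.1).
    double changeForecastWeight(double sigmas) {
        double logLikelihood{-0.5 * sigmas * sigmas};
        return 1.0 / (1.0 + std::exp(-((4.5 + logLikelihood) / 4.5) / 0.1));
    }

    int main() {
        // 0 -> ~1.0, 2 -> ~0.996, 3 -> 0.5, 4 -> ~4e-4: the weight collapses
        // just past 3 sigma, so the pConfidence > 0.5 check above ignores the
        // level change probability whenever a feature is extrapolating.
        for (double sigmas : {0.0, 1.0, 2.0, 3.0, 4.0}) {
            std::cout << sigmas << " sigma -> " << changeForecastWeight(sigmas) << '\n';
        }
        return 0;
    }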
diff --git a/lib/maths/time_series/unittest/CTrendComponentTest.cc b/lib/maths/time_series/unittest/CTrendComponentTest.cc index 618e5104e8..e9278b0ec5 100644 --- a/lib/maths/time_series/unittest/CTrendComponentTest.cc +++ b/lib/maths/time_series/unittest/CTrendComponentTest.cc @@ -26,15 +26,15 @@ #include +#include #include #include +namespace { BOOST_AUTO_TEST_SUITE(CTrendComponentTest) using namespace ml; -namespace { - using TDoubleVec = std::vector<double>; using TDoubleVecVec = std::vector<TDoubleVec>; using TDouble1Vec = core::CSmallVector<double, 1>; @@ -223,7 +223,6 @@ auto forecastErrors(ITR actual, return std::make_pair(maths::common::CBasicStatistics::mean(meanError), maths::common::CBasicStatistics::mean(meanErrorAt95)); } -} BOOST_AUTO_TEST_CASE(testValueAndVariance) { // Check that the prediction bias is small in the long run @@ -360,6 +359,71 @@ BOOST_AUTO_TEST_CASE(testForecast) { BOOST_TEST_REQUIRE(errorAt95 < 0.001); } +BOOST_AUTO_TEST_CASE(testStepChangeForecasting) { + // A randomized test checking that forecasts of time series with step + // changes don't explode. We previously sometimes ran into issues when we + // extrapolated the feature distributions we use to predict steps. + // In such cases we would predict far too many steps, leading to + // overly wide forecast bounds and unrealistic predictions. + + using TSizeVec = std::vector<std::size_t>; + + test::CRandomNumbers rng; + double interval{20.0}; + + maths::time_series::CTrendComponent::TFloatMeanAccumulatorVec values; + + for (std::size_t t = 0; t < 100; ++t) { + TSizeVec changePoints; + rng.generateUniformSamples(0, 1000, 6, changePoints); + std::sort(changePoints.begin(), changePoints.end()); + changePoints.push_back(1000); + TDoubleVec levels; + rng.generateUniformSamples(-0.5 * interval, 0.5 * interval, 7, levels); + + maths::time_series::CTrendComponent trendModel{0.012}; + + TDoubleVec noise; + auto level = levels.begin(); + auto changePoint = changePoints.begin(); + core_t::TTime time{1672531200}; + for (std::size_t i = 0; i < 1000; ++i, time += BUCKET_LENGTH) { + rng.generateNormalSamples(0.0, 0.25, 1, noise); + double value{*level + noise[0]}; + trendModel.add(time, value); + values.emplace_back().add(value); + if (i == *changePoint) { + ++level; + ++changePoint; + double shift{*level - *(level - 1)}; + core_t::TTime valuesStartTime{ time - static_cast<core_t::TTime>(values.size()) * BUCKET_LENGTH}; + TSizeVec segments{0, *changePoint - *(changePoint - 1) - 1, *changePoint - *(changePoint - 1)}; + TDoubleVec shifts{0.0, *level - *(level - 1)}; + trendModel.shiftLevel(shift, valuesStartTime, BUCKET_LENGTH, values, segments, shifts); + values.clear(); + } else { + trendModel.dontShiftLevel(time, value); + } + } + + TDouble3VecVec forecast; + trendModel.forecast(time, time + 200 * BUCKET_LENGTH, BUCKET_LENGTH, 90.0, false, + [](core_t::TTime) { return TDouble3Vec(3, 0.0); }, + [&forecast](core_t::TTime, const TDouble3Vec& value) { + forecast.push_back(value); + }); + + // Check that the prediction is in the switching interval and + // the forecast confidence interval isn't too wide.
+ BOOST_TEST_REQUIRE(forecast.back()[1] > -0.75 * interval); + BOOST_TEST_REQUIRE(forecast.back()[1] < 0.75 * interval); + BOOST_TEST_REQUIRE(forecast.back()[2] - forecast.back()[0] < 3.5 * interval); + } +} + BOOST_AUTO_TEST_CASE(testPersist) { // Check that serialization is idempotent. @@ -393,9 +457,9 @@ BOOST_AUTO_TEST_CASE(testPersist) { maths::common::SDistributionRestoreParams params{maths_t::E_ContinuousData, 0.1}; maths::time_series::CTrendComponent restoredComponent{0.1}; - traverser.traverseSubLevel( - std::bind(&maths::time_series::CTrendComponent::acceptRestoreTraverser, - &restoredComponent, std::cref(params), std::placeholders::_1)); + traverser.traverseSubLevel([&](auto& traverser_) { + return restoredComponent.acceptRestoreTraverser(params, traverser_); + }); BOOST_REQUIRE_EQUAL(origComponent.checksum(), restoredComponent.checksum()); @@ -440,9 +504,9 @@ BOOST_AUTO_TEST_CASE(testUpgradeTo7p1) { core::CRapidXmlParser parser; BOOST_TEST_REQUIRE(parser.parseStringIgnoreCdata(xml)); core::CRapidXmlStateRestoreTraverser traverser(parser); - traverser.traverseSubLevel( - std::bind(&maths::time_series::CTrendComponent::acceptRestoreTraverser, - &component, std::cref(params), std::placeholders::_1)); + traverser.traverseSubLevel([&](auto& traverser_) { + return component.acceptRestoreTraverser(params, traverser_); + }); test::CRandomNumbers rng; @@ -455,3 +519,4 @@ BOOST_AUTO_TEST_CASE(testUpgradeTo7p1) { } BOOST_AUTO_TEST_SUITE_END() +} diff --git a/lib/model/unittest/CMetricDataGathererTest.cc b/lib/model/unittest/CMetricDataGathererTest.cc index 50a6b2ffdd..2a6609ec88 100644 --- a/lib/model/unittest/CMetricDataGathererTest.cc +++ b/lib/model/unittest/CMetricDataGathererTest.cc @@ -1476,13 +1476,13 @@ BOOST_FIXTURE_TEST_CASE(testInfluenceStatistics, CTestFixture) { TStrVec influencerNames(std::begin(influencerNames_), std::end(influencerNames_)); CDataGatherer gatherer(model_t::E_Metric, model_t::E_None, params, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, - influencerNames, KEY, features, startTime, 2u); + influencerNames, KEY, features, startTime, 2); addPerson("p1", gatherer, m_ResourceMonitor, influencerNames.size()); addPerson("p2", gatherer, m_ResourceMonitor, influencerNames.size()); core_t::TTime bucketStart = startTime; - for (std::size_t i = 0u, b = 0; i < std::size(data); ++i) { + for (std::size_t i = 0; i < std::size(data); ++i) { if (data[i].get<0>() >= bucketStart + bucketLength) { LOG_DEBUG(<< "*** processing bucket ***"); TFeatureSizeFeatureDataPrVecPrVec featureData; @@ -1520,7 +1520,6 @@ BOOST_FIXTURE_TEST_CASE(testInfluenceStatistics, CTestFixture) { } bucketStart += bucketLength; - ++b; } for (std::size_t pid = 0; pid < gatherer.numberActivePeople(); ++pid) { addArrival(gatherer, m_ResourceMonitor, data[i].get<0>(), diff --git a/lib/model/unittest/CMetricPopulationDataGathererTest.cc b/lib/model/unittest/CMetricPopulationDataGathererTest.cc index c4dce0d474..dea04a2880 100644 --- a/lib/model/unittest/CMetricPopulationDataGathererTest.cc +++ b/lib/model/unittest/CMetricPopulationDataGathererTest.cc @@ -975,10 +975,10 @@ BOOST_FIXTURE_TEST_CASE(testInfluenceStatistics, CTestFixture) { TStrVec influencerNames(std::begin(influencerNames_), std::end(influencerNames_)); CDataGatherer gatherer(model_t::E_PopulationMetric, model_t::E_None, params, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, EMPTY_STRING, - influencerNames, searchKey, features, startTime, 2u); + influencerNames, searchKey, features, startTime, 2); core_t::TTime 
bucketStart = startTime; - for (std::size_t i = 0u, b = 0; i < std::size(data); ++i) { + for (std::size_t i = 0; i < std::size(data); ++i) { if (data[i].s_Time >= bucketStart + bucketLength) { LOG_DEBUG(<< "*** processing bucket ***"); TFeatureSizeSizePrFeatureDataPrVecPrVec featureData; @@ -1016,7 +1016,6 @@ } bucketStart += bucketLength; - ++b; } addArrival(data[i], gatherer, m_ResourceMonitor); } diff --git a/lib/model/unittest/CResourceMonitorTest.cc b/lib/model/unittest/CResourceMonitorTest.cc index fc7931d603..5bcadce756 100644 --- a/lib/model/unittest/CResourceMonitorTest.cc +++ b/lib/model/unittest/CResourceMonitorTest.cc @@ -56,8 +56,6 @@ class CTestFixture { CHierarchicalResults results; std::string pervasive("IShouldNotBeRemoved"); - std::size_t numBuckets = 0; - for (core_t::TTime time = firstTime; time < static_cast<core_t::TTime>(firstTime + bucketLength * buckets); time += (bucketLength / std::max(std::size_t(1), newPeoplePerBucket))) { @@ -66,7 +64,6 @@ detector.buildResults(bucketStart, bucketStart + bucketLength, results); monitor.pruneIfRequired(bucketStart); monitor.updateMoments(monitor.totalMemory(), bucketStart, bucketLength); - ++numBuckets; newBucket = true; }