From ba31ad1abaaee0a12dc6610df48ab9180b539f42 Mon Sep 17 00:00:00 2001 From: Jonas Rembser Date: Sat, 6 Aug 2022 17:16:50 +0200 Subject: [PATCH 1/2] [RF] Don't skip zero-weight events by default in new BatchMode The new RooFit batchMode skipped zero-weight events to optimize the likelihood calculation. However, this should not be done in general, because it is unexpected to users is the output of batched computations is not aligned with the original dataset. --- .../res/RooFit/BatchModeDataHelpers.h | 7 ++- roofit/roofitcore/res/RooFitDriver.h | 2 +- .../roofitcore/src/BatchModeDataHelpers.cxx | 51 ++++++++++--------- roofit/roofitcore/src/BatchModeHelpers.cxx | 16 +++--- roofit/roofitcore/src/RooFitDriver.cxx | 5 +- 5 files changed, 43 insertions(+), 38 deletions(-) diff --git a/roofit/roofitcore/res/RooFit/BatchModeDataHelpers.h b/roofit/roofitcore/res/RooFit/BatchModeDataHelpers.h index 38dbebe345228..c4ff48bc86886 100644 --- a/roofit/roofitcore/res/RooFit/BatchModeDataHelpers.h +++ b/roofit/roofitcore/res/RooFit/BatchModeDataHelpers.h @@ -30,10 +30,9 @@ class TNamed; namespace RooFit { namespace BatchModeDataHelpers { -std::map> getDataSpans(RooAbsData const &data, - std::string_view rangeName, - RooAbsCategory const *indexCat, - std::stack> &buffers); +std::map> +getDataSpans(RooAbsData const &data, std::string_view rangeName, RooAbsCategory const *indexCat, + std::stack> &buffers, bool skipZeroWeights); } // namespace BatchModeDataHelpers } // namespace RooFit diff --git a/roofit/roofitcore/res/RooFitDriver.h b/roofit/roofitcore/res/RooFitDriver.h index 2485ac0080041..83e685b1ed350 100644 --- a/roofit/roofitcore/res/RooFitDriver.h +++ b/roofit/roofitcore/res/RooFitDriver.h @@ -50,7 +50,7 @@ class RooFitDriver { RooFit::BatchModeOption batchMode = RooFit::BatchModeOption::Cpu); void setData(RooAbsData const &data, std::string_view rangeName = "", - RooAbsCategory const *indexCatForSplitting = nullptr); + RooAbsCategory const *indexCatForSplitting = nullptr, bool skipZeroWeights = false); void setData(DataSpansMap const &dataSpans); ~RooFitDriver(); diff --git a/roofit/roofitcore/src/BatchModeDataHelpers.cxx b/roofit/roofitcore/src/BatchModeDataHelpers.cxx index b3f0a0e16a80a..3bc1f7c818260 100644 --- a/roofit/roofitcore/src/BatchModeDataHelpers.cxx +++ b/roofit/roofitcore/src/BatchModeDataHelpers.cxx @@ -22,8 +22,8 @@ namespace { -void splitByCategory(std::map> &dataSpans, RooAbsCategory const &category, - std::stack> &buffers) +void splitByCategory(std::map> &dataSpans, + RooAbsCategory const &category, std::stack> &buffers) { std::stack> oldBuffers; std::swap(buffers, oldBuffers); @@ -71,28 +71,33 @@ void splitByCategory(std::map> &d } // namespace //////////////////////////////////////////////////////////////////////////////// -// Extract all content from a RooFit datasets as a map of spans. -// Spans with the weights and squared weights will be also stored in the map, -// keyed with the names `_weight` and the `_weight_sumW2`. If the dataset is -// unweighted, these weight spans will only contain the single value `1.0`. -// Entries with zero weight will be skipped. -// -// \return A `std::map` with spans keyed to name pointers. -// \param[in] data The input dataset. -// \param[in] rangeName Select only entries from the data in a given range -// (empty string for no range). -// \param[in] indexCat If not `nullptr`, each span is spit up by this category, -// with the new names prefixed by the category component name -// surrounded by underscores. For example, if you have a category -// with `signal` and `control` samples, the span for a variable `x` -// will be split in two spans `_signal_x` and `_control_x`. -// \param[in] buffers Pass here an empty stack of `double` vectors, which will -// be used as memory for the data if the memory in the dataset -// object can't be used directly (e.g. because you used the range -// selection or the splitting by categories). +/// Extract all content from a RooFit datasets as a map of spans. +/// Spans with the weights and squared weights will be also stored in the map, +/// keyed with the names `_weight` and the `_weight_sumW2`. If the dataset is +/// unweighted, these weight spans will only contain the single value `1.0`. +/// Entries with zero weight will be skipped. +/// +/// \return A `std::map` with spans keyed to name pointers. +/// \param[in] data The input dataset. +/// \param[in] rangeName Select only entries from the data in a given range +/// (empty string for no range). +/// \param[in] indexCat If not `nullptr`, each span is spit up by this category, +/// with the new names prefixed by the category component name +/// surrounded by underscores. For example, if you have a category +/// with `signal` and `control` samples, the span for a variable `x` +/// will be split in two spans `_signal_x` and `_control_x`. +/// \param[in] buffers Pass here an empty stack of `double` vectors, which will +/// be used as memory for the data if the memory in the dataset +/// object can't be used directly (e.g. because you used the range +/// selection or the splitting by categories). +/// \param[in] skipZeroWeights Skip entries with zero weight when filling the +/// data spans. Be very careful with enabling it, because the user +/// might not expect that the batch results are not aligned with the +/// original dataset anymore! std::map> RooFit::BatchModeDataHelpers::getDataSpans(RooAbsData const &data, std::string_view rangeName, - RooAbsCategory const *indexCat, std::stack> &buffers) + RooAbsCategory const *indexCat, std::stack> &buffers, + bool skipZeroWeights) { std::map> dataSpans; // output variable @@ -137,7 +142,7 @@ RooFit::BatchModeDataHelpers::getDataSpans(RooAbsData const &data, std::string_v buffer.reserve(nEvents); bufferSumW2.reserve(nEvents); for (std::size_t i = 0; i < nEvents; ++i) { - if (weight[i] != 0) { + if (!skipZeroWeights || weight[i] != 0) { buffer.push_back(weight[i]); bufferSumW2.push_back(weightSumW2[i]); ++nNonZeroWeight; diff --git a/roofit/roofitcore/src/BatchModeHelpers.cxx b/roofit/roofitcore/src/BatchModeHelpers.cxx index 2530e0806733b..14246cd1bfb05 100644 --- a/roofit/roofitcore/src/BatchModeHelpers.cxx +++ b/roofit/roofitcore/src/BatchModeHelpers.cxx @@ -116,17 +116,17 @@ RooFit::BatchModeHelpers::createNLL(RooAbsPdf &pdf, RooAbsData &data, std::uniqu nll->addOwnedComponents(std::move(binSamplingPdfs)); nll->addOwnedComponents(std::move(nllTerms)); + RooAbsCategory const *indexCatForSplitting = nullptr; if (auto simPdf = dynamic_cast(&finalPdf)) { RooArgSet parameters; pdf.getParameters(data.get(), parameters); nll->recursiveRedirectServers(parameters); - driver = std::make_unique(*nll, observables, batchMode); - driver->setData(data, rangeName, &simPdf->indexCat()); - } else { - driver = std::make_unique(*nll, observables, batchMode); - driver->setData(data, rangeName); + indexCatForSplitting = &simPdf->indexCat(); } + driver = std::make_unique(*nll, observables, batchMode); + driver->setData(data, rangeName, indexCatForSplitting, /*skipZeroWeights=*/true); + // Set the fitrange attribute so that RooPlot can automatically plot the fitting range by default if (!rangeName.empty()) { @@ -219,11 +219,11 @@ class RooAbsRealWrapper final : public RooAbsReal { double defaultErrorLevel() const override { return _driver->topNode().defaultErrorLevel(); } - bool getParameters(const RooArgSet * observables, RooArgSet &outputSet, - bool /*stripDisconnected=true*/) const override + bool + getParameters(const RooArgSet *observables, RooArgSet &outputSet, bool /*stripDisconnected=true*/) const override { outputSet.add(_parameters); - if(observables) { + if (observables) { outputSet.remove(*observables); } return false; diff --git a/roofit/roofitcore/src/RooFitDriver.cxx b/roofit/roofitcore/src/RooFitDriver.cxx index 07086ed4007b1..4f51e47f8bb0b 100644 --- a/roofit/roofitcore/src/RooFitDriver.cxx +++ b/roofit/roofitcore/src/RooFitDriver.cxx @@ -186,9 +186,10 @@ RooFitDriver::RooFitDriver(const RooAbsReal &absReal, RooArgSet const &normSet, } void RooFitDriver::setData(RooAbsData const &data, std::string_view rangeName, - RooAbsCategory const *indexCatForSplitting) + RooAbsCategory const *indexCatForSplitting, bool skipZeroWeights) { - setData(RooFit::BatchModeDataHelpers::getDataSpans(data, rangeName, indexCatForSplitting, _vectorBuffers)); + setData(RooFit::BatchModeDataHelpers::getDataSpans(data, rangeName, indexCatForSplitting, _vectorBuffers, + skipZeroWeights)); } void RooFitDriver::setData(DataSpansMap const &dataSpans) From 4cecb7e9fc4312091d17d393f22bc01ee18f5acf Mon Sep 17 00:00:00 2001 From: Jonas Rembser Date: Sat, 6 Aug 2022 17:22:14 +0200 Subject: [PATCH 2/2] [RF] Ensure norm set args are part of the graph in NormalizationHelpers Add the arg from the actual node list in the computation graph. Like this, we don't accidentally add internal variable clones that the client args returned. Looking this up is fast because of the name pointer hash map optimization. This is done because it will prevent crashes in future RooFit developments that yet have to be commited. --- roofit/roofitcore/src/NormalizationHelpers.cxx | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/roofit/roofitcore/src/NormalizationHelpers.cxx b/roofit/roofitcore/src/NormalizationHelpers.cxx index 9e52c0c571b79..e0d72a5d376cb 100644 --- a/roofit/roofitcore/src/NormalizationHelpers.cxx +++ b/roofit/roofitcore/src/NormalizationHelpers.cxx @@ -170,8 +170,9 @@ void treeNodeServerListAndNormSets(const RooAbsArg &arg, RooAbsCollection &list, } auto differentSet = arg.fillNormSetForServer(normSet, *server); - if (differentSet) + if (differentSet) { differentSet->sort(); + } auto &serverNormSet = differentSet ? *differentSet : normSet; @@ -219,14 +220,17 @@ std::vector> unfoldIntegrals(RooAbsArg const &topNode treeNodeServerListAndNormSets(topNode, nodes, normSetSorted, normSets, checker); // Clean normsets of the variables that the arg does not depend on - // std::unordered_map,bool> dependsResults; for (auto &item : normSets) { if (!item.second || item.second->empty()) continue; auto actualNormSet = new RooArgSet{}; for (auto *narg : *item.second) { if (checker.dependsOn(item.first, narg)) - actualNormSet->add(*narg); + // Add the arg from the actual node list in the computation graph. + // Like this, we don't accidentally add internal variable clones + // that the client args returned. Looking this up is fast because + // of the name pointer hash map optimization. + actualNormSet->add(*nodes.find(*narg)); } delete item.second; item.second = actualNormSet;