Commit

Partitioner for multi-target tree. (#8922)
trivialfis authored Mar 16, 2023
1 parent 26209a4 commit a093770
Showing 8 changed files with 239 additions and 178 deletions.
43 changes: 19 additions & 24 deletions src/common/partition_builder.h
@@ -1,5 +1,5 @@
/*!
* Copyright 2021-2022 by Contributors
/**
* Copyright 2021-2023 by Contributors
* \file row_set.h
* \brief Quick Utility to compute subset of rows
* \author Philip Cho, Tianqi Chen
@@ -10,6 +10,7 @@
#include <xgboost/data.h>

#include <algorithm>
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <utility>
@@ -21,9 +22,7 @@
#include "xgboost/context.h"
#include "xgboost/tree_model.h"

namespace xgboost {
namespace common {

namespace xgboost::common {
// The builder is required for samples partition to left and rights children for set of nodes
// Responsible for:
// 1) Effective memory allocation for intermediate results for multi-thread work
@@ -109,18 +108,17 @@ class PartitionBuilder {
return {nleft_elems, nright_elems};
}

template <typename BinIdxType, bool any_missing, bool any_cat>
void Partition(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
const common::Range1d range,
const bst_bin_t split_cond, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix,
template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
void Partition(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, const bst_bin_t split_cond,
GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix,
const RegTree& tree, const size_t* rid) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
std::size_t nid = nodes[node_in_set].nid;
bst_feature_t fid = tree[nid].SplitIndex();
bool default_left = tree[nid].DefaultLeft();
bst_feature_t fid = tree.SplitIndex(nid);
bool default_left = tree.DefaultLeft(nid);
bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
auto node_cats = tree.NodeCats(nid);
auto const& cut_values = gmat.cut.Values();
@@ -190,10 +188,10 @@ class PartitionBuilder {
* worker, so we go through all the rows and mark the bit vectors on whether the decision is made
* to go right, or if the feature value used for the split is missing.
*/
void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
template <typename ExpandEntry>
void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix,
const RegTree& tree, const size_t* rid,
const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
BitVector* decision_bits, BitVector* missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
std::size_t nid = nodes[node_in_set].nid;
@@ -228,8 +226,8 @@
* @brief Once we've aggregated the decision and missing bits from all the workers, we can then
* use them to partition the rows accordingly.
*/
void PartitionByMask(const size_t node_in_set,
std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
template <typename ExpandEntry>
void PartitionByMask(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const RegTree& tree,
const size_t* rid, BitVector const& decision_bits,
@@ -293,11 +291,11 @@ class PartitionBuilder {
}


size_t GetNLeftElems(int nid) const {
[[nodiscard]] std::size_t GetNLeftElems(int nid) const {
return left_right_nodes_sizes_[nid].first;
}

size_t GetNRightElems(int nid) const {
[[nodiscard]] std::size_t GetNRightElems(int nid) const {
return left_right_nodes_sizes_[nid].second;
}

@@ -349,7 +347,7 @@ class PartitionBuilder {
if (node.node_id < 0) {
return;
}
CHECK(tree[node.node_id].IsLeaf());
CHECK(tree.IsLeaf(node.node_id));
if (node.begin) { // guard for empty node.
size_t ptr_offset = node.end - p_begin;
CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id;
@@ -384,8 +382,5 @@ class PartitionBuilder {
std::vector<std::shared_ptr<BlockInfo>> mem_blocks_;
size_t max_n_tasks_ = 0;
};

} // namespace common
} // namespace xgboost

} // namespace xgboost::common
#endif // XGBOOST_COMMON_PARTITION_BUILDER_H_
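
Note on the partition_builder.h changes above: Partition, MaskRows, and PartitionByMask now take the expand-entry type as a template parameter instead of hard-coding xgboost::tree::CPUExpandEntry, and node properties are read through RegTree accessors (tree.SplitIndex(nid)) rather than the operator[] proxy. The only thing the partitioner needs from an entry is its nid member, which is what lets a multi-target entry type slot in. A minimal sketch of that duck-typed contract, using hypothetical stand-in structs rather than the real XGBoost classes:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-ins for single- and multi-target expand entries: the
// partitioner's template only requires an integral `nid` member.
struct SingleTargetEntry { std::int32_t nid; float loss_chg; };
struct MultiTargetEntry { std::int32_t nid; std::vector<float> loss_chg; };

template <typename ExpandEntry>
std::int32_t NodeIndex(std::vector<ExpandEntry> const& nodes, std::size_t node_in_set) {
  // Mirrors how Partition/MaskRows/PartitionByMask above read the entry.
  return nodes[node_in_set].nid;
}

int main() {
  std::vector<SingleTargetEntry> single{{3, 0.5f}};
  std::vector<MultiTargetEntry> multi{{7, {0.5f, 0.25f}}};
  std::cout << NodeIndex(single, 0) << " " << NodeIndex(multi, 0) << "\n";  // prints "3 7"
}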
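
The MaskRows/PartitionByMask pair above implements the column-split path described in their comments: the worker that owns the split feature marks, per row, whether the decision is to go right and whether the feature value is missing; once those bit vectors are combined across workers, every worker can partition rows identically. A rough, self-contained sketch of that two-phase idea under simplifying assumptions (decision bit taken to mean "go right" per the comment, plain vector<bool> instead of BitVector, the cross-worker bitwise-OR allreduce elided):

#include <cmath>
#include <cstddef>
#include <vector>

// Simplified sketch of the column-split partition flow, not the actual implementation.
struct Masks {
  std::vector<bool> decision;  // set => the owning worker decided this row goes right
  std::vector<bool> missing;   // set => the split feature value is missing for this row
};

// Phase 1 (MaskRows): only the worker that holds the split feature can fill the bits.
Masks MaskRows(std::vector<float> const& feature, float split_value) {
  Masks m{std::vector<bool>(feature.size()), std::vector<bool>(feature.size())};
  for (std::size_t i = 0; i < feature.size(); ++i) {
    if (std::isnan(feature[i])) {
      m.missing[i] = true;
    } else if (feature[i] >= split_value) {  // assumed numeric split: value < split goes left
      m.decision[i] = true;
    }
  }
  return m;
}

// ... allreduce (bitwise OR) of both bit vectors across workers would happen here ...

// Phase 2 (PartitionByMask): every worker partitions rows identically from the merged bits.
void PartitionByMask(Masks const& m, bool default_left,
                     std::vector<std::size_t>* left, std::vector<std::size_t>* right) {
  for (std::size_t i = 0; i < m.decision.size(); ++i) {
    bool go_left = m.missing[i] ? default_left : !m.decision[i];
    (go_left ? left : right)->push_back(i);
  }
}

int main() {
  auto m = MaskRows({0.1f, 2.5f, std::nanf("")}, /*split_value=*/1.0f);
  std::vector<std::size_t> left, right;
  PartitionByMask(m, /*default_left=*/true, &left, &right);  // left: rows 0 and 2; right: row 1
}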
4 changes: 2 additions & 2 deletions src/learner.cc
@@ -343,8 +343,8 @@ struct LearnerTrainParam : public XGBoostParameter<LearnerTrainParam> {
.add_enum("monolithic", MultiStrategy::kMonolithic)
.set_default(MultiStrategy::kComposite)
.describe(
"Strategy used for training multi-target models. `mono` means building one single tree "
"for all targets.");
"Strategy used for training multi-target models. `monolithic` means building one "
"single tree for all targets.");
}
};

102 changes: 57 additions & 45 deletions src/tree/common_row_partitioner.h
@@ -1,22 +1,26 @@
/*!
* Copyright 2021-2022 XGBoost contributors
/**
* Copyright 2021-2023 XGBoost contributors
* \file common_row_partitioner.h
* \brief Common partitioner logic for hist and approx methods.
*/
#ifndef XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_
#define XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_

#include <algorithm> // std::all_of
#include <cinttypes> // std::uint32_t
#include <limits> // std::numeric_limits
#include <vector>

#include "../collective/communicator-inl.h"
#include "../common/linalg_op.h" // cbegin
#include "../common/numeric.h" // Iota
#include "../common/partition_builder.h"
#include "hist/expand_entry.h" // CPUExpandEntry
#include "xgboost/base.h"
#include "xgboost/context.h" // Context
#include "xgboost/linalg.h" // TensorView

namespace xgboost {
namespace tree {
namespace xgboost::tree {

static constexpr size_t kPartitionBlockSize = 2048;

@@ -34,9 +38,10 @@ class ColumnSplitHelper {
missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
}

template <typename ExpandEntry>
void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
// When data is split by column, we don't have all the feature values in the local worker, so
// we first collect all the decisions and whether the feature is missing into bit vectors.
std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
@@ -97,17 +102,18 @@ class CommonRowPartitioner {
}
}

void FindSplitConditions(const std::vector<CPUExpandEntry>& nodes, const RegTree& tree,
template <typename ExpandEntry>
void FindSplitConditions(const std::vector<ExpandEntry>& nodes, const RegTree& tree,
const GHistIndexMatrix& gmat, std::vector<int32_t>* split_conditions) {
auto const& ptrs = gmat.cut.Ptrs();
auto const& vals = gmat.cut.Values();

for (std::size_t i = 0; i < nodes.size(); ++i) {
bst_node_t const nid = nodes[i].nid;
bst_feature_t const fid = tree[nid].SplitIndex();
const float split_pt = tree[nid].SplitCond();
const uint32_t lower_bound = ptrs[fid];
const uint32_t upper_bound = ptrs[fid + 1];
bst_node_t const nidx = nodes[i].nid;
bst_feature_t const fidx = tree.SplitIndex(nidx);
float const split_pt = tree.SplitCond(nidx);
std::uint32_t const lower_bound = ptrs[fidx];
std::uint32_t const upper_bound = ptrs[fidx + 1];
bst_bin_t split_cond = -1;
// convert floating-point split_pt into corresponding bin_id
// split_cond = -1 indicates that split_pt is less than all known cut points
@@ -121,20 +127,22 @@
}
}

void AddSplitsToRowSet(const std::vector<CPUExpandEntry>& nodes, RegTree const* p_tree) {
template <typename ExpandEntry>
void AddSplitsToRowSet(const std::vector<ExpandEntry>& nodes, RegTree const* p_tree) {
const size_t n_nodes = nodes.size();
for (unsigned int i = 0; i < n_nodes; ++i) {
const int32_t nid = nodes[i].nid;
const int32_t nidx = nodes[i].nid;
const size_t n_left = partition_builder_.GetNLeftElems(i);
const size_t n_right = partition_builder_.GetNRightElems(i);
CHECK_EQ((*p_tree)[nid].LeftChild() + 1, (*p_tree)[nid].RightChild());
row_set_collection_.AddSplit(nid, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild(),
n_left, n_right);
CHECK_EQ(p_tree->LeftChild(nidx) + 1, p_tree->RightChild(nidx));
row_set_collection_.AddSplit(nidx, p_tree->LeftChild(nidx), p_tree->RightChild(nidx), n_left,
n_right);
}
}

template <typename ExpandEntry>
void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
auto const& column_matrix = gmat.Transpose();
if (column_matrix.IsInitialized()) {
if (gmat.cut.HasCategorical()) {
@@ -152,44 +160,32 @@ class CommonRowPartitioner {
}
}

template <bool any_cat>
template <bool any_cat, typename ExpandEntry>
void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
if (column_matrix.AnyMissing()) {
this->template UpdatePosition<true, any_cat>(ctx, gmat, column_matrix, nodes, p_tree);
} else {
this->template UpdatePosition<false, any_cat>(ctx, gmat, column_matrix, nodes, p_tree);
}
}

template <bool any_missing, bool any_cat>
template <bool any_missing, bool any_cat, typename ExpandEntry>
void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
switch (column_matrix.GetTypeSize()) {
case common::kUint8BinsTypeSize:
this->template UpdatePosition<uint8_t, any_missing, any_cat>(ctx, gmat, column_matrix,
nodes, p_tree);
break;
case common::kUint16BinsTypeSize:
this->template UpdatePosition<uint16_t, any_missing, any_cat>(ctx, gmat, column_matrix,
nodes, p_tree);
break;
case common::kUint32BinsTypeSize:
this->template UpdatePosition<uint32_t, any_missing, any_cat>(ctx, gmat, column_matrix,
nodes, p_tree);
break;
default:
// no default behavior
CHECK(false) << column_matrix.GetTypeSize();
}
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
common::DispatchBinType(column_matrix.GetTypeSize(), [&](auto t) {
using T = decltype(t);
this->template UpdatePosition<T, any_missing, any_cat>(ctx, gmat, column_matrix, nodes,
p_tree);
});
}

template <typename BinIdxType, bool any_missing, bool any_cat>
template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
void UpdatePosition(Context const* ctx, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix,
std::vector<CPUExpandEntry> const& nodes, RegTree const* p_tree) {
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
// 1. Find split condition for each split
size_t n_nodes = nodes.size();

@@ -251,9 +247,9 @@ class CommonRowPartitioner {
AddSplitsToRowSet(nodes, p_tree);
}

auto const& Partitions() const { return row_set_collection_; }
[[nodiscard]] auto const& Partitions() const { return row_set_collection_; }

size_t Size() const {
[[nodiscard]] std::size_t Size() const {
return std::distance(row_set_collection_.begin(), row_set_collection_.end());
}

@@ -266,12 +262,29 @@
[&](size_t idx) -> bool { return hess[idx] - .0f == .0f; });
}

void LeafPartition(Context const* ctx, RegTree const& tree,
linalg::TensorView<GradientPair const, 2> gpair,
std::vector<bst_node_t>* p_out_position) const {
if (gpair.Shape(1) > 1) {
partition_builder_.LeafPartition(
ctx, tree, this->Partitions(), p_out_position, [&](std::size_t idx) -> bool {
auto sample = gpair.Slice(idx, linalg::All());
return std::all_of(linalg::cbegin(sample), linalg::cend(sample),
[](GradientPair const& g) { return g.GetHess() - .0f == .0f; });
});
} else {
auto s = gpair.Slice(linalg::All(), 0);
partition_builder_.LeafPartition(
ctx, tree, this->Partitions(), p_out_position,
[&](std::size_t idx) -> bool { return s(idx).GetHess() - .0f == .0f; });
}
}
void LeafPartition(Context const* ctx, RegTree const& tree,
common::Span<GradientPair const> gpair,
std::vector<bst_node_t>* p_out_position) const {
partition_builder_.LeafPartition(
ctx, tree, this->Partitions(), p_out_position,
[&](size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; });
[&](std::size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; });
}

private:
Expand All @@ -281,6 +294,5 @@ class CommonRowPartitioner {
ColumnSplitHelper column_split_helper_;
};

} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree
#endif // XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_
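
The UpdatePosition hunk above replaces the manual switch over common::kUint8/16/32BinsTypeSize with common::DispatchBinType, which invokes a generic lambda with a value of the matching bin type so that decltype(t) recovers the compile-time type. A self-contained sketch of that dispatch pattern (the enum and helper below are illustrative re-implementations, not the actual xgboost utilities):

#include <cstdint>
#include <iostream>

// Illustrative re-implementation of the bin-type dispatch pattern: map a runtime
// bin width to a compile-time type by calling a generic lambda with a value of it.
enum class BinTypeSize : std::uint8_t { kUint8 = 1, kUint16 = 2, kUint32 = 4 };

template <typename Fn>
void DispatchBinType(BinTypeSize t, Fn&& fn) {
  switch (t) {
    case BinTypeSize::kUint8:  fn(std::uint8_t{});  return;
    case BinTypeSize::kUint16: fn(std::uint16_t{}); return;
    case BinTypeSize::kUint32: fn(std::uint32_t{}); return;
  }
}

int main() {
  DispatchBinType(BinTypeSize::kUint16, [](auto t) {
    using T = decltype(t);  // T is std::uint16_t for this call
    std::cout << sizeof(T) << "-byte bin index\n";
  });
}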
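
The new LeafPartition overload above, which takes a two-dimensional gradient view, treats a row as sampled out only when its hessian is zero for every target (the std::all_of over the row slice); with a single target it falls back to checking one gradient pair. A small sketch of that predicate on a plain row-major hessian matrix (the layout and names below are assumptions, not the real linalg::TensorView API):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical row-major (n_rows x n_targets) hessian matrix standing in for the
// gpair TensorView; a row counts as unused only if every target's hessian is zero.
bool RowIsUnused(std::vector<float> const& hess, std::size_t n_targets, std::size_t ridx) {
  auto begin = hess.begin() + ridx * n_targets;
  return std::all_of(begin, begin + n_targets, [](float h) { return h == 0.0f; });
}

int main() {
  std::size_t const n_targets = 2;
  // row 0: used (one non-zero hessian); row 1: sampled out (all hessians zero).
  std::vector<float> hess{0.0f, 1.5f, 0.0f, 0.0f};
  std::cout << RowIsUnused(hess, n_targets, 0) << " "    // prints 0
            << RowIsUnused(hess, n_targets, 1) << "\n";  // prints 1
}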
16 changes: 8 additions & 8 deletions tests/cpp/common/test_partition_builder.cc
@@ -1,15 +1,17 @@
/**
* Copyright 2020-2023 by XGBoost contributors
*/
#include <gtest/gtest.h>
#include <vector>

#include <string>
#include <utility>
#include <vector>

#include "../../../src/common/row_set.h"
#include "../../../src/common/partition_builder.h"
#include "../../../src/common/row_set.h"
#include "../helpers.h"

namespace xgboost {
namespace common {

namespace xgboost::common {
TEST(PartitionBuilder, BasicTest) {
constexpr size_t kBlockSize = 16;
constexpr size_t kNodes = 5;
@@ -74,6 +76,4 @@ TEST(PartitionBuilder, BasicTest) {
ASSERT_EQ(n_right, (kBlockSize - rows_for_left_node[nid]) * tasks[nid]);
}
}

} // namespace common
} // namespace xgboost
} // namespace xgboost::common
(Diffs for the remaining 4 changed files are not shown here.)