Skip to content

Commit

Permalink
Process parquet bools with microkernels (#17157)
Browse files Browse the repository at this point in the history
This adds support for the bool type to reading parquet microkernels.  Both plain (bit-packed) and RLE-encoded bool decode is supported, using separate code paths. This PR also massively reduces boilerplate code, as most of the template info needed is already encoded in the kernel mask.  Also the superfluous level_t template parameter on rle_run has been removed. And bools have been added to the parquet benchmarks. 

Performance: register count drops from 62 -> 56, both plain and RLE-encoded bool decoding are now 46% faster (uncompressed).  Reading sample customer data shows no change. NDS tests show no change.

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - https://github.com/nvdbaranec
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #17157
  • Loading branch information
pmattione-nvidia authored Nov 7, 2024
1 parent 2db58d5 commit 5147882
Show file tree
Hide file tree
Showing 11 changed files with 230 additions and 424 deletions.
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/nvbench_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ enum class data_type : int32_t {
INTEGRAL = static_cast<int32_t>(type_group_id::INTEGRAL),
INTEGRAL_SIGNED = static_cast<int32_t>(type_group_id::INTEGRAL_SIGNED),
FLOAT = static_cast<int32_t>(type_group_id::FLOATING_POINT),
BOOL8 = static_cast<int32_t>(cudf::type_id::BOOL8),
DECIMAL = static_cast<int32_t>(type_group_id::FIXED_POINT),
TIMESTAMP = static_cast<int32_t>(type_group_id::TIMESTAMP),
DURATION = static_cast<int32_t>(type_group_id::DURATION),
Expand All @@ -44,6 +45,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
case data_type::INTEGRAL: return "INTEGRAL";
case data_type::INTEGRAL_SIGNED: return "INTEGRAL_SIGNED";
case data_type::FLOAT: return "FLOAT";
case data_type::BOOL8: return "BOOL8";
case data_type::DECIMAL: return "DECIMAL";
case data_type::TIMESTAMP: return "TIMESTAMP";
case data_type::DURATION: return "DURATION";
Expand Down
2 changes: 2 additions & 0 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ void BM_parquet_read_io_compression(nvbench::state& state)
{
auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
static_cast<int32_t>(data_type::FLOAT),
static_cast<int32_t>(data_type::BOOL8),
static_cast<int32_t>(data_type::DECIMAL),
static_cast<int32_t>(data_type::TIMESTAMP),
static_cast<int32_t>(data_type::DURATION),
Expand Down Expand Up @@ -298,6 +299,7 @@ void BM_parquet_read_wide_tables_mixed(nvbench::state& state)

using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
data_type::FLOAT,
data_type::BOOL8,
data_type::DECIMAL,
data_type::TIMESTAMP,
data_type::DURATION,
Expand Down
3 changes: 2 additions & 1 deletion cpp/benchmarks/io/parquet/parquet_reader_options.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -66,6 +66,7 @@ void BM_parquet_read_options(nvbench::state& state,
auto const data_types =
dtypes_for_column_selection(get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
static_cast<int32_t>(data_type::FLOAT),
static_cast<int32_t>(data_type::BOOL8),
static_cast<int32_t>(data_type::DECIMAL),
static_cast<int32_t>(data_type::TIMESTAMP),
static_cast<int32_t>(data_type::DURATION),
Expand Down
3 changes: 3 additions & 0 deletions cpp/benchmarks/io/parquet/parquet_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ void BM_parq_write_io_compression(
{
auto const data_types = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
static_cast<int32_t>(data_type::FLOAT),
static_cast<int32_t>(data_type::BOOL8),
static_cast<int32_t>(data_type::DECIMAL),
static_cast<int32_t>(data_type::TIMESTAMP),
static_cast<int32_t>(data_type::DURATION),
Expand Down Expand Up @@ -143,6 +144,7 @@ void BM_parq_write_varying_options(

auto const data_types = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL_SIGNED),
static_cast<int32_t>(data_type::FLOAT),
static_cast<int32_t>(data_type::BOOL8),
static_cast<int32_t>(data_type::DECIMAL),
static_cast<int32_t>(data_type::TIMESTAMP),
static_cast<int32_t>(data_type::DURATION),
Expand Down Expand Up @@ -181,6 +183,7 @@ void BM_parq_write_varying_options(

using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
data_type::FLOAT,
data_type::BOOL8,
data_type::DECIMAL,
data_type::TIMESTAMP,
data_type::DURATION,
Expand Down
Loading

0 comments on commit 5147882

Please sign in to comment.