From 4002a535e659b8328f61174a1a3b7e3f86a06833 Mon Sep 17 00:00:00 2001 From: Alejandro Valerio Date: Tue, 3 Feb 2026 16:59:00 -0600 Subject: [PATCH 1/2] - TEMPORARY BRANCH FOR TIMING ANNOTATIONS. --- src/processing/encryption_sequencer.cpp | 103 +++++++++++++++--- src/processing/encryptors/basic_encryptor.cpp | 83 ++++++++++---- src/processing/parquet_utils.cpp | 45 +++++++- 3 files changed, 194 insertions(+), 37 deletions(-) diff --git a/src/processing/encryption_sequencer.cpp b/src/processing/encryption_sequencer.cpp index 232861e..9d80c7c 100644 --- a/src/processing/encryption_sequencer.cpp +++ b/src/processing/encryption_sequencer.cpp @@ -30,6 +30,8 @@ #include #include #include +#include +#include using namespace dbps::external; using namespace dbps::enum_utils; @@ -42,6 +44,11 @@ namespace { constexpr const char* ENCRYPTION_MODE_KEY_DATA_PAGE = "encrypt_mode_data_page"; constexpr const char* ENCRYPTION_MODE_PER_BLOCK = "per_block"; constexpr const char* ENCRYPTION_MODE_PER_VALUE = "per_value"; + + bool ShouldLogStepTimings() { + const char* env = std::getenv("DBPS_LOG_ENCRYPT_TIMING"); + return env == nullptr || std::string(env) == "1"; + } } // Helper function to create encryptor instance @@ -126,6 +133,24 @@ bool DataBatchEncryptionSequencer::DecodeAndEncrypt(const std::vector& auto encryption_mode_key = GetEncryptionModeKey(); + // ++++++ FORCED PER-BLOCK ENCRYPTION ++++++ vvvvvv + if (false) { + std::cout << "+++++ FORCED PER-BLOCK ENCRYPTION +++++" << " datatype_: " << to_string(datatype_) << std::endl; + std::cout << "+++++ FORCED PER-BLOCK ENCRYPTION +++++" << " compression_: " << to_string(compression_) << std::endl; + std::cout << "+++++ FORCED PER-BLOCK ENCRYPTION +++++" << " encoding_: " << to_string(encoding_) << std::endl; + std::cout << "+++++ FORCED PER-BLOCK ENCRYPTION +++++" << " page_type_: " << std::get(encoding_attributes_converted_.at("page_type")) << std::endl; + encrypted_result_ = encryptor_->EncryptBlock(plaintext); + if (encrypted_result_.empty()) { + error_stage_ = "encryption"; + error_message_ = "Failed to encrypt data"; + return false; + } + encryption_metadata_[encryption_mode_key] = ENCRYPTION_MODE_PER_BLOCK; + encryption_metadata_[DBPS_VERSION_KEY] = DBPS_VERSION; + return true; + } + // ++++++ FORCED PER-BLOCK ENCRYPTION ++++++ ^^^^^^ + /* * Note on try-catch block: * - When fully done, DecodeAndEncrypt will support per-value encryption for all cases, except for @@ -137,24 +162,74 @@ bool DataBatchEncryptionSequencer::DecodeAndEncrypt(const std::vector& * - Once per-value encryption for all cases is complete, the try-catch block and the call to EncryptBlock must be removed. */ try { - // Decompress and split plaintext into level and value bytes - auto [level_bytes, value_bytes] = DecompressAndSplit( - plaintext, compression_, encoding_attributes_converted_); - - // Parse value bytes into typed list - auto typed_list = ParseValueBytesIntoTypedList(value_bytes, datatype_, datatype_length_, encoding_); - - // Encrypt the typed list and level bytes, then join them into a single encrypted byte vector. - auto encrypted_value_bytes = encryptor_->EncryptValueList(typed_list); - auto encrypted_level_bytes = encryptor_->EncryptBlock(level_bytes); - auto joined_encrypted_bytes = JoinWithLengthPrefix(encrypted_level_bytes, encrypted_value_bytes); - - // Compress the joined encrypted bytes - encrypted_result_ = Compress(joined_encrypted_bytes, encrypted_compression_); + const bool log_timings = ShouldLogStepTimings(); + using Clock = std::chrono::steady_clock; + std::vector> timings; + + std::vector level_bytes; + std::vector value_bytes; + TypedListValues typed_list; + std::vector encrypted_value_bytes; + std::vector encrypted_level_bytes; + std::vector joined_encrypted_bytes; + + auto time_step = [&](const char* label, const std::function& fn) { + if (!log_timings) { + fn(); + return; + } + auto start = Clock::now(); + fn(); + auto end = Clock::now(); + auto micros = std::chrono::duration_cast(end - start).count(); + timings.emplace_back(label, micros); + }; + + time_step("DecompressAndSplit", [&]() { + auto split = DecompressAndSplit(plaintext, compression_, encoding_attributes_converted_); + level_bytes = std::move(split.level_bytes); + value_bytes = std::move(split.value_bytes); + }); + + time_step("ParseValueBytesIntoTypedList", [&]() { + typed_list = ParseValueBytesIntoTypedList(value_bytes, datatype_, datatype_length_, encoding_); + }); + + time_step("EncryptValueList", [&]() { + encrypted_value_bytes = encryptor_->EncryptValueList(typed_list); + }); + + time_step("EncryptBlock(level_bytes)", [&]() { + encrypted_level_bytes = encryptor_->EncryptBlock(level_bytes); + }); + + time_step("JoinWithLengthPrefix", [&]() { + joined_encrypted_bytes = JoinWithLengthPrefix(encrypted_level_bytes, encrypted_value_bytes); + }); + + time_step("Compress", [&]() { + encrypted_result_ = Compress(joined_encrypted_bytes, encrypted_compression_); + }); // Set the encryption type to per-value encryption_metadata_[encryption_mode_key] = ENCRYPTION_MODE_PER_VALUE; encryption_metadata_[DBPS_VERSION_KEY] = DBPS_VERSION; + + std::cout << "+++++ PER-VALUE ENCRYPTION +++++" << " datatype_: " << to_string(datatype_) << std::endl; + std::cout << "+++++ PER-VALUE ENCRYPTION +++++" << " compression_: " << to_string(compression_) << std::endl; + std::cout << "+++++ PER-VALUE ENCRYPTION +++++" << " encoding_: " << to_string(encoding_) << std::endl; + std::cout << "+++++ PER-VALUE ENCRYPTION +++++" << " page_type_: " << std::get(encoding_attributes_converted_.at("page_type")) << std::endl; + std::cout << "+++++ PER-VALUE ENCRYPTION +++++" << " encrypted_compression_: " << to_string(encrypted_compression_) << std::endl; + const auto typed_list_size = std::visit([](const auto& values) { return values.size(); }, typed_list); + std::cout << "+++++ PER-VALUE ENCRYPTION +++++" << " typed_list size: " << typed_list_size << std::endl; + + if (log_timings) { + std::cout << "+++++ DecodeAndEncrypt timings (microseconds) +++++\n"; + for (const auto& entry : timings) { + std::cout << " " << entry.first << ": " << entry.second << "\n"; + } + } + return true; } // Allow fallback to per-block encryption, only for explicitly unsupported conditions. See note above. diff --git a/src/processing/encryptors/basic_encryptor.cpp b/src/processing/encryptors/basic_encryptor.cpp index 08ef9a9..481594d 100644 --- a/src/processing/encryptors/basic_encryptor.cpp +++ b/src/processing/encryptors/basic_encryptor.cpp @@ -20,6 +20,8 @@ #include "../../common/enum_utils.h" #include #include +#include +#include #include "../value_encryption_utils.h" using namespace dbps::value_encryption_utils; @@ -52,6 +54,17 @@ namespace { std::vector DecryptByteArray(const std::vector& data, const std::string& key_id) { return EncryptByteArray(data, key_id); // for XOR encryption, decryption is the same as encryption } + + bool ShouldLogValueEncryption() { + return false; + // const char* env = std::getenv("DBPS_LOG_VALUE_ENCRYPTION"); + // return env != nullptr && std::string(env) == "1"; + } + + bool ShouldLogValueEncryptionTiming() { + const char* env = std::getenv("DBPS_LOG_VALUE_ENCRYPT_TIMING"); + return env == nullptr || std::string(env) == "1"; + } } std::vector BasicEncryptor::EncryptBlock(const std::vector& data) { @@ -66,23 +79,40 @@ std::vector BasicEncryptor::DecryptBlock(const std::vector& da std::vector BasicEncryptor::EncryptValueList( const TypedListValues& typed_list) { - // Printout the typed list. - auto print_result = TypedListToString(typed_list); - if (print_result.length() > 1000) { - std::cout << "Encrypt value - Decoded plaintext data (first 1000 chars):\n" - << print_result.substr(0, 1000) << "..."; - } else { - std::cout << "Encrypt value - Decoded plaintext data:\n" << print_result; - } + const bool log_timings = ShouldLogValueEncryptionTiming(); + using Clock = std::chrono::steady_clock; + std::vector> timings; - // Printout the additional context parameters. - std::cout << "Context parameters:\n" - << " column_name: " << column_name_ << "\n" - << " user_id: " << user_id_ << "\n" - << " key_id: " << key_id_ << "\n" - << " application_context: " << application_context_ << "\n" - << " datatype: " << dbps::enum_utils::to_string(datatype_) << "\n" - << std::endl; + auto time_step = [&](const char* label, const std::function& fn) { + if (!log_timings) { + fn(); + return; + } + auto start = Clock::now(); + fn(); + auto end = Clock::now(); + auto micros = std::chrono::duration_cast(end - start).count(); + timings.emplace_back(label, micros); + }; + + if (ShouldLogValueEncryption()) { + // Printout the typed list. + auto print_result = TypedListToString(typed_list); + if (print_result.length() > 1000) { + std::cout << "Encrypt value - Decoded plaintext data (first 1000 chars):\n" + << print_result.substr(0, 1000) << "...\n"; + } else { + std::cout << "Encrypt value - Decoded plaintext data:\n" << print_result << "\n"; + } + + // Printout the additional context parameters. + std::cout << "Context parameters:\n" + << " column_name: " << column_name_ << "\n" + << " user_id: " << user_id_ << "\n" + << " key_id: " << key_id_ << "\n" + << " application_context: " << application_context_ << "\n" + << " datatype: " << dbps::enum_utils::to_string(datatype_) << "\n"; + } // create a closure for the encrypt function (to be used below) // the closure captures the key_bytes and calls the EncryptByteArray function. @@ -96,13 +126,26 @@ std::vector BasicEncryptor::EncryptValueList( // (1) encrypt the list of values. Each element in the list is encrypted separately // using the key and the EncryptByteArray function. - std::vector encrypted_values = EncryptTypedListValues( - typed_list, - encrypt_function); + std::vector encrypted_values; + time_step("EncryptTypedListValues", [&]() { + encrypted_values = EncryptTypedListValues( + typed_list, + encrypt_function); + }); // (2) concatenate the encrypted values into a single byte blob. // (the blob encodes #of elements and the size of each element) - std::vector concatenated_encrypted_bytes = ConcatenateEncryptedValues(encrypted_values); + std::vector concatenated_encrypted_bytes; + time_step("ConcatenateEncryptedValues", [&]() { + concatenated_encrypted_bytes = ConcatenateEncryptedValues(encrypted_values); + }); + + if (log_timings) { + std::cout << "EncryptValueList timings (microseconds):\n"; + for (const auto& entry : timings) { + std::cout << " " << entry.first << ": " << entry.second << "\n"; + } + } return concatenated_encrypted_bytes; } // EncryptValueList diff --git a/src/processing/parquet_utils.cpp b/src/processing/parquet_utils.cpp index ebb18ca..a24f1c8 100644 --- a/src/processing/parquet_utils.cpp +++ b/src/processing/parquet_utils.cpp @@ -20,11 +20,20 @@ #include "compression_utils.h" #include #include +#include +#include +#include using namespace dbps::external; using namespace dbps::enum_utils; using namespace dbps::compression; +namespace { + bool ShouldLogParseValueTiming() { + const char* env = std::getenv("DBPS_LOG_PARSE_VALUE_TIMING"); + return env == nullptr || std::string(env) == "1"; + } +} int CalculateLevelBytesLength(const std::vector& raw, const AttributesMap& encoding_attribs) { @@ -353,9 +362,39 @@ TypedListValues ParseValueBytesIntoTypedList( Type::type datatype, const std::optional& datatype_length, Encoding::type encoding) { - std::vector raw_values = - SliceValueBytesIntoRawBytes(bytes, datatype, datatype_length, encoding); - return BuildTypedListFromRawBytes(datatype, raw_values); + const bool log_timings = ShouldLogParseValueTiming(); + using Clock = std::chrono::steady_clock; + std::vector> timings; + + auto time_step = [&](const char* label, const std::function& fn) { + if (!log_timings) { + fn(); + return; + } + auto start = Clock::now(); + fn(); + auto end = Clock::now(); + auto micros = std::chrono::duration_cast(end - start).count(); + timings.emplace_back(label, micros); + }; + + std::vector raw_values; + time_step("SliceValueBytesIntoRawBytes", [&]() { + raw_values = SliceValueBytesIntoRawBytes(bytes, datatype, datatype_length, encoding); + }); + + TypedListValues typed_list; + time_step("BuildTypedListFromRawBytes", [&]() { + typed_list = BuildTypedListFromRawBytes(datatype, raw_values); + }); + + if (log_timings) { + std::cout << "ParseValueBytesIntoTypedList timings (microseconds):\n"; + for (const auto& entry : timings) { + std::cout << " " << entry.first << ": " << entry.second << "\n"; + } + } + return typed_list; } std::vector GetTypedListAsValueBytes( From 95d39e4d6423e361b06decf6a2f3df9e3bdac9ac Mon Sep 17 00:00:00 2001 From: Alejandro Valerio Date: Wed, 4 Feb 2026 12:10:17 -0600 Subject: [PATCH 2/2] - Quick-n-dirty refactor the BasicEncryptor to use a more efficient encryption algorithm --- src/processing/encryptors/basic_encryptor.cpp | 208 +++++++++++++++++- src/processing/encryptors/basic_encryptor.h | 4 + 2 files changed, 211 insertions(+), 1 deletion(-) diff --git a/src/processing/encryptors/basic_encryptor.cpp b/src/processing/encryptors/basic_encryptor.cpp index 481594d..64af51e 100644 --- a/src/processing/encryptors/basic_encryptor.cpp +++ b/src/processing/encryptors/basic_encryptor.cpp @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include "../value_encryption_utils.h" using namespace dbps::value_encryption_utils; @@ -65,6 +67,22 @@ namespace { const char* env = std::getenv("DBPS_LOG_VALUE_ENCRYPT_TIMING"); return env == nullptr || std::string(env) == "1"; } + + void EncryptBytesInto(const uint8_t* data, size_t len, const std::string& key_id, uint8_t* out) { + if (len == 0) { + return; + } + if (key_id.empty()) { + throw std::invalid_argument("EncryptBytesInto: key must not be empty for non-empty data"); + } + + std::hash hasher; + size_t key_hash = hasher(key_id); + for (size_t i = 0; i < len; ++i) { + out[i] = data[i] ^ (key_hash & 0xFF); + key_hash = (key_hash << 1) | (key_hash >> 31); + } + } } std::vector BasicEncryptor::EncryptBlock(const std::vector& data) { @@ -76,7 +94,7 @@ std::vector BasicEncryptor::DecryptBlock(const std::vector& da return DecryptByteArray(data, key_id_); } -std::vector BasicEncryptor::EncryptValueList( +std::vector BasicEncryptor::EncryptValueList_OLD( const TypedListValues& typed_list) { const bool log_timings = ShouldLogValueEncryptionTiming(); @@ -150,6 +168,194 @@ std::vector BasicEncryptor::EncryptValueList( return concatenated_encrypted_bytes; } // EncryptValueList +std::vector BasicEncryptor::EncryptValueList( + const TypedListValues& typed_list) { + + const bool log_timings = ShouldLogValueEncryptionTiming(); + using Clock = std::chrono::steady_clock; + std::vector> timings; + + auto time_step = [&](const char* label, const std::function& fn) { + if (!log_timings) { + fn(); + return; + } + auto start = Clock::now(); + fn(); + auto end = Clock::now(); + auto micros = std::chrono::duration_cast(end - start).count(); + timings.emplace_back(label, micros); + }; + + if (ShouldLogValueEncryption()) { + // Printout the typed list. + auto print_result = TypedListToString(typed_list); + if (print_result.length() > 1000) { + std::cout << "Encrypt value - Decoded plaintext data (first 1000 chars):\n" + << print_result.substr(0, 1000) << "...\n"; + } else { + std::cout << "Encrypt value - Decoded plaintext data:\n" << print_result << "\n"; + } + + // Printout the additional context parameters. + std::cout << "Context parameters:\n" + << " column_name: " << column_name_ << "\n" + << " user_id: " << user_id_ << "\n" + << " key_id: " << key_id_ << "\n" + << " application_context: " << application_context_ << "\n" + << " datatype: " << dbps::enum_utils::to_string(datatype_) << "\n"; + } + + const std::string key_id_copy = key_id_; + std::vector concatenated_encrypted_bytes; + + size_t total_capacity = 0; + uint32_t count = 0; + + time_step("ComputeEncryptedSize", [&]() { + std::visit([&](const auto& vec) { + using VecT = std::decay_t; + using ElemT = typename VecT::value_type; + if (vec.size() > static_cast(std::numeric_limits::max())) { + throw InvalidInputException("Too many elements to serialize into uint32 count"); + } + count = static_cast(vec.size()); + total_capacity = 4; + for (size_t i = 0; i < vec.size(); ++i) { + size_t elem_size = 0; + if constexpr (std::is_same_v || std::is_same_v) { + elem_size = 4; + } else if constexpr (std::is_same_v || std::is_same_v) { + elem_size = 8; + } else if constexpr (std::is_same_v>) { + elem_size = 12; + } else if constexpr (std::is_same_v) { + elem_size = vec[i].size(); + } else { + static_assert(sizeof(ElemT) == 0, "Unsupported element type in TypedListValues"); + } + + if (elem_size > static_cast(std::numeric_limits::max())) { + throw InvalidInputException("Element size exceeds uint32 capacity"); + } + total_capacity += 4 + elem_size; + } + }, typed_list); + }); + + time_step("EncryptIntoBuffer", [&]() { + concatenated_encrypted_bytes.resize(total_capacity); + size_t offset = 0; + auto write_u32_le = [&](uint32_t v) { + concatenated_encrypted_bytes[offset + 0] = static_cast(v & 0xFF); + concatenated_encrypted_bytes[offset + 1] = static_cast((v >> 8) & 0xFF); + concatenated_encrypted_bytes[offset + 2] = static_cast((v >> 16) & 0xFF); + concatenated_encrypted_bytes[offset + 3] = static_cast((v >> 24) & 0xFF); + offset += 4; + }; + + write_u32_le(count); + + std::visit([&](const auto& vec) { + using VecT = std::decay_t; + using ElemT = typename VecT::value_type; + + for (size_t i = 0; i < vec.size(); ++i) { + size_t elem_size = 0; + if constexpr (std::is_same_v) { + elem_size = 4; + write_u32_le(static_cast(elem_size)); + uint8_t raw[4]; + const uint32_t v = static_cast(vec[i]); + raw[0] = static_cast(v & 0xFF); + raw[1] = static_cast((v >> 8) & 0xFF); + raw[2] = static_cast((v >> 16) & 0xFF); + raw[3] = static_cast((v >> 24) & 0xFF); + EncryptBytesInto(raw, elem_size, key_id_copy, concatenated_encrypted_bytes.data() + offset); + offset += elem_size; + } else if constexpr (std::is_same_v) { + elem_size = 8; + write_u32_le(static_cast(elem_size)); + uint8_t raw[8]; + const uint64_t v = static_cast(vec[i]); + raw[0] = static_cast(v & 0xFF); + raw[1] = static_cast((v >> 8) & 0xFF); + raw[2] = static_cast((v >> 16) & 0xFF); + raw[3] = static_cast((v >> 24) & 0xFF); + raw[4] = static_cast((v >> 32) & 0xFF); + raw[5] = static_cast((v >> 40) & 0xFF); + raw[6] = static_cast((v >> 48) & 0xFF); + raw[7] = static_cast((v >> 56) & 0xFF); + EncryptBytesInto(raw, elem_size, key_id_copy, concatenated_encrypted_bytes.data() + offset); + offset += elem_size; + } else if constexpr (std::is_same_v) { + elem_size = 4; + write_u32_le(static_cast(elem_size)); + uint32_t bits = 0; + std::memcpy(&bits, &vec[i], sizeof(bits)); + uint8_t raw[4]; + raw[0] = static_cast(bits & 0xFF); + raw[1] = static_cast((bits >> 8) & 0xFF); + raw[2] = static_cast((bits >> 16) & 0xFF); + raw[3] = static_cast((bits >> 24) & 0xFF); + EncryptBytesInto(raw, elem_size, key_id_copy, concatenated_encrypted_bytes.data() + offset); + offset += elem_size; + } else if constexpr (std::is_same_v) { + elem_size = 8; + write_u32_le(static_cast(elem_size)); + uint64_t bits = 0; + std::memcpy(&bits, &vec[i], sizeof(bits)); + uint8_t raw[8]; + raw[0] = static_cast(bits & 0xFF); + raw[1] = static_cast((bits >> 8) & 0xFF); + raw[2] = static_cast((bits >> 16) & 0xFF); + raw[3] = static_cast((bits >> 24) & 0xFF); + raw[4] = static_cast((bits >> 32) & 0xFF); + raw[5] = static_cast((bits >> 40) & 0xFF); + raw[6] = static_cast((bits >> 48) & 0xFF); + raw[7] = static_cast((bits >> 56) & 0xFF); + EncryptBytesInto(raw, elem_size, key_id_copy, concatenated_encrypted_bytes.data() + offset); + offset += elem_size; + } else if constexpr (std::is_same_v>) { + elem_size = 12; + write_u32_le(static_cast(elem_size)); + uint8_t raw[12]; + for (int j = 0; j < 3; ++j) { + const uint32_t w = vec[i][j]; + raw[j * 4 + 0] = static_cast(w & 0xFF); + raw[j * 4 + 1] = static_cast((w >> 8) & 0xFF); + raw[j * 4 + 2] = static_cast((w >> 16) & 0xFF); + raw[j * 4 + 3] = static_cast((w >> 24) & 0xFF); + } + EncryptBytesInto(raw, elem_size, key_id_copy, concatenated_encrypted_bytes.data() + offset); + offset += elem_size; + } else if constexpr (std::is_same_v) { + elem_size = vec[i].size(); + write_u32_le(static_cast(elem_size)); + if (elem_size > 0) { + EncryptBytesInto(reinterpret_cast(vec[i].data()), + elem_size, + key_id_copy, + concatenated_encrypted_bytes.data() + offset); + offset += elem_size; + } + } else { + static_assert(sizeof(ElemT) == 0, "Unsupported element type in TypedListValues"); + } + } + }, typed_list); + }); + + if (log_timings) { + std::cout << "EncryptValueList timings (microseconds):\n"; + for (const auto& entry : timings) { + std::cout << " " << entry.first << ": " << entry.second << "\n"; + } + } + + return concatenated_encrypted_bytes; +} // EncryptValueList + TypedListValues BasicEncryptor::DecryptValueList( const std::vector& encrypted_bytes) { diff --git a/src/processing/encryptors/basic_encryptor.h b/src/processing/encryptors/basic_encryptor.h index ce6a7d9..9fc9008 100644 --- a/src/processing/encryptors/basic_encryptor.h +++ b/src/processing/encryptors/basic_encryptor.h @@ -63,6 +63,10 @@ class DBPS_EXPORT BasicEncryptor : public DBPSEncryptor { std::vector EncryptValueList( const TypedListValues& typed_list) override; + // Legacy implementation kept for reference/perf comparison + std::vector EncryptValueList_OLD( + const TypedListValues& typed_list); + TypedListValues DecryptValueList( const std::vector& encrypted_bytes) override; };